//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR.
// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
// of instructions in order to estimate the profitability of vectorization.
//
// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
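//
// As a purely illustrative sketch (not a statement about any particular
// target or about the VF the cost model will actually pick), a scalar loop
// such as
//
//   for (int i = 0; i < n; ++i)
//     a[i] = b[i] + 1;
//
// is conceptually rewritten for VF = 4 so that each vector iteration computes
// a[i..i+3] = b[i..i+3] + <1, 1, 1, 1> and the induction variable advances by
// 4 instead of 1, with a scalar remainder loop handling any left-over
// iterations.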
//
// This pass has four parts:
// 1. The main loop pass that drives the different parts.
// 2. LoopVectorizationLegality - A unit that checks for the legality
//    of the vectorization.
// 3. InnerLoopVectorizer - A unit that performs the actual
//    widening of instructions.
// 4. LoopVectorizationCostModel - A unit that checks for the profitability
//    of vectorization. It decides on the optimal vector width, which
//    can be one, if vectorization is not profitable.
//
// There is a development effort going on to migrate the loop vectorizer to the
// VPlan infrastructure and to introduce outer loop vectorization support (see
// docs/Proposal/VectorizationPlan.rst and
// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
// purpose, we temporarily introduced the VPlan-native vectorization path: an
// alternative vectorization path that is natively implemented on top of the
// VPlan infrastructure. See EnableVPlanNativePath for enabling.
//
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
//  Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// The interleaved access vectorization is based on the paper:
//  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
//  Data for SIMD.
//
// Other ideas/concepts are from:
//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
//
//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
//  Vectorizing Compilers.
//
//===----------------------------------------------------------------------===//

#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "LoopVectorizationPlanner.h"
#include "VPRecipeBuilder.h"
#include "VPlan.h"
#include "VPlanHCFGBuilder.h"
#include "VPlanTransforms.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/InstructionCost.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <functional>
#include <iterator>
#include <limits>
#include <map>
#include <memory>
#include <string>
#include <tuple>
#include <utility>

using namespace llvm;

#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME

#ifndef NDEBUG
const char VerboseDebug[] = DEBUG_TYPE "-verbose";
#endif

/// @{
/// Metadata attribute names
const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
const char LLVMLoopVectorizeFollowupVectorized[] =
    "llvm.loop.vectorize.followup_vectorized";
const char LLVMLoopVectorizeFollowupEpilogue[] =
    "llvm.loop.vectorize.followup_epilogue";
/// @}

STATISTIC(LoopsVectorized, "Number of loops vectorized");
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");

static cl::opt<bool> EnableEpilogueVectorization(
    "enable-epilogue-vectorization", cl::init(true), cl::Hidden,
    cl::desc("Enable vectorization of epilogue loops."));

static cl::opt<unsigned> EpilogueVectorizationForceVF(
    "epilogue-vectorization-force-VF", cl::init(1), cl::Hidden,
    cl::desc("When epilogue vectorization is enabled, and a value greater than "
             "1 is specified, forces the given VF for all applicable epilogue "
             "loops."));

static cl::opt<unsigned> EpilogueVectorizationMinVF(
    "epilogue-vectorization-minimum-VF", cl::init(16), cl::Hidden,
    cl::desc("Only loops with vectorization factor equal to or larger than "
             "the specified value are considered for epilogue vectorization."));

/// Loops with a known constant trip count below this number are vectorized only
/// if no scalar iteration overheads are incurred.
static cl::opt<unsigned> TinyTripCountVectorThreshold(
    "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
    cl::desc("Loops with a constant trip count that is smaller than this "
             "value are vectorized only if no scalar iteration overheads "
             "are incurred."));

static cl::opt<unsigned> VectorizeMemoryCheckThreshold(
    "vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
    cl::desc("The maximum allowed number of runtime memory checks"));

// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired
// and that predication is preferred; the enum below lists the possible
// settings. I.e., the vectorizer will try to fold the tail loop (epilogue) into
// the vector body and predicate the instructions accordingly. If tail-folding
// fails, there are different fallback strategies depending on these values:
namespace PreferPredicateTy {
  enum Option {
    ScalarEpilogue = 0,
    PredicateElseScalarEpilogue,
    PredicateOrDontVectorize
  };
} // namespace PreferPredicateTy

static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
    "prefer-predicate-over-epilogue",
    cl::init(PreferPredicateTy::ScalarEpilogue),
    cl::Hidden,
    cl::desc("Tail-folding and predication preferences over creating a scalar "
             "epilogue loop."),
    cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
                          "scalar-epilogue",
                          "Don't tail-predicate loops, create scalar epilogue"),
               clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
                          "predicate-else-scalar-epilogue",
                          "prefer tail-folding, create scalar epilogue if tail "
                          "folding fails."),
               clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
                          "predicate-dont-vectorize",
                          "prefer tail-folding, don't attempt vectorization if "
                          "tail-folding fails.")));
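
// For example (an illustrative invocation, not the only way to set it), the
// option can be passed to opt as
//   opt -passes=loop-vectorize \
//       -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue ...
// or through clang via -mllvm -prefer-predicate-over-epilogue=<value>.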

static cl::opt<bool> MaximizeBandwidth(
    "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
    cl::desc("Maximize bandwidth when selecting vectorization factor which "
             "will be determined by the smallest type in the loop."));

static cl::opt<bool> EnableInterleavedMemAccesses(
    "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on interleaved memory accesses in a loop"));

/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
    "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
    cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));

static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
    "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
    cl::desc("We don't interleave loops with an estimated constant trip count "
             "below this number"));

static cl::opt<unsigned> ForceTargetNumScalarRegs(
    "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of scalar registers."));

static cl::opt<unsigned> ForceTargetNumVectorRegs(
    "force-target-num-vector-regs", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's number of vector registers."));

static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
    "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "scalar loops."));

static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
    "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's max interleave factor for "
             "vectorized loops."));

static cl::opt<unsigned> ForceTargetInstructionCost(
    "force-target-instruction-cost", cl::init(0), cl::Hidden,
    cl::desc("A flag that overrides the target's expected cost for "
             "an instruction to a single constant value. Mostly "
             "useful for getting consistent testing."));

static cl::opt<bool> ForceTargetSupportsScalableVectors(
    "force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
    cl::desc(
        "Pretend that scalable vectors are supported, even if the target does "
        "not support them. This flag should only be used for testing."));

static cl::opt<unsigned> SmallLoopCost(
    "small-loop-cost", cl::init(20), cl::Hidden,
    cl::desc(
        "The cost of a loop that is considered 'small' by the interleaver."));

static cl::opt<bool> LoopVectorizeWithBlockFrequency(
    "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
    cl::desc("Enable the use of the block frequency analysis to access PGO "
             "heuristics minimizing code growth in cold regions and being more "
             "aggressive in hot regions."));

// Runtime interleave loops for load/store throughput.
static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
    "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
    cl::desc(
        "Enable runtime interleaving until load/store ports are saturated"));

/// Interleave small loops with scalar reductions.
static cl::opt<bool> InterleaveSmallLoopScalarReduction(
    "interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
    cl::desc("Enable interleaving for loops with small iteration counts that "
             "contain scalar reductions to expose ILP."));

/// The number of stores in a loop that are allowed to need predication.
static cl::opt<unsigned> NumberOfStoresToPredicate(
    "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
    cl::desc("Max number of stores to be predicated behind an if."));

static cl::opt<bool> EnableIndVarRegisterHeur(
    "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
    cl::desc("Count the induction variable only once when interleaving"));

static cl::opt<bool> EnableCondStoresVectorization(
    "enable-cond-stores-vec", cl::init(true), cl::Hidden,
    cl::desc("Enable if predication of stores during vectorization."));

static cl::opt<unsigned> MaxNestedScalarReductionIC(
    "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
    cl::desc("The maximum interleave count to use when interleaving a scalar "
             "reduction in a nested loop."));

static cl::opt<bool>
    PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
                           cl::Hidden,
                           cl::desc("Prefer in-loop vector reductions, "
                                    "overriding the target's preference."));

static cl::opt<bool> ForceOrderedReductions(
    "force-ordered-reductions", cl::init(false), cl::Hidden,
    cl::desc("Enable the vectorization of loops with in-order (strict) "
             "FP reductions"));

static cl::opt<bool> PreferPredicatedReductionSelect(
    "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
    cl::desc(
        "Prefer predicating a reduction operation over an after loop select."));

cl::opt<bool> EnableVPlanNativePath(
    "enable-vplan-native-path", cl::init(false), cl::Hidden,
    cl::desc("Enable VPlan-native vectorization path with "
             "support for outer loop vectorization."));

// This flag enables the stress testing of the VPlan H-CFG construction in the
// VPlan-native vectorization path. It must be used in conjunction with
// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
// verification of the H-CFGs built.
static cl::opt<bool> VPlanBuildStressTest(
    "vplan-build-stress-test", cl::init(false), cl::Hidden,
    cl::desc(
        "Build VPlan for every supported loop nest in the function and bail "
        "out right after the build (stress test the VPlan H-CFG construction "
        "in the VPlan-native vectorization path)."));

cl::opt<bool> llvm::EnableLoopInterleaving(
    "interleave-loops", cl::init(true), cl::Hidden,
    cl::desc("Enable loop interleaving in Loop vectorization passes"));
cl::opt<bool> llvm::EnableLoopVectorization(
    "vectorize-loops", cl::init(true), cl::Hidden,
    cl::desc("Run the Loop vectorization passes"));

cl::opt<bool> PrintVPlansInDotFormat(
    "vplan-print-in-dot-format", cl::init(false), cl::Hidden,
    cl::desc("Use dot format instead of plain text when dumping VPlans"));

/// A helper function that returns true if the given type is irregular. The
/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type.
static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
  // Determine if an array of N elements of type Ty is "bitcast compatible"
  // with a <N x Ty> vector.
  // This is only true if there is no padding between the array elements.
  return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
}
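
// Illustrative example (assuming a typical data layout, not any particular
// target): i1 has a type size of 1 bit but an alloc size of 8 bits, so it is
// treated as irregular, whereas i32 (32-bit type size, 32-bit alloc size) is
// not.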

/// A helper function that returns the reciprocal of the block probability of
/// predicated blocks. If we return X, we are assuming the predicated block
/// will execute once for every X iterations of the loop header.
///
/// TODO: We should use actual block probability here, if available. Currently,
///       we always assume predicated blocks have a 50% chance of executing.
static unsigned getReciprocalPredBlockProb() { return 2; }
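
// For example (a sketch of how this value feeds the cost model, under the 50%
// assumption above): the cost of a predicated block's body is divided by the
// returned value, so a block body costing 8 is accounted as 8 / 2 = 4 per
// header iteration.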

/// A helper function that returns an integer or floating-point constant with
/// value C.
static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
                           : ConstantFP::get(Ty, C);
}

/// Returns "best known" trip count for the specified loop \p L as defined by
/// the following procedure:
///   1) Returns exact trip count if it is known.
///   2) Returns expected trip count according to profile data if any.
///   3) Returns upper bound estimate if it is known.
///   4) Returns None if all of the above failed.
static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
  // Check if exact trip count is known.
  if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
    return ExpectedTC;

  // Check if there is an expected trip count available from profile data.
  if (LoopVectorizeWithBlockFrequency)
    if (auto EstimatedTC = getLoopEstimatedTripCount(L))
      return EstimatedTC;

  // Check if upper bound estimate is known.
  if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
    return ExpectedTC;

  return None;
}
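
// Illustrative example (hypothetical loop, not taken from a real test): if
// SCEV cannot prove an exact trip count but branch-weight profile metadata
// suggests roughly 1000 iterations, step (2) returns 1000; with no exact
// count, no profile data and no small upper bound, the function returns None.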

// Forward declare GeneratedRTChecks.
class GeneratedRTChecks;

namespace llvm {

AnalysisKey ShouldRunExtraVectorPasses::Key;

/// InnerLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
/// This class performs the widening of scalars into vectors, or multiple
/// scalars. This class also implements the following features:
/// * It inserts an epilogue loop for handling loops that don't have iteration
///   counts that are known to be a multiple of the vectorization factor.
/// * It handles the code generation for reduction variables.
/// * Scalarization (implementation using scalars) of un-vectorizable
///   instructions.
/// InnerLoopVectorizer does not perform any vectorization-legality
/// checks, and relies on the caller to check for the different legality
/// aspects. The InnerLoopVectorizer relies on the
/// LoopVectorizationLegality class to provide information about the induction
/// and reduction variables that were found for a given vectorization factor.
class InnerLoopVectorizer {
public:
  InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                      LoopInfo *LI, DominatorTree *DT,
                      const TargetLibraryInfo *TLI,
                      const TargetTransformInfo *TTI, AssumptionCache *AC,
                      OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
                      ElementCount MinProfitableTripCount,
                      unsigned UnrollFactor, LoopVectorizationLegality *LVL,
                      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                      ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
      : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
        AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
        Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
        PSI(PSI), RTChecks(RTChecks) {
    // Query this against the original loop and save it here because the profile
    // of the original loop header may change as the transformation happens.
    OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize(
        OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);

    if (MinProfitableTripCount.isZero())
      this->MinProfitableTripCount = VecWidth;
    else
      this->MinProfitableTripCount = MinProfitableTripCount;
  }

  virtual ~InnerLoopVectorizer() = default;

  /// Create a new empty loop that will contain vectorized instructions later
  /// on, while the old loop will be used as the scalar remainder. Control flow
  /// is generated around the vectorized (and scalar epilogue) loops consisting
  /// of various checks and bypasses. Return the pre-header block of the new
  /// loop and the start value for the canonical induction, if it is != 0. The
  /// latter is the case when vectorizing the epilogue loop. In the case of
  /// epilogue vectorization, this function is overridden to handle the more
  /// complex control flow around the loops.
  virtual std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton();

  /// Widen a single call instruction within the innermost loop.
  void widenCallInstruction(CallInst &CI, VPValue *Def, VPUser &ArgOperands,
                            VPTransformState &State);

  /// Fix the vectorized code, taking care of header phis, live-outs, and more.
  void fixVectorizedLoop(VPTransformState &State, VPlan &Plan);

  // Return true if any runtime check is added.
  bool areSafetyChecksAdded() { return AddedSafetyChecks; }

  /// A type for vectorized values in the new loop. Each value from the
  /// original loop, when vectorized, is represented by UF vector values in the
  /// new unrolled loop, where UF is the unroll factor.
  using VectorParts = SmallVector<Value *, 2>;

  /// A helper function to scalarize a single Instruction in the innermost loop.
  /// Generates a sequence of scalar instances for each lane between \p MinLane
  /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
  /// inclusive. Uses the VPValue operands from \p RepRecipe instead of \p
  /// Instr's operands.
  void scalarizeInstruction(Instruction *Instr, VPReplicateRecipe *RepRecipe,
                            const VPIteration &Instance, bool IfPredicateInstr,
                            VPTransformState &State);

  /// Construct the vector value of a scalarized value \p V one lane at a time.
  void packScalarIntoVectorValue(VPValue *Def, const VPIteration &Instance,
                                 VPTransformState &State);

  /// Try to vectorize interleaved access group \p Group with the base address
  /// given in \p Addr, optionally masking the vector operations if \p
  /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
  /// values in the vectorized loop.
  void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
                                ArrayRef<VPValue *> VPDefs,
                                VPTransformState &State, VPValue *Addr,
                                ArrayRef<VPValue *> StoredValues,
                                VPValue *BlockInMask = nullptr);

  /// Fix the non-induction PHIs in \p Plan.
  void fixNonInductionPHIs(VPlan &Plan, VPTransformState &State);

  /// Returns true if the reordering of FP operations is not allowed, but we are
  /// able to vectorize with strict in-order reductions for the given RdxDesc.
  bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc);

  /// Create a broadcast instruction. This method generates a broadcast
  /// instruction (shuffle) for loop invariant values and for the induction
  /// value. If this is the induction variable then we extend it to N, N+1, ...
  /// this is needed because each iteration in the loop corresponds to a SIMD
  /// element.
  virtual Value *getBroadcastInstrs(Value *V);

  // Returns the resume value (bc.merge.rdx) for a reduction as
  // generated by fixReduction.
  PHINode *getReductionResumeValue(const RecurrenceDescriptor &RdxDesc);

protected:
  friend class LoopVectorizationPlanner;

  /// A small list of PHINodes.
  using PhiVector = SmallVector<PHINode *, 4>;

  /// A type for scalarized values in the new loop. Each value from the
  /// original loop, when scalarized, is represented by UF x VF scalar values
  /// in the new unrolled loop, where UF is the unroll factor and VF is the
  /// vectorization factor.
  using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;

  /// Set up the values of the IVs correctly when exiting the vector loop.
  void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
                    Value *VectorTripCount, Value *EndValue,
                    BasicBlock *MiddleBlock, BasicBlock *VectorHeader,
                    VPlan &Plan);

  /// Handle all cross-iteration phis in the header.
  void fixCrossIterationPHIs(VPTransformState &State);

  /// Create the exit value of first order recurrences in the middle block and
  /// update their users.
  void fixFirstOrderRecurrence(VPFirstOrderRecurrencePHIRecipe *PhiR,
                               VPTransformState &State);

  /// Create code for the loop exit value of the reduction.
  void fixReduction(VPReductionPHIRecipe *Phi, VPTransformState &State);

  /// Clear NSW/NUW flags from reduction instructions if necessary.
  void clearReductionWrapFlags(VPReductionPHIRecipe *PhiR,
                               VPTransformState &State);

  /// Iteratively sink the scalarized operands of a predicated instruction into
  /// the block that was created for it.
  void sinkScalarOperands(Instruction *PredInst);

  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths(VPTransformState &State);

  /// Returns (and creates if needed) the original loop trip count.
  Value *getOrCreateTripCount(BasicBlock *InsertBlock);

  /// Returns (and creates if needed) the trip count of the widened loop.
  Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);

  /// Returns a bitcasted value to the requested vector type.
  /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
  Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
                                const DataLayout &DL);

  /// Emit a bypass check to see if the vector trip count is zero, including if
  /// it overflows.
  void emitIterationCountCheck(BasicBlock *Bypass);

  /// Emit a bypass check to see if all of the SCEV assumptions we've
  /// had to make are correct. Returns the block containing the checks or
  /// nullptr if no checks have been added.
  BasicBlock *emitSCEVChecks(BasicBlock *Bypass);

  /// Emit bypass checks to check any memory assumptions we may have made.
  /// Returns the block containing the checks or nullptr if no checks have been
  /// added.
  BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass);

  /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
  /// vector loop preheader, middle block and scalar preheader.
  void createVectorLoopSkeleton(StringRef Prefix);

  /// Create new phi nodes for the induction variables to resume iteration count
  /// in the scalar epilogue, from where the vectorized loop left off.
  /// In cases where the loop skeleton is more complicated (e.g. epilogue
  /// vectorization) and the resume values can come from an additional bypass
  /// block, the \p AdditionalBypass pair provides information about the bypass
  /// block and the end value on the edge from bypass to this loop.
  void createInductionResumeValues(
      std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr, nullptr});

  /// Complete the loop skeleton by adding debug MDs, creating appropriate
  /// conditional branches in the middle block, preparing the builder and
  /// running the verifier. Return the preheader of the completed vector loop.
  BasicBlock *completeLoopSkeleton(MDNode *OrigLoopID);

  /// Collect poison-generating recipes that may generate a poison value that is
  /// used after vectorization, even when their operands are not poison. Those
  /// recipes meet the following conditions:
  ///  * Contribute to the address computation of a recipe generating a widen
  ///    memory load/store (VPWidenMemoryInstructionRecipe or
  ///    VPInterleaveRecipe).
  ///  * Such a widen memory load/store has at least one underlying Instruction
  ///    that is in a basic block that needs predication and after vectorization
  ///    the generated instruction won't be predicated.
  void collectPoisonGeneratingRecipes(VPTransformState &State);

  /// Allow subclasses to override and print debug traces before/after vplan
  /// execution, when trace information is requested.
  virtual void printDebugTracesAtStart(){};
  virtual void printDebugTracesAtEnd(){};

  /// The original loop.
  Loop *OrigLoop;

  /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
  /// dynamic knowledge to simplify SCEV expressions and converts them to a
  /// more usable form.
  PredicatedScalarEvolution &PSE;

  /// Loop Info.
  LoopInfo *LI;

  /// Dominator Tree.
  DominatorTree *DT;

  /// Alias Analysis.
  AAResults *AA;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Target Transform Info.
  const TargetTransformInfo *TTI;

  /// Assumption Cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// The vectorization SIMD factor to use. Each vector will have this many
  /// vector elements.
  ElementCount VF;

  ElementCount MinProfitableTripCount;

  /// The vectorization unroll factor to use. Each scalar is vectorized to this
  /// many different vector instructions.
  unsigned UF;

  /// The builder that we use.
  IRBuilder<> Builder;

  // --- Vectorization state ---

  /// The vector-loop preheader.
  BasicBlock *LoopVectorPreHeader;

  /// The scalar-loop preheader.
  BasicBlock *LoopScalarPreHeader;

  /// Middle Block between the vector and the scalar.
  BasicBlock *LoopMiddleBlock;

  /// The unique ExitBlock of the scalar loop if one exists.  Note that
  /// there can be multiple exiting edges reaching this block.
  BasicBlock *LoopExitBlock;

  /// The scalar loop body.
  BasicBlock *LoopScalarBody;

  /// A list of all bypass blocks. The first block is the entry of the loop.
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;

  /// Store instructions that were predicated.
  SmallVector<Instruction *, 4> PredicatedInstructions;

  /// Trip count of the original loop.
  Value *TripCount = nullptr;

  /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
  Value *VectorTripCount = nullptr;

  /// The legality analysis.
  LoopVectorizationLegality *Legal;

  /// The profitability analysis.
  LoopVectorizationCostModel *Cost;

  // Record whether runtime checks are added.
  bool AddedSafetyChecks = false;

  // Holds the end values for each induction variable. We save the end values
  // so we can later fix-up the external users of the induction variables.
  DenseMap<PHINode *, Value *> IVEndValues;

  /// BFI and PSI are used to check for profile guided size optimizations.
  BlockFrequencyInfo *BFI;
  ProfileSummaryInfo *PSI;

  // Whether this loop should be optimized for size based on profile guided size
  // optimizations.
  bool OptForSizeBasedOnProfile;

  /// Structure to hold information about generated runtime checks, responsible
  /// for cleaning the checks, if vectorization turns out unprofitable.
  GeneratedRTChecks &RTChecks;

  // Holds the resume values for reductions in the loops, used to set the
  // correct start value of reduction PHIs when vectorizing the epilogue.
  SmallMapVector<const RecurrenceDescriptor *, PHINode *, 4>
      ReductionResumeValues;
};

class InnerLoopUnroller : public InnerLoopVectorizer {
public:
  InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
                    LoopInfo *LI, DominatorTree *DT,
                    const TargetLibraryInfo *TLI,
                    const TargetTransformInfo *TTI, AssumptionCache *AC,
                    OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
                    LoopVectorizationLegality *LVL,
                    LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
                    ProfileSummaryInfo *PSI, GeneratedRTChecks &Check)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            ElementCount::getFixed(1),
                            ElementCount::getFixed(1), UnrollFactor, LVL, CM,
                            BFI, PSI, Check) {}

private:
  Value *getBroadcastInstrs(Value *V) override;
};

/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
struct EpilogueLoopVectorizationInfo {
  ElementCount MainLoopVF = ElementCount::getFixed(0);
  unsigned MainLoopUF = 0;
  ElementCount EpilogueVF = ElementCount::getFixed(0);
  unsigned EpilogueUF = 0;
  BasicBlock *MainLoopIterationCountCheck = nullptr;
  BasicBlock *EpilogueIterationCountCheck = nullptr;
  BasicBlock *SCEVSafetyCheck = nullptr;
  BasicBlock *MemSafetyCheck = nullptr;
  Value *TripCount = nullptr;
  Value *VectorTripCount = nullptr;

  EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
                                ElementCount EVF, unsigned EUF)
      : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) {
    assert(EUF == 1 &&
           "A high UF for the epilogue loop is likely not beneficial.");
  }
};

/// An extension of the inner loop vectorizer that creates a skeleton for a
/// vectorized loop that has its epilogue (residual) also vectorized.
/// The idea is to run the vplan on a given loop twice, first to set up the
/// skeleton and vectorize the main loop, and second to complete the skeleton
/// from the first step and vectorize the epilogue.  This is achieved by
/// deriving two concrete strategy classes from this base class and invoking
/// them in succession from the loop vectorizer planner.
class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
public:
  InnerLoopAndEpilogueVectorizer(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                            EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL,
                            CM, BFI, PSI, Checks),
        EPI(EPI) {}

  // Override this function to handle the more complex control flow around the
  // three loops.
  std::pair<BasicBlock *, Value *> createVectorizedLoopSkeleton() final {
    return createEpilogueVectorizedLoopSkeleton();
  }

  /// The interface for creating a vectorized skeleton using one of two
  /// different strategies, each corresponding to one execution of the vplan
  /// as described above.
  virtual std::pair<BasicBlock *, Value *>
  createEpilogueVectorizedLoopSkeleton() = 0;

  /// Holds and updates state information required to vectorize the main loop
  /// and its epilogue in two separate passes. This setup helps us avoid
  /// regenerating and recomputing runtime safety checks. It also helps us to
  /// shorten the iteration-count-check path length for the cases where the
  /// iteration count of the loop is so small that the main vector loop is
  /// completely skipped.
  EpilogueLoopVectorizationInfo &EPI;
};

/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *main* loops in the process of vectorizing loops and their
/// epilogues.
class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerMainLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Check)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Check) {}
  /// Implements the interface for creating a vectorized skeleton using the
  /// *main loop* strategy (i.e., the first pass of vplan execution).
  std::pair<BasicBlock *, Value *> createEpilogueVectorizedLoopSkeleton() final;

protected:
  /// Emits an iteration count bypass check once for the main loop (when \p
  /// ForEpilogue is false) and once for the epilogue loop (when \p
  /// ForEpilogue is true).
  BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};

// A specialized derived class of inner loop vectorizer that performs
// vectorization of *epilogue* loops in the process of vectorizing loops and
// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerEpilogueLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
      BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
      GeneratedRTChecks &Checks)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, LVL, CM, BFI, PSI, Checks) {
    TripCount = EPI.TripCount;
  }
  /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (i.e., the second pass of vplan execution).
  std::pair<BasicBlock *, Value *> createEpilogueVectorizedLoopSkeleton() final;

protected:
  /// Emits an iteration count bypass check after the main vector loop has
  /// finished to see if there are any iterations left to execute by either
  /// the vector epilogue or the scalar epilogue.
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(BasicBlock *Bypass,
                                                      BasicBlock *Insert);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
} // end namespace llvm

/// Look for a meaningful debug location on the instruction or its
/// operands.
static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
  if (!I)
    return I;

  DebugLoc Empty;
  if (I->getDebugLoc() != Empty)
    return I;

  for (Use &Op : I->operands()) {
    if (Instruction *OpInst = dyn_cast<Instruction>(Op))
      if (OpInst->getDebugLoc() != Empty)
        return OpInst;
  }

  return I;
}

/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
/// is passed, the message relates to that particular instruction.
#ifndef NDEBUG
static void debugVectorizationMessage(const StringRef Prefix,
                                      const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: " << Prefix << DebugMsg;
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif

/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
/// RemarkName is the identifier for the remark.  If \p I is passed it is an
/// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
/// the location of the remark.  \return the remark object that can be
/// streamed to.
static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
    StringRef RemarkName, Loop *TheLoop, Instruction *I) {
  Value *CodeRegion = TheLoop->getHeader();
  DebugLoc DL = TheLoop->getStartLoc();

  if (I) {
    CodeRegion = I->getParent();
    // If there is no debug location attached to the instruction, fall back to
    // using the loop's.
    if (I->getDebugLoc())
      DL = I->getDebugLoc();
  }

  return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
}

namespace llvm {

/// Return a value for Step multiplied by VF.
Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
                       int64_t Step) {
  assert(Ty->isIntegerTy() && "Expected an integer step");
  Constant *StepVal = ConstantInt::get(Ty, Step * VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
}
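
// For example (illustrative values only): with Step = 2 and a fixed VF of
// <4 x i32>, this returns the constant 8; with a scalable VF of
// <vscale x 4 x i32>, it returns 8 * vscale via CreateVScale.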

/// Return the runtime value for VF.
Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
  Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
  return VF.isScalable() ? B.CreateVScale(EC) : EC;
}

static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy,
                                  ElementCount VF) {
  assert(FTy->isFloatingPointTy() && "Expected floating point type!");
  Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
  Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
  return B.CreateUIToFP(RuntimeVF, FTy);
}

void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << "loop not vectorized: " << OREMsg);
}

void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
                             OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                             Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      createLVAnalysis(Hints.vectorizeAnalysisPassName(), ORETag, TheLoop, I)
      << Msg);
}

} // end namespace llvm

#ifndef NDEBUG
/// \return string containing a file name and a line # for the given loop.
static std::string getDebugLocString(const Loop *L) {
  std::string Result;
  if (L) {
    raw_string_ostream OS(Result);
    if (const DebugLoc LoopDbgLoc = L->getStartLoc())
      LoopDbgLoc.print(OS);
    else
      // Just print the module name.
      OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
    OS.flush();
  }
  return Result;
}
#endif

void InnerLoopVectorizer::collectPoisonGeneratingRecipes(
    VPTransformState &State) {

  // Collect recipes in the backward slice of `Root` that may generate a poison
  // value that is used after vectorization.
  SmallPtrSet<VPRecipeBase *, 16> Visited;
  auto collectPoisonGeneratingInstrsInBackwardSlice([&](VPRecipeBase *Root) {
    SmallVector<VPRecipeBase *, 16> Worklist;
    Worklist.push_back(Root);

    // Traverse the backward slice of Root through its use-def chain.
    while (!Worklist.empty()) {
      VPRecipeBase *CurRec = Worklist.back();
      Worklist.pop_back();

      if (!Visited.insert(CurRec).second)
        continue;

      // Prune search if we find another recipe generating a widen memory
      // instruction. Widen memory instructions involved in address computation
      // will lead to gather/scatter instructions, which don't need to be
      // handled.
      if (isa<VPWidenMemoryInstructionRecipe>(CurRec) ||
          isa<VPInterleaveRecipe>(CurRec) ||
          isa<VPScalarIVStepsRecipe>(CurRec) ||
          isa<VPCanonicalIVPHIRecipe>(CurRec) ||
          isa<VPActiveLaneMaskPHIRecipe>(CurRec))
        continue;

      // This recipe contributes to the address computation of a widen
      // load/store. Collect recipe if its underlying instruction has
      // poison-generating flags.
      Instruction *Instr = CurRec->getUnderlyingInstr();
      if (Instr && Instr->hasPoisonGeneratingFlags())
        State.MayGeneratePoisonRecipes.insert(CurRec);

      // Add new definitions to the worklist.
      for (VPValue *operand : CurRec->operands())
        if (VPDef *OpDef = operand->getDef())
          Worklist.push_back(cast<VPRecipeBase>(OpDef));
    }
  });

  // Traverse all the recipes in the VPlan and collect the poison-generating
  // recipes in the backward slice starting at the address of a
  // VPWidenMemoryInstructionRecipe or VPInterleaveRecipe.
  auto Iter = depth_first(
      VPBlockRecursiveTraversalWrapper<VPBlockBase *>(State.Plan->getEntry()));
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
    for (VPRecipeBase &Recipe : *VPBB) {
      if (auto *WidenRec = dyn_cast<VPWidenMemoryInstructionRecipe>(&Recipe)) {
        Instruction &UnderlyingInstr = WidenRec->getIngredient();
        VPDef *AddrDef = WidenRec->getAddr()->getDef();
        if (AddrDef && WidenRec->isConsecutive() &&
            Legal->blockNeedsPredication(UnderlyingInstr.getParent()))
          collectPoisonGeneratingInstrsInBackwardSlice(
              cast<VPRecipeBase>(AddrDef));
      } else if (auto *InterleaveRec = dyn_cast<VPInterleaveRecipe>(&Recipe)) {
        VPDef *AddrDef = InterleaveRec->getAddr()->getDef();
        if (AddrDef) {
          // Check if any member of the interleave group needs predication.
          const InterleaveGroup<Instruction> *InterGroup =
              InterleaveRec->getInterleaveGroup();
          bool NeedPredication = false;
          for (int I = 0, NumMembers = InterGroup->getNumMembers();
               I < NumMembers; ++I) {
            Instruction *Member = InterGroup->getMember(I);
            if (Member)
              NeedPredication |=
                  Legal->blockNeedsPredication(Member->getParent());
          }

          if (NeedPredication)
            collectPoisonGeneratingInstrsInBackwardSlice(
                cast<VPRecipeBase>(AddrDef));
        }
      }
    }
  }
}
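
// Illustrative scenario (a sketch, not a specific regression test): a
// getelementptr inbounds that feeds the address of a consecutive widened load
// may yield poison for a lane whose predicate is false, even though the
// scalar loop would never have evaluated that lane; recording its recipe in
// MayGeneratePoisonRecipes lets code generation later drop the
// poison-generating flags.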

PHINode *InnerLoopVectorizer::getReductionResumeValue(
    const RecurrenceDescriptor &RdxDesc) {
  auto It = ReductionResumeValues.find(&RdxDesc);
  assert(It != ReductionResumeValues.end() &&
         "Expected to find a resume value for the reduction.");
  return It->second;
}

namespace llvm {

// Hints from the loop vectorization cost model on how the scalar epilogue loop
// should be lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate,

  // Directive indicating we must either tail fold or not vectorize.
  CM_ScalarEpilogueNotAllowedUsePredicate
};

/// ElementCountComparator creates a total ordering for ElementCount
/// for the purposes of using it in a set structure.
struct ElementCountComparator {
  bool operator()(const ElementCount &LHS, const ElementCount &RHS) const {
    return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) <
           std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue());
  }
};
using ElementCountSet = SmallSet<ElementCount, 16, ElementCountComparator>;
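
// For example, under this ordering all fixed element counts precede all
// scalable ones, so a candidate set would iterate as 2, 4, 8, vscale x 2,
// vscale x 4 (illustrative values, not a claim about which VFs are actually
// generated).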
1125 
1126 /// LoopVectorizationCostModel - estimates the expected speedups due to
1127 /// vectorization.
1128 /// In many cases vectorization is not profitable. This can happen because of
1129 /// a number of reasons. In this class we mainly attempt to predict the
1130 /// expected speedup/slowdowns due to the supported instruction set. We use the
1131 /// TargetTransformInfo to query the different backends for the cost of
1132 /// different operations.
1133 class LoopVectorizationCostModel {
1134 public:
1135   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
1136                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
1137                              LoopVectorizationLegality *Legal,
1138                              const TargetTransformInfo &TTI,
1139                              const TargetLibraryInfo *TLI, DemandedBits *DB,
1140                              AssumptionCache *AC,
1141                              OptimizationRemarkEmitter *ORE, const Function *F,
1142                              const LoopVectorizeHints *Hints,
1143                              InterleavedAccessInfo &IAI)
1144       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
1145         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
1146         Hints(Hints), InterleaveInfo(IAI) {}
1147 
1148   /// \return An upper bound for the vectorization factors (both fixed and
1149   /// scalable). If the factors are 0, vectorization and interleaving should be
1150   /// avoided up front.
1151   FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
1152 
1153   /// \return True if runtime checks are required for vectorization, and false
1154   /// otherwise.
1155   bool runtimeChecksRequired();
1156 
1157   /// \return The most profitable vectorization factor and the cost of that VF.
1158   /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO
1159   /// then this vectorization factor will be selected if vectorization is
1160   /// possible.
1161   VectorizationFactor
1162   selectVectorizationFactor(const ElementCountSet &CandidateVFs);
1163 
1164   VectorizationFactor
1165   selectEpilogueVectorizationFactor(const ElementCount MaxVF,
1166                                     const LoopVectorizationPlanner &LVP);
1167 
1168   /// Setup cost-based decisions for user vectorization factor.
1169   /// \return true if the UserVF is a feasible VF to be chosen.
1170   bool selectUserVectorizationFactor(ElementCount UserVF) {
1171     collectUniformsAndScalars(UserVF);
1172     collectInstsToScalarize(UserVF);
1173     return expectedCost(UserVF).first.isValid();
1174   }
1175 
1176   /// \return The size (in bits) of the smallest and widest types in the code
1177   /// that needs to be vectorized. We ignore values that remain scalar such as
1178   /// 64 bit loop indices.
1179   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
1180 
1181   /// \return The desired interleave count.
1182   /// If interleave count has been specified by metadata it will be returned.
1183   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1184   /// are the selected vectorization factor and the cost of the selected VF.
1185   unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);
1186 
1187   /// Memory access instruction may be vectorized in more than one way.
1188   /// Form of instruction after vectorization depends on cost.
1189   /// This function takes cost-based decisions for Load/Store instructions
1190   /// and collects them in a map. This decisions map is used for building
1191   /// the lists of loop-uniform and loop-scalar instructions.
1192   /// The calculated cost is saved with widening decision in order to
1193   /// avoid redundant calculations.
1194   void setCostBasedWideningDecision(ElementCount VF);
1195 
1196   /// A struct that represents some properties of the register usage
1197   /// of a loop.
1198   struct RegisterUsage {
1199     /// Holds the number of loop invariant values that are used in the loop.
1200     /// The key is ClassID of target-provided register class.
1201     SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1202     /// Holds the maximum number of concurrent live intervals in the loop.
1203     /// The key is ClassID of target-provided register class.
1204     SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1205   };
1206 
  /// \return Information about the register usage of the loop for the
1208   /// given vectorization factors.
1209   SmallVector<RegisterUsage, 8>
1210   calculateRegisterUsage(ArrayRef<ElementCount> VFs);
1211 
1212   /// Collect values we want to ignore in the cost model.
1213   void collectValuesToIgnore();
1214 
1215   /// Collect all element types in the loop for which widening is needed.
1216   void collectElementTypesForWidening();
1217 
1218   /// Split reductions into those that happen in the loop, and those that happen
  /// outside. In-loop reductions are collected into InLoopReductionChains.
1220   void collectInLoopReductions();
1221 
  /// Returns true if we should use strict in-order reductions for the given
  /// RdxDesc. This is true if the IsOrdered flag of RdxDesc is set and we do
  /// not allow reordering of FP operations.
1226   bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
1227     return !Hints->allowReordering() && RdxDesc.isOrdered();
1228   }
1229 
1230   /// \returns The smallest bitwidth each instruction can be represented with.
1231   /// The vector equivalents of these instructions should be truncated to this
1232   /// type.
1233   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1234     return MinBWs;
1235   }
1236 
1237   /// \returns True if it is more profitable to scalarize instruction \p I for
1238   /// vectorization factor \p VF.
1239   bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
1240     assert(VF.isVector() &&
1241            "Profitable to scalarize relevant only for VF > 1.");
1242 
1243     // Cost model is not run in the VPlan-native path - return conservative
1244     // result until this changes.
1245     if (EnableVPlanNativePath)
1246       return false;
1247 
1248     auto Scalars = InstsToScalarize.find(VF);
1249     assert(Scalars != InstsToScalarize.end() &&
1250            "VF not yet analyzed for scalarization profitability");
1251     return Scalars->second.find(I) != Scalars->second.end();
1252   }
1253 
1254   /// Returns true if \p I is known to be uniform after vectorization.
1255   bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
1256     if (VF.isScalar())
1257       return true;
1258 
1259     // Cost model is not run in the VPlan-native path - return conservative
1260     // result until this changes.
1261     if (EnableVPlanNativePath)
1262       return false;
1263 
1264     auto UniformsPerVF = Uniforms.find(VF);
1265     assert(UniformsPerVF != Uniforms.end() &&
1266            "VF not yet analyzed for uniformity");
1267     return UniformsPerVF->second.count(I);
1268   }
1269 
1270   /// Returns true if \p I is known to be scalar after vectorization.
1271   bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
1272     if (VF.isScalar())
1273       return true;
1274 
1275     // Cost model is not run in the VPlan-native path - return conservative
1276     // result until this changes.
1277     if (EnableVPlanNativePath)
1278       return false;
1279 
1280     auto ScalarsPerVF = Scalars.find(VF);
1281     assert(ScalarsPerVF != Scalars.end() &&
1282            "Scalar values are not calculated for VF");
1283     return ScalarsPerVF->second.count(I);
1284   }
1285 
1286   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1287   /// for vectorization factor \p VF.
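  /// For example, if an i32 add is only used via a truncate to i8, MinBWs may
  /// record 8 bits for it, so its widened form can operate on <VF x i8>
  /// rather than <VF x i32>.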
1288   bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
1289     return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
1290            !isProfitableToScalarize(I, VF) &&
1291            !isScalarAfterVectorization(I, VF);
1292   }
1293 
1294   /// Decision that was taken during cost calculation for memory instruction.
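  /// For example, for a loop over A[i], a load of A[i] (stride +1) is a
  /// candidate for CM_Widen, a load of A[N-i] (stride -1) for
  /// CM_Widen_Reverse, a load of A[B[i]] for CM_GatherScatter, and accesses in
  /// a group such as A[2*i] and A[2*i+1] for CM_Interleave; the final choice
  /// is cost-based.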
1295   enum InstWidening {
1296     CM_Unknown,
1297     CM_Widen,         // For consecutive accesses with stride +1.
1298     CM_Widen_Reverse, // For consecutive accesses with stride -1.
1299     CM_Interleave,
1300     CM_GatherScatter,
1301     CM_Scalarize
1302   };
1303 
1304   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1305   /// instruction \p I and vector width \p VF.
1306   void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
1307                            InstructionCost Cost) {
1308     assert(VF.isVector() && "Expected VF >=2");
1309     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1310   }
1311 
1312   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1313   /// interleaving group \p Grp and vector width \p VF.
1314   void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1315                            ElementCount VF, InstWidening W,
1316                            InstructionCost Cost) {
1317     assert(VF.isVector() && "Expected VF >=2");
    // Broadcast this decision to all instructions inside the group, but assign
    // the cost to one instruction only.
1320     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1321       if (auto *I = Grp->getMember(i)) {
1322         if (Grp->getInsertPos() == I)
1323           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1324         else
1325           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1326       }
1327     }
1328   }
1329 
1330   /// Return the cost model decision for the given instruction \p I and vector
1331   /// width \p VF. Return CM_Unknown if this instruction did not pass
1332   /// through the cost modeling.
1333   InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
1334     assert(VF.isVector() && "Expected VF to be a vector VF");
1335     // Cost model is not run in the VPlan-native path - return conservative
1336     // result until this changes.
1337     if (EnableVPlanNativePath)
1338       return CM_GatherScatter;
1339 
1340     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1341     auto Itr = WideningDecisions.find(InstOnVF);
1342     if (Itr == WideningDecisions.end())
1343       return CM_Unknown;
1344     return Itr->second.first;
1345   }
1346 
1347   /// Return the vectorization cost for the given instruction \p I and vector
1348   /// width \p VF.
1349   InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
1350     assert(VF.isVector() && "Expected VF >=2");
1351     std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
1352     assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1353            "The cost is not calculated");
1354     return WideningDecisions[InstOnVF].second;
1355   }
1356 
1357   /// Return True if instruction \p I is an optimizable truncate whose operand
1358   /// is an induction variable. Such a truncate will be removed by adding a new
1359   /// induction variable with the destination type.
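  /// For instance, a 'trunc i64 %iv to i32' of an induction %iv can be
  /// replaced by introducing a parallel i32 induction with the truncated start
  /// and step, so the truncate disappears from the loop body.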
1360   bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
1361     // If the instruction is not a truncate, return false.
1362     auto *Trunc = dyn_cast<TruncInst>(I);
1363     if (!Trunc)
1364       return false;
1365 
1366     // Get the source and destination types of the truncate.
1367     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1368     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1369 
1370     // If the truncate is free for the given types, return false. Replacing a
1371     // free truncate with an induction variable would add an induction variable
1372     // update instruction to each iteration of the loop. We exclude from this
1373     // check the primary induction variable since it will need an update
1374     // instruction regardless.
1375     Value *Op = Trunc->getOperand(0);
1376     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1377       return false;
1378 
1379     // If the truncated value is not an induction variable, return false.
1380     return Legal->isInductionPhi(Op);
1381   }
1382 
1383   /// Collects the instructions to scalarize for each predicated instruction in
1384   /// the loop.
1385   void collectInstsToScalarize(ElementCount VF);
1386 
1387   /// Collect Uniform and Scalar values for the given \p VF.
1388   /// The sets depend on CM decision for Load/Store instructions
1389   /// that may be vectorized as interleave, gather-scatter or scalarized.
1390   void collectUniformsAndScalars(ElementCount VF) {
1391     // Do the analysis once.
1392     if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
1393       return;
1394     setCostBasedWideningDecision(VF);
1395     collectLoopUniforms(VF);
1396     collectLoopScalars(VF);
1397   }
1398 
1399   /// Returns true if the target machine supports masked store operation
1400   /// for the given \p DataType and kind of access to \p Ptr.
1401   bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) const {
1402     return Legal->isConsecutivePtr(DataType, Ptr) &&
1403            TTI.isLegalMaskedStore(DataType, Alignment);
1404   }
1405 
1406   /// Returns true if the target machine supports masked load operation
1407   /// for the given \p DataType and kind of access to \p Ptr.
1408   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) const {
1409     return Legal->isConsecutivePtr(DataType, Ptr) &&
1410            TTI.isLegalMaskedLoad(DataType, Alignment);
1411   }
1412 
1413   /// Returns true if the target machine can represent \p V as a masked gather
1414   /// or scatter operation.
1415   bool isLegalGatherOrScatter(Value *V,
1416                               ElementCount VF = ElementCount::getFixed(1)) {
1417     bool LI = isa<LoadInst>(V);
1418     bool SI = isa<StoreInst>(V);
1419     if (!LI && !SI)
1420       return false;
1421     auto *Ty = getLoadStoreType(V);
1422     Align Align = getLoadStoreAlignment(V);
1423     if (VF.isVector())
1424       Ty = VectorType::get(Ty, VF);
1425     return (LI && TTI.isLegalMaskedGather(Ty, Align)) ||
1426            (SI && TTI.isLegalMaskedScatter(Ty, Align));
1427   }
1428 
1429   /// Returns true if the target machine supports all of the reduction
1430   /// variables found for the given VF.
1431   bool canVectorizeReductions(ElementCount VF) const {
1432     return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
1433       const RecurrenceDescriptor &RdxDesc = Reduction.second;
1434       return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
1435     }));
1436   }
1437 
1438   /// Returns true if \p I is an instruction that will be scalarized with
1439   /// predication when vectorizing \p I with vectorization factor \p VF. Such
1440   /// instructions include conditional stores and instructions that may divide
1441   /// by zero.
1442   bool isScalarWithPredication(Instruction *I, ElementCount VF) const;
1443 
1444   // Returns true if \p I is an instruction that will be predicated either
1445   // through scalar predication or masked load/store or masked gather/scatter.
1446   // \p VF is the vectorization factor that will be used to vectorize \p I.
1447   // Superset of instructions that return true for isScalarWithPredication.
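  // Typical examples are a store that only executes under a condition inside
  // the loop, or a conditionally executed udiv/urem that might divide by zero.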
1448   bool isPredicatedInst(Instruction *I, ElementCount VF) {
1449     // When we know the load's address is loop invariant and the instruction
1450     // in the original scalar loop was unconditionally executed then we
1451     // don't need to mark it as a predicated instruction. Tail folding may
1452     // introduce additional predication, but we're guaranteed to always have
1453     // at least one active lane.  We call Legal->blockNeedsPredication here
1454     // because it doesn't query tail-folding.
1455     if (Legal->isUniformMemOp(*I) && isa<LoadInst>(I) &&
1456         !Legal->blockNeedsPredication(I->getParent()))
1457       return false;
1458     if (!blockNeedsPredicationForAnyReason(I->getParent()))
1459       return false;
1460     // Loads and stores that need some form of masked operation are predicated
1461     // instructions.
1462     if (isa<LoadInst>(I) || isa<StoreInst>(I))
1463       return Legal->isMaskRequired(I);
1464     return isScalarWithPredication(I, VF);
1465   }
1466 
1467   /// Returns true if \p I is a memory instruction with consecutive memory
1468   /// access that can be widened.
1469   bool
1470   memoryInstructionCanBeWidened(Instruction *I,
1471                                 ElementCount VF = ElementCount::getFixed(1));
1472 
1473   /// Returns true if \p I is a memory instruction in an interleaved-group
1474   /// of memory accesses that can be vectorized with wide vector loads/stores
1475   /// and shuffles.
1476   bool
1477   interleavedAccessCanBeWidened(Instruction *I,
1478                                 ElementCount VF = ElementCount::getFixed(1));
1479 
1480   /// Check if \p Instr belongs to any interleaved access group.
1481   bool isAccessInterleaved(Instruction *Instr) {
1482     return InterleaveInfo.isInterleaved(Instr);
1483   }
1484 
1485   /// Get the interleaved access group that \p Instr belongs to.
1486   const InterleaveGroup<Instruction> *
1487   getInterleavedAccessGroup(Instruction *Instr) {
1488     return InterleaveInfo.getInterleaveGroup(Instr);
1489   }
1490 
1491   /// Returns true if we're required to use a scalar epilogue for at least
1492   /// the final iteration of the original loop.
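  /// For example, an interleave group with gaps must not speculatively access
  /// elements beyond the last complete group, so the final iteration(s) have
  /// to run in the scalar remainder loop when vectorizing with a vector VF.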
1493   bool requiresScalarEpilogue(ElementCount VF) const {
1494     if (!isScalarEpilogueAllowed())
1495       return false;
    // If we might exit from anywhere but the latch, we must run the exiting
1497     // iteration in scalar form.
1498     if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
1499       return true;
1500     return VF.isVector() && InterleaveInfo.requiresScalarEpilogue();
1501   }
1502 
  /// Returns true if a scalar epilogue is allowed, i.e. it has not been
  /// disallowed due to optsize or a loop hint annotation.
1505   bool isScalarEpilogueAllowed() const {
1506     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1507   }
1508 
1509   /// Returns true if all loop blocks should be masked to fold tail loop.
1510   bool foldTailByMasking() const { return FoldTailByMasking; }
1511 
  /// Returns true if we're tail-folding and want to use the active lane mask
1513   /// for vector loop control flow.
1514   bool useActiveLaneMaskForControlFlow() const {
1515     return FoldTailByMasking &&
1516            TTI.emitGetActiveLaneMask() == PredicationStyle::DataAndControlFlow;
1517   }
1518 
  /// Returns true if the instructions in this block require predication
1520   /// for any reason, e.g. because tail folding now requires a predicate
1521   /// or because the block in the original loop was predicated.
1522   bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
1523     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1524   }
1525 
1526   /// A SmallMapVector to store the InLoop reduction op chains, mapping phi
1527   /// nodes to the chain of instructions representing the reductions. Uses a
1528   /// MapVector to ensure deterministic iteration order.
1529   using ReductionChainMap =
1530       SmallMapVector<PHINode *, SmallVector<Instruction *, 4>, 4>;
1531 
1532   /// Return the chain of instructions representing an inloop reduction.
1533   const ReductionChainMap &getInLoopReductionChains() const {
1534     return InLoopReductionChains;
1535   }
1536 
1537   /// Returns true if the Phi is part of an inloop reduction.
1538   bool isInLoopReduction(PHINode *Phi) const {
1539     return InLoopReductionChains.count(Phi);
1540   }
1541 
1542   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1543   /// with factor VF.  Return the cost of the instruction, including
1544   /// scalarization overhead if it's needed.
1545   InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1546 
1547   /// Estimate cost of a call instruction CI if it were vectorized with factor
1548   /// VF. Return the cost of the instruction, including scalarization overhead
1549   /// if it's needed. The flag NeedToScalarize shows if the call needs to be
  /// scalarized, i.e. either a vector version isn't available or it is too
  /// expensive.
1552   InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
1553                                     bool &NeedToScalarize) const;
1554 
1555   /// Returns true if the per-lane cost of VectorizationFactor A is lower than
1556   /// that of B.
1557   bool isMoreProfitable(const VectorizationFactor &A,
1558                         const VectorizationFactor &B) const;
1559 
1560   /// Invalidates decisions already taken by the cost model.
1561   void invalidateCostModelingDecisions() {
1562     WideningDecisions.clear();
1563     Uniforms.clear();
1564     Scalars.clear();
1565   }
1566 
  /// Convenience function that returns the value of vscale_range if
  /// vscale_range.min == vscale_range.max, or otherwise the value returned by
  /// the corresponding TTI method.
1570   Optional<unsigned> getVScaleForTuning() const;
1571 
1572 private:
1573   unsigned NumPredStores = 0;
1574 
1575   /// \return An upper bound for the vectorization factors for both
1576   /// fixed and scalable vectorization, where the minimum-known number of
1577   /// elements is a power-of-2 larger than zero. If scalable vectorization is
1578   /// disabled or unsupported, then the scalable part will be equal to
1579   /// ElementCount::getScalable(0).
1580   FixedScalableVFPair computeFeasibleMaxVF(unsigned ConstTripCount,
1581                                            ElementCount UserVF,
1582                                            bool FoldTailByMasking);
1583 
  /// \return the maximized element count based on the target's vector
  /// registers and the loop trip count, but limited to a maximum safe VF.
1586   /// This is a helper function of computeFeasibleMaxVF.
1587   ElementCount getMaximizedVFForTarget(unsigned ConstTripCount,
1588                                        unsigned SmallestType,
1589                                        unsigned WidestType,
1590                                        ElementCount MaxSafeVF,
1591                                        bool FoldTailByMasking);
1592 
1593   /// \return the maximum legal scalable VF, based on the safe max number
1594   /// of elements.
1595   ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
1596 
1597   /// The vectorization cost is a combination of the cost itself and a boolean
1598   /// indicating whether any of the contributing operations will actually
1599   /// operate on vector values after type legalization in the backend. If this
1600   /// latter value is false, then all operations will be scalarized (i.e. no
1601   /// vectorization has actually taken place).
1602   using VectorizationCostTy = std::pair<InstructionCost, bool>;
1603 
1604   /// Returns the expected execution cost. The unit of the cost does
1605   /// not matter because we use the 'cost' units to compare different
1606   /// vector widths. The cost that is returned is *not* normalized by
1607   /// the factor width. If \p Invalid is not nullptr, this function
1608   /// will add a pair(Instruction*, ElementCount) to \p Invalid for
1609   /// each instruction that has an Invalid cost for the given VF.
1610   using InstructionVFPair = std::pair<Instruction *, ElementCount>;
1611   VectorizationCostTy
1612   expectedCost(ElementCount VF,
1613                SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);
1614 
1615   /// Returns the execution time cost of an instruction for a given vector
1616   /// width. Vector width of one means scalar.
1617   VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1618 
1619   /// The cost-computation logic from getInstructionCost which provides
1620   /// the vector type as an output parameter.
1621   InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
1622                                      Type *&VectorTy);
1623 
1624   /// Return the cost of instructions in an inloop reduction pattern, if I is
1625   /// part of that pattern.
1626   Optional<InstructionCost>
1627   getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
1628                           TTI::TargetCostKind CostKind);
1629 
1630   /// Calculate vectorization cost of memory instruction \p I.
1631   InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
1632 
1633   /// The cost computation for scalarized memory instruction.
1634   InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
1635 
1636   /// The cost computation for interleaving group of memory instructions.
1637   InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
1638 
1639   /// The cost computation for Gather/Scatter instruction.
1640   InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
1641 
1642   /// The cost computation for widening instruction \p I with consecutive
1643   /// memory access.
1644   InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
1645 
1646   /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1647   /// Load: scalar load + broadcast.
1648   /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1649   /// element)
1650   InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
1651 
1652   /// Estimate the overhead of scalarizing an instruction. This is a
1653   /// convenience wrapper for the type-based getScalarizationOverhead API.
1654   InstructionCost getScalarizationOverhead(Instruction *I,
1655                                            ElementCount VF) const;
1656 
1657   /// Returns true if an artificially high cost for emulated masked memrefs
1658   /// should be used.
1659   bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);
1660 
1661   /// Map of scalar integer values to the smallest bitwidth they can be legally
1662   /// represented as. The vector equivalents of these values should be truncated
1663   /// to this type.
1664   MapVector<Instruction *, uint64_t> MinBWs;
1665 
1666   /// A type representing the costs for instructions if they were to be
1667   /// scalarized rather than vectorized. The entries are Instruction-Cost
1668   /// pairs.
1669   using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
1670 
  /// All BasicBlocks that are known to be present after vectorization as
  /// predicated blocks, collected per vectorization factor.
1673   DenseMap<ElementCount, SmallPtrSet<BasicBlock *, 4>>
1674       PredicatedBBsAfterVectorization;
1675 
1676   /// Records whether it is allowed to have the original scalar loop execute at
1677   /// least once. This may be needed as a fallback loop in case runtime
1678   /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or is not a multiple of the VF,
1680   /// or as a peel-loop to handle gaps in interleave-groups.
1681   /// Under optsize and when the trip count is very small we don't allow any
1682   /// iterations to execute in the scalar loop.
1683   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1684 
1685   /// All blocks of loop are to be masked to fold tail of scalar iterations.
1686   bool FoldTailByMasking = false;
1687 
1688   /// A map holding scalar costs for different vectorization factors. The
1689   /// presence of a cost for an instruction in the mapping indicates that the
1690   /// instruction will be scalarized when vectorizing with the associated
1691   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1692   DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
1693 
1694   /// Holds the instructions known to be uniform after vectorization.
1695   /// The data is collected per VF.
1696   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
1697 
1698   /// Holds the instructions known to be scalar after vectorization.
1699   /// The data is collected per VF.
1700   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
1701 
1702   /// Holds the instructions (address computations) that are forced to be
1703   /// scalarized.
1704   DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1705 
1706   /// PHINodes of the reductions that should be expanded in-loop along with
1707   /// their associated chains of reduction operations, in program order from top
  /// (PHI) to bottom.
1709   ReductionChainMap InLoopReductionChains;
1710 
1711   /// A Map of inloop reduction operations and their immediate chain operand.
1712   /// FIXME: This can be removed once reductions can be costed correctly in
1713   /// vplan. This was added to allow quick lookup to the inloop operations,
1714   /// without having to loop through InLoopReductionChains.
1715   DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
1716 
1717   /// Returns the expected difference in cost from scalarizing the expression
1718   /// feeding a predicated instruction \p PredInst. The instructions to
1719   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1720   /// non-negative return value implies the expression will be scalarized.
1721   /// Currently, only single-use chains are considered for scalarization.
1722   int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1723                               ElementCount VF);
1724 
1725   /// Collect the instructions that are uniform after vectorization. An
1726   /// instruction is uniform if we represent it with a single scalar value in
1727   /// the vectorized loop corresponding to each vector iteration. Examples of
1728   /// uniform instructions include pointer operands of consecutive or
1729   /// interleaved memory accesses. Note that although uniformity implies an
1730   /// instruction will be scalar, the reverse is not true. In general, a
1731   /// scalarized instruction will be represented by VF scalar values in the
1732   /// vectorized loop, each corresponding to an iteration of the original
1733   /// scalar loop.
1734   void collectLoopUniforms(ElementCount VF);
1735 
1736   /// Collect the instructions that are scalar after vectorization. An
1737   /// instruction is scalar if it is known to be uniform or will be scalarized
1738   /// during vectorization. collectLoopScalars should only add non-uniform nodes
1739   /// to the list if they are used by a load/store instruction that is marked as
1740   /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
1741   /// VF values in the vectorized loop, each corresponding to an iteration of
1742   /// the original scalar loop.
1743   void collectLoopScalars(ElementCount VF);
1744 
1745   /// Keeps cost model vectorization decision and cost for instructions.
1746   /// Right now it is used for memory instructions only.
1747   using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
1748                                 std::pair<InstWidening, InstructionCost>>;
1749 
1750   DecisionList WideningDecisions;
1751 
1752   /// Returns true if \p V is expected to be vectorized and it needs to be
1753   /// extracted.
1754   bool needsExtract(Value *V, ElementCount VF) const {
1755     Instruction *I = dyn_cast<Instruction>(V);
1756     if (VF.isScalar() || !I || !TheLoop->contains(I) ||
1757         TheLoop->isLoopInvariant(I))
1758       return false;
1759 
1760     // Assume we can vectorize V (and hence we need extraction) if the
1761     // scalars are not computed yet. This can happen, because it is called
1762     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1763     // the scalars are collected. That should be a safe assumption in most
1764     // cases, because we check if the operands have vectorizable types
1765     // beforehand in LoopVectorizationLegality.
1766     return Scalars.find(VF) == Scalars.end() ||
1767            !isScalarAfterVectorization(I, VF);
1768   };
1769 
1770   /// Returns a range containing only operands needing to be extracted.
1771   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1772                                                    ElementCount VF) const {
1773     return SmallVector<Value *, 4>(make_filter_range(
1774         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1775   }
1776 
1777   /// Determines if we have the infrastructure to vectorize loop \p L and its
1778   /// epilogue, assuming the main loop is vectorized by \p VF.
1779   bool isCandidateForEpilogueVectorization(const Loop &L,
1780                                            const ElementCount VF) const;
1781 
1782   /// Returns true if epilogue vectorization is considered profitable, and
1783   /// false otherwise.
1784   /// \p VF is the vectorization factor chosen for the original loop.
1785   bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
1786 
1787 public:
1788   /// The loop that we evaluate.
1789   Loop *TheLoop;
1790 
1791   /// Predicated scalar evolution analysis.
1792   PredicatedScalarEvolution &PSE;
1793 
1794   /// Loop Info analysis.
1795   LoopInfo *LI;
1796 
1797   /// Vectorization legality.
1798   LoopVectorizationLegality *Legal;
1799 
1800   /// Vector target information.
1801   const TargetTransformInfo &TTI;
1802 
1803   /// Target Library Info.
1804   const TargetLibraryInfo *TLI;
1805 
1806   /// Demanded bits analysis.
1807   DemandedBits *DB;
1808 
1809   /// Assumption cache.
1810   AssumptionCache *AC;
1811 
1812   /// Interface to emit optimization remarks.
1813   OptimizationRemarkEmitter *ORE;
1814 
1815   const Function *TheFunction;
1816 
1817   /// Loop Vectorize Hint.
1818   const LoopVectorizeHints *Hints;
1819 
  /// The interleaved access information contains groups of interleaved
  /// accesses with the same stride that are close to each other.
1822   InterleavedAccessInfo &InterleaveInfo;
1823 
1824   /// Values to ignore in the cost model.
1825   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1826 
1827   /// Values to ignore in the cost model when VF > 1.
1828   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1829 
1830   /// All element types found in the loop.
1831   SmallPtrSet<Type *, 16> ElementTypesInLoop;
1832 
1833   /// Profitable vector factors.
1834   SmallVector<VectorizationFactor, 8> ProfitableVFs;
1835 };
1836 } // end namespace llvm
1837 
1838 /// Helper struct to manage generating runtime checks for vectorization.
1839 ///
/// The runtime checks are created up-front in temporary blocks, un-linked from
/// the existing IR, to allow accurately estimating their cost. After deciding to
1842 /// vectorize, the checks are moved back. If deciding not to vectorize, the
1843 /// temporary blocks are completely removed.
1844 class GeneratedRTChecks {
1845   /// Basic block which contains the generated SCEV checks, if any.
1846   BasicBlock *SCEVCheckBlock = nullptr;
1847 
1848   /// The value representing the result of the generated SCEV checks. If it is
1849   /// nullptr, either no SCEV checks have been generated or they have been used.
1850   Value *SCEVCheckCond = nullptr;
1851 
1852   /// Basic block which contains the generated memory runtime checks, if any.
1853   BasicBlock *MemCheckBlock = nullptr;
1854 
1855   /// The value representing the result of the generated memory runtime checks.
1856   /// If it is nullptr, either no memory runtime checks have been generated or
1857   /// they have been used.
1858   Value *MemRuntimeCheckCond = nullptr;
1859 
1860   DominatorTree *DT;
1861   LoopInfo *LI;
1862   TargetTransformInfo *TTI;
1863 
1864   SCEVExpander SCEVExp;
1865   SCEVExpander MemCheckExp;
1866 
1867   bool CostTooHigh = false;
1868 
1869 public:
1870   GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
1871                     TargetTransformInfo *TTI, const DataLayout &DL)
1872       : DT(DT), LI(LI), TTI(TTI), SCEVExp(SE, DL, "scev.check"),
1873         MemCheckExp(SE, DL, "scev.check") {}
1874 
1875   /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
1876   /// accurately estimate the cost of the runtime checks. The blocks are
  /// un-linked from the IR and are added back during vector code generation. If
1878   /// there is no vector code generation, the check blocks are removed
1879   /// completely.
1880   void Create(Loop *L, const LoopAccessInfo &LAI,
1881               const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) {
1882 
1883     // Hard cutoff to limit compile-time increase in case a very large number of
1884     // runtime checks needs to be generated.
1885     // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to
1886     // profile info.
1887     CostTooHigh =
1888         LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold;
1889     if (CostTooHigh)
1890       return;
1891 
1892     BasicBlock *LoopHeader = L->getHeader();
1893     BasicBlock *Preheader = L->getLoopPreheader();
1894 
1895     // Use SplitBlock to create blocks for SCEV & memory runtime checks to
1896     // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
1897     // may be used by SCEVExpander. The blocks will be un-linked from their
1898     // predecessors and removed from LI & DT at the end of the function.
1899     if (!UnionPred.isAlwaysTrue()) {
1900       SCEVCheckBlock = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI,
1901                                   nullptr, "vector.scevcheck");
1902 
1903       SCEVCheckCond = SCEVExp.expandCodeForPredicate(
1904           &UnionPred, SCEVCheckBlock->getTerminator());
1905     }
1906 
1907     const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
1908     if (RtPtrChecking.Need) {
1909       auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1910       MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
1911                                  "vector.memcheck");
1912 
1913       auto DiffChecks = RtPtrChecking.getDiffChecks();
1914       if (DiffChecks) {
1915         Value *RuntimeVF = nullptr;
1916         MemRuntimeCheckCond = addDiffRuntimeChecks(
1917             MemCheckBlock->getTerminator(), L, *DiffChecks, MemCheckExp,
1918             [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {
1919               if (!RuntimeVF)
1920                 RuntimeVF = getRuntimeVF(B, B.getIntNTy(Bits), VF);
1921               return RuntimeVF;
1922             },
1923             IC);
1924       } else {
1925         MemRuntimeCheckCond =
1926             addRuntimeChecks(MemCheckBlock->getTerminator(), L,
1927                              RtPtrChecking.getChecks(), MemCheckExp);
1928       }
1929       assert(MemRuntimeCheckCond &&
1930              "no RT checks generated although RtPtrChecking "
1931              "claimed checks are required");
1932     }
1933 
1934     if (!MemCheckBlock && !SCEVCheckBlock)
1935       return;
1936 
1937     // Unhook the temporary block with the checks, update various places
1938     // accordingly.
1939     if (SCEVCheckBlock)
1940       SCEVCheckBlock->replaceAllUsesWith(Preheader);
1941     if (MemCheckBlock)
1942       MemCheckBlock->replaceAllUsesWith(Preheader);
1943 
1944     if (SCEVCheckBlock) {
1945       SCEVCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1946       new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
1947       Preheader->getTerminator()->eraseFromParent();
1948     }
1949     if (MemCheckBlock) {
1950       MemCheckBlock->getTerminator()->moveBefore(Preheader->getTerminator());
1951       new UnreachableInst(Preheader->getContext(), MemCheckBlock);
1952       Preheader->getTerminator()->eraseFromParent();
1953     }
1954 
1955     DT->changeImmediateDominator(LoopHeader, Preheader);
1956     if (MemCheckBlock) {
1957       DT->eraseNode(MemCheckBlock);
1958       LI->removeBlock(MemCheckBlock);
1959     }
1960     if (SCEVCheckBlock) {
1961       DT->eraseNode(SCEVCheckBlock);
1962       LI->removeBlock(SCEVCheckBlock);
1963     }
1964   }
1965 
1966   InstructionCost getCost() {
1967     if (SCEVCheckBlock || MemCheckBlock)
1968       LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");
1969 
1970     if (CostTooHigh) {
1971       InstructionCost Cost;
1972       Cost.setInvalid();
1973       LLVM_DEBUG(dbgs() << "  number of checks exceeded threshold\n");
1974       return Cost;
1975     }
1976 
1977     InstructionCost RTCheckCost = 0;
1978     if (SCEVCheckBlock)
1979       for (Instruction &I : *SCEVCheckBlock) {
1980         if (SCEVCheckBlock->getTerminator() == &I)
1981           continue;
1982         InstructionCost C =
1983             TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
1984         LLVM_DEBUG(dbgs() << "  " << C << "  for " << I << "\n");
1985         RTCheckCost += C;
1986       }
1987     if (MemCheckBlock)
1988       for (Instruction &I : *MemCheckBlock) {
1989         if (MemCheckBlock->getTerminator() == &I)
1990           continue;
1991         InstructionCost C =
1992             TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
1993         LLVM_DEBUG(dbgs() << "  " << C << "  for " << I << "\n");
1994         RTCheckCost += C;
1995       }
1996 
1997     if (SCEVCheckBlock || MemCheckBlock)
1998       LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
1999                         << "\n");
2000 
2001     return RTCheckCost;
2002   }
2003 
2004   /// Remove the created SCEV & memory runtime check blocks & instructions, if
2005   /// unused.
2006   ~GeneratedRTChecks() {
2007     SCEVExpanderCleaner SCEVCleaner(SCEVExp);
2008     SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
2009     if (!SCEVCheckCond)
2010       SCEVCleaner.markResultUsed();
2011 
2012     if (!MemRuntimeCheckCond)
2013       MemCheckCleaner.markResultUsed();
2014 
2015     if (MemRuntimeCheckCond) {
2016       auto &SE = *MemCheckExp.getSE();
2017       // Memory runtime check generation creates compares that use expanded
2018       // values. Remove them before running the SCEVExpanderCleaners.
2019       for (auto &I : make_early_inc_range(reverse(*MemCheckBlock))) {
2020         if (MemCheckExp.isInsertedInstruction(&I))
2021           continue;
2022         SE.forgetValue(&I);
2023         I.eraseFromParent();
2024       }
2025     }
2026     MemCheckCleaner.cleanup();
2027     SCEVCleaner.cleanup();
2028 
2029     if (SCEVCheckCond)
2030       SCEVCheckBlock->eraseFromParent();
2031     if (MemRuntimeCheckCond)
2032       MemCheckBlock->eraseFromParent();
2033   }
2034 
2035   /// Adds the generated SCEVCheckBlock before \p LoopVectorPreHeader and
2036   /// adjusts the branches to branch to the vector preheader or \p Bypass,
2037   /// depending on the generated condition.
2038   BasicBlock *emitSCEVChecks(BasicBlock *Bypass,
2039                              BasicBlock *LoopVectorPreHeader,
2040                              BasicBlock *LoopExitBlock) {
2041     if (!SCEVCheckCond)
2042       return nullptr;
2043 
2044     Value *Cond = SCEVCheckCond;
2045     // Mark the check as used, to prevent it from being removed during cleanup.
2046     SCEVCheckCond = nullptr;
2047     if (auto *C = dyn_cast<ConstantInt>(Cond))
2048       if (C->isZero())
2049         return nullptr;
2050 
2051     auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2052 
2053     BranchInst::Create(LoopVectorPreHeader, SCEVCheckBlock);
2054     // Create new preheader for vector loop.
2055     if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2056       PL->addBasicBlockToLoop(SCEVCheckBlock, *LI);
2057 
2058     SCEVCheckBlock->getTerminator()->eraseFromParent();
2059     SCEVCheckBlock->moveBefore(LoopVectorPreHeader);
2060     Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2061                                                 SCEVCheckBlock);
2062 
2063     DT->addNewBlock(SCEVCheckBlock, Pred);
2064     DT->changeImmediateDominator(LoopVectorPreHeader, SCEVCheckBlock);
2065 
2066     ReplaceInstWithInst(SCEVCheckBlock->getTerminator(),
2067                         BranchInst::Create(Bypass, LoopVectorPreHeader, Cond));
2068     return SCEVCheckBlock;
2069   }
2070 
2071   /// Adds the generated MemCheckBlock before \p LoopVectorPreHeader and adjusts
2072   /// the branches to branch to the vector preheader or \p Bypass, depending on
2073   /// the generated condition.
2074   BasicBlock *emitMemRuntimeChecks(BasicBlock *Bypass,
2075                                    BasicBlock *LoopVectorPreHeader) {
2076     // Check if we generated code that checks in runtime if arrays overlap.
2077     if (!MemRuntimeCheckCond)
2078       return nullptr;
2079 
2080     auto *Pred = LoopVectorPreHeader->getSinglePredecessor();
2081     Pred->getTerminator()->replaceSuccessorWith(LoopVectorPreHeader,
2082                                                 MemCheckBlock);
2083 
2084     DT->addNewBlock(MemCheckBlock, Pred);
2085     DT->changeImmediateDominator(LoopVectorPreHeader, MemCheckBlock);
2086     MemCheckBlock->moveBefore(LoopVectorPreHeader);
2087 
2088     if (auto *PL = LI->getLoopFor(LoopVectorPreHeader))
2089       PL->addBasicBlockToLoop(MemCheckBlock, *LI);
2090 
2091     ReplaceInstWithInst(
2092         MemCheckBlock->getTerminator(),
2093         BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond));
2094     MemCheckBlock->getTerminator()->setDebugLoc(
2095         Pred->getTerminator()->getDebugLoc());
2096 
2097     // Mark the check as used, to prevent it from being removed during cleanup.
2098     MemRuntimeCheckCond = nullptr;
2099     return MemCheckBlock;
2100   }
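
  // Note: once emitSCEVChecks and emitMemRuntimeChecks have both run, control
  // flow is roughly:
  //   ... -> vector.scevcheck -> vector.memcheck -> vector preheader
  // with each check block also branching to the bypass block (the scalar loop)
  // when its condition is true.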
2101 };
2102 
2103 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
2104 // vectorization. The loop needs to be annotated with #pragma omp simd
// simdlen(#) or #pragma clang loop vectorize(enable) vectorize_width(#). If the
2106 // vector length information is not provided, vectorization is not considered
2107 // explicit. Interleave hints are not allowed either. These limitations will be
2108 // relaxed in the future.
// Please note that we are currently forced to abuse the pragma 'clang
2110 // vectorize' semantics. This pragma provides *auto-vectorization hints*
2111 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2112 // provides *explicit vectorization hints* (LV can bypass legal checks and
2113 // assume that vectorization is legal). However, both hints are implemented
2114 // using the same metadata (llvm.loop.vectorize, processed by
2115 // LoopVectorizeHints). This will be fixed in the future when the native IR
2116 // representation for pragma 'omp simd' is introduced.
2117 static bool isExplicitVecOuterLoop(Loop *OuterLp,
2118                                    OptimizationRemarkEmitter *ORE) {
2119   assert(!OuterLp->isInnermost() && "This is not an outer loop");
2120   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2121 
2122   // Only outer loops with an explicit vectorization hint are supported.
2123   // Unannotated outer loops are ignored.
2124   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
2125     return false;
2126 
2127   Function *Fn = OuterLp->getHeader()->getParent();
2128   if (!Hints.allowVectorization(Fn, OuterLp,
2129                                 true /*VectorizeOnlyWhenForced*/)) {
2130     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2131     return false;
2132   }
2133 
2134   if (Hints.getInterleave() > 1) {
2135     // TODO: Interleave support is future work.
2136     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2137                          "outer loops.\n");
2138     Hints.emitRemarkWithHints();
2139     return false;
2140   }
2141 
2142   return true;
2143 }
2144 
2145 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
2146                                   OptimizationRemarkEmitter *ORE,
2147                                   SmallVectorImpl<Loop *> &V) {
2148   // Collect inner loops and outer loops without irreducible control flow. For
2149   // now, only collect outer loops that have explicit vectorization hints. If we
2150   // are stress testing the VPlan H-CFG construction, we collect the outermost
2151   // loop of every loop nest.
2152   if (L.isInnermost() || VPlanBuildStressTest ||
2153       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
2154     LoopBlocksRPO RPOT(&L);
2155     RPOT.perform(LI);
2156     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
2157       V.push_back(&L);
2158       // TODO: Collect inner loops inside marked outer loops in case
2159       // vectorization fails for the outer loop. Do not invoke
2160       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2161       // already known to be reducible. We can use an inherited attribute for
2162       // that.
2163       return;
2164     }
2165   }
2166   for (Loop *InnerL : L)
2167     collectSupportedLoops(*InnerL, LI, ORE, V);
2168 }
2169 
2170 namespace {
2171 
2172 /// The LoopVectorize Pass.
2173 struct LoopVectorize : public FunctionPass {
2174   /// Pass identification, replacement for typeid
2175   static char ID;
2176 
2177   LoopVectorizePass Impl;
2178 
2179   explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
2180                          bool VectorizeOnlyWhenForced = false)
2181       : FunctionPass(ID),
2182         Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
2183     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
2184   }
2185 
2186   bool runOnFunction(Function &F) override {
2187     if (skipFunction(F))
2188       return false;
2189 
2190     auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
2191     auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
2192     auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
2193     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
2194     auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
2195     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
2196     auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
2197     auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
2198     auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
2199     auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
2200     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
2201     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
2202     auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
2203 
2204     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
2205         [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
2206 
2207     return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
2208                         GetLAA, *ORE, PSI).MadeAnyChange;
2209   }
2210 
2211   void getAnalysisUsage(AnalysisUsage &AU) const override {
2212     AU.addRequired<AssumptionCacheTracker>();
2213     AU.addRequired<BlockFrequencyInfoWrapperPass>();
2214     AU.addRequired<DominatorTreeWrapperPass>();
2215     AU.addRequired<LoopInfoWrapperPass>();
2216     AU.addRequired<ScalarEvolutionWrapperPass>();
2217     AU.addRequired<TargetTransformInfoWrapperPass>();
2218     AU.addRequired<AAResultsWrapperPass>();
2219     AU.addRequired<LoopAccessLegacyAnalysis>();
2220     AU.addRequired<DemandedBitsWrapperPass>();
2221     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
2222     AU.addRequired<InjectTLIMappingsLegacy>();
2223 
2224     // We currently do not preserve loopinfo/dominator analyses with outer loop
2225     // vectorization. Until this is addressed, mark these analyses as preserved
2226     // only for non-VPlan-native path.
2227     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
2228     if (!EnableVPlanNativePath) {
2229       AU.addPreserved<LoopInfoWrapperPass>();
2230       AU.addPreserved<DominatorTreeWrapperPass>();
2231     }
2232 
2233     AU.addPreserved<BasicAAWrapperPass>();
2234     AU.addPreserved<GlobalsAAWrapperPass>();
2235     AU.addRequired<ProfileSummaryInfoWrapperPass>();
2236   }
2237 };
2238 
2239 } // end anonymous namespace
2240 
2241 //===----------------------------------------------------------------------===//
2242 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2243 // LoopVectorizationCostModel and LoopVectorizationPlanner.
2244 //===----------------------------------------------------------------------===//
2245 
2246 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
2247   // We need to place the broadcast of invariant variables outside the loop,
  // but only if it's proven safe to do so. Otherwise, the broadcast will be
  // placed inside the vector loop body.
2250   Instruction *Instr = dyn_cast<Instruction>(V);
2251   bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
2252                      (!Instr ||
2253                       DT->dominates(Instr->getParent(), LoopVectorPreHeader));
2254   // Place the code for broadcasting invariant variables in the new preheader.
2255   IRBuilder<>::InsertPointGuard Guard(Builder);
2256   if (SafeToHoist)
2257     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
2258 
2259   // Broadcast the scalar into all locations in the vector.
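  // For a fixed VF of 4 and an i32 scalar %v, the splat typically expands to
  // IR along the lines of:
  //   %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %v, i64 0
  //   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert,
  //                                    <4 x i32> poison, <4 x i32> zeroinitializer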
2260   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
2261 
2262   return Shuf;
2263 }
2264 
2265 /// This function adds
2266 /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
/// element-wise to the vector \p Val. The sequence starts at StartIdx.
/// \p BinOp is only relevant for FP induction variables.
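/// For example, with VF = 4, StartIdx = 0, an integer Step %s and Val being a
/// splat of %x, the result is <%x + 0*%s, %x + 1*%s, %x + 2*%s, %x + 3*%s>.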
2269 static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step,
2270                             Instruction::BinaryOps BinOp, ElementCount VF,
2271                             IRBuilderBase &Builder) {
2272   assert(VF.isVector() && "only vector VFs are supported");
2273 
2274   // Create and check the types.
2275   auto *ValVTy = cast<VectorType>(Val->getType());
2276   ElementCount VLen = ValVTy->getElementCount();
2277 
2278   Type *STy = Val->getType()->getScalarType();
2279   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
2280          "Induction Step must be an integer or FP");
2281   assert(Step->getType() == STy && "Step has wrong type");
2282 
2283   SmallVector<Constant *, 8> Indices;
2284 
  // Create a vector of consecutive numbers from zero to VF - 1.
2286   VectorType *InitVecValVTy = ValVTy;
2287   if (STy->isFloatingPointTy()) {
2288     Type *InitVecValSTy =
2289         IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
2290     InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
2291   }
2292   Value *InitVec = Builder.CreateStepVector(InitVecValVTy);
2293 
2294   // Splat the StartIdx
2295   Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx);
2296 
2297   if (STy->isIntegerTy()) {
2298     InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
2299     Step = Builder.CreateVectorSplat(VLen, Step);
2300     assert(Step->getType() == Val->getType() && "Invalid step vec");
2301     // FIXME: The newly created binary instructions should contain nsw/nuw
2302     // flags, which can be found from the original scalar operations.
2303     Step = Builder.CreateMul(InitVec, Step);
2304     return Builder.CreateAdd(Val, Step, "induction");
2305   }
2306 
2307   // Floating point induction.
2308   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
2309          "Binary Opcode should be specified for FP induction");
2310   InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
2311   InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat);
2312 
2313   Step = Builder.CreateVectorSplat(VLen, Step);
2314   Value *MulOp = Builder.CreateFMul(InitVec, Step);
2315   return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
2316 }
2317 
2318 /// Compute scalar induction steps. \p ScalarIV is the scalar induction
2319 /// variable on which to base the steps, \p Step is the size of the step.
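/// For example, with a fixed VF of 4 and UF = 2, the values generated for
/// unroll part 1 are ScalarIV + (4 + Lane) * Step for Lane = 0..3 (only
/// Lane 0 when just the first lane of \p Def is used).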
2320 static void buildScalarSteps(Value *ScalarIV, Value *Step,
2321                              const InductionDescriptor &ID, VPValue *Def,
2322                              VPTransformState &State) {
2323   IRBuilderBase &Builder = State.Builder;
2324   // We shouldn't have to build scalar steps if we aren't vectorizing.
2325   assert(State.VF.isVector() && "VF should be greater than one");
  // Get the value type and ensure it and the step have the same type.
2327   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
2328   assert(ScalarIVTy == Step->getType() &&
2329          "Val and Step should have the same type");
2330 
2331   // We build scalar steps for both integer and floating-point induction
2332   // variables. Here, we determine the kind of arithmetic we will perform.
2333   Instruction::BinaryOps AddOp;
2334   Instruction::BinaryOps MulOp;
2335   if (ScalarIVTy->isIntegerTy()) {
2336     AddOp = Instruction::Add;
2337     MulOp = Instruction::Mul;
2338   } else {
2339     AddOp = ID.getInductionOpcode();
2340     MulOp = Instruction::FMul;
2341   }
2342 
2343   // Determine the number of scalars we need to generate for each unroll
2344   // iteration.
2345   bool FirstLaneOnly = vputils::onlyFirstLaneUsed(Def);
2346   unsigned Lanes = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
2347   // Compute the scalar steps and save the results in State.
2348   Type *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
2349                                      ScalarIVTy->getScalarSizeInBits());
2350   Type *VecIVTy = nullptr;
2351   Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
2352   if (!FirstLaneOnly && State.VF.isScalable()) {
2353     VecIVTy = VectorType::get(ScalarIVTy, State.VF);
2354     UnitStepVec =
2355         Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF));
2356     SplatStep = Builder.CreateVectorSplat(State.VF, Step);
2357     SplatIV = Builder.CreateVectorSplat(State.VF, ScalarIV);
2358   }
2359 
2360   for (unsigned Part = 0; Part < State.UF; ++Part) {
2361     Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part);
2362 
2363     if (!FirstLaneOnly && State.VF.isScalable()) {
2364       auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
2365       auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
2366       if (ScalarIVTy->isFloatingPointTy())
2367         InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
2368       auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
2369       auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
2370       State.set(Def, Add, Part);
2371       // It's useful to record the lane values too for the known minimum number
2372       // of elements so we do those below. This improves the code quality when
2373       // trying to extract the first element, for example.
2374     }
2375 
2376     if (ScalarIVTy->isFloatingPointTy())
2377       StartIdx0 = Builder.CreateSIToFP(StartIdx0, ScalarIVTy);
2378 
2379     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
2380       Value *StartIdx = Builder.CreateBinOp(
2381           AddOp, StartIdx0, getSignedIntOrFpConstant(ScalarIVTy, Lane));
2382       // The step returned by `createStepForVF` is a runtime-evaluated value
2383       // when VF is scalable. Otherwise, it should be folded into a Constant.
2384       assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
2385              "Expected StartIdx to be folded to a constant when VF is not "
2386              "scalable");
2387       auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
2388       auto *Add = Builder.CreateBinOp(AddOp, ScalarIV, Mul);
2389       State.set(Def, Add, VPIteration(Part, Lane));
2390     }
2391   }
2392 }
2393 
// Generate code for the induction step. Note that induction steps are
// required to be loop-invariant.
2396 static Value *CreateStepValue(const SCEV *Step, ScalarEvolution &SE,
2397                               Instruction *InsertBefore,
2398                               Loop *OrigLoop = nullptr) {
2399   const DataLayout &DL = SE.getDataLayout();
2400   assert((!OrigLoop || SE.isLoopInvariant(Step, OrigLoop)) &&
2401          "Induction step should be loop invariant");
2402   if (auto *E = dyn_cast<SCEVUnknown>(Step))
2403     return E->getValue();
2404 
2405   SCEVExpander Exp(SE, DL, "induction");
2406   return Exp.expandCodeFor(Step, Step->getType(), InsertBefore);
2407 }
2408 
2409 /// Compute the transformed value of Index at offset StartValue using step
2410 /// StepValue.
2411 /// For integer induction, returns StartValue + Index * StepValue.
2412 /// For pointer induction, returns StartValue[Index * StepValue].
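/// For example (illustrative values only), an integer induction with
/// StartValue 7 and StepValue 3 queried at Index 4 yields 7 + 4 * 3 = 19,
/// while a pointer induction instead produces a GEP of 4 * 3 elements from
/// StartValue.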
2413 /// FIXME: The newly created binary instructions should contain nsw/nuw
2414 /// flags, which can be found from the original scalar operations.
2415 static Value *emitTransformedIndex(IRBuilderBase &B, Value *Index,
2416                                    Value *StartValue, Value *Step,
2417                                    const InductionDescriptor &ID) {
2418   assert(Index->getType()->getScalarType() == Step->getType() &&
2419          "Index scalar type does not match StepValue type");
2420 
  // Note: the IR at this point is broken. We cannot use SE to create any new
  // SCEV and then expand it, hoping that SCEV's simplification will give us
  // more optimal code. Unfortunately, attempting to do so on invalid IR may
  // lead to various SCEV crashes. So all we can do is use the builder and
  // rely on InstCombine for future simplifications. Here we handle some
  // trivial cases only.
2427   auto CreateAdd = [&B](Value *X, Value *Y) {
2428     assert(X->getType() == Y->getType() && "Types don't match!");
2429     if (auto *CX = dyn_cast<ConstantInt>(X))
2430       if (CX->isZero())
2431         return Y;
2432     if (auto *CY = dyn_cast<ConstantInt>(Y))
2433       if (CY->isZero())
2434         return X;
2435     return B.CreateAdd(X, Y);
2436   };
2437 
2438   // We allow X to be a vector type, in which case Y will potentially be
2439   // splatted into a vector with the same element count.
2440   auto CreateMul = [&B](Value *X, Value *Y) {
2441     assert(X->getType()->getScalarType() == Y->getType() &&
2442            "Types don't match!");
2443     if (auto *CX = dyn_cast<ConstantInt>(X))
2444       if (CX->isOne())
2445         return Y;
2446     if (auto *CY = dyn_cast<ConstantInt>(Y))
2447       if (CY->isOne())
2448         return X;
2449     VectorType *XVTy = dyn_cast<VectorType>(X->getType());
2450     if (XVTy && !isa<VectorType>(Y->getType()))
2451       Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
2452     return B.CreateMul(X, Y);
2453   };
2454 
2455   switch (ID.getKind()) {
2456   case InductionDescriptor::IK_IntInduction: {
2457     assert(!isa<VectorType>(Index->getType()) &&
2458            "Vector indices not supported for integer inductions yet");
2459     assert(Index->getType() == StartValue->getType() &&
2460            "Index type does not match StartValue type");
2461     if (isa<ConstantInt>(Step) && cast<ConstantInt>(Step)->isMinusOne())
2462       return B.CreateSub(StartValue, Index);
2463     auto *Offset = CreateMul(Index, Step);
2464     return CreateAdd(StartValue, Offset);
2465   }
2466   case InductionDescriptor::IK_PtrInduction: {
2467     assert(isa<Constant>(Step) &&
2468            "Expected constant step for pointer induction");
2469     return B.CreateGEP(ID.getElementType(), StartValue, CreateMul(Index, Step));
2470   }
2471   case InductionDescriptor::IK_FpInduction: {
2472     assert(!isa<VectorType>(Index->getType()) &&
2473            "Vector indices not supported for FP inductions yet");
2474     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2475     auto InductionBinOp = ID.getInductionBinOp();
2476     assert(InductionBinOp &&
2477            (InductionBinOp->getOpcode() == Instruction::FAdd ||
2478             InductionBinOp->getOpcode() == Instruction::FSub) &&
2479            "Original bin op should be defined for FP induction");
2480 
2481     Value *MulExp = B.CreateFMul(Step, Index);
2482     return B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2483                          "induction");
2484   }
2485   case InductionDescriptor::IK_NoInduction:
2486     return nullptr;
2487   }
2488   llvm_unreachable("invalid enum");
2489 }
2490 
2491 void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def,
2492                                                     const VPIteration &Instance,
2493                                                     VPTransformState &State) {
2494   Value *ScalarInst = State.get(Def, Instance);
2495   Value *VectorValue = State.get(Def, Instance.Part);
2496   VectorValue = Builder.CreateInsertElement(
2497       VectorValue, ScalarInst,
2498       Instance.Lane.getAsRuntimeExpr(State.Builder, VF));
2499   State.set(Def, VectorValue, Instance.Part);
2500 }
2501 
2502 // Return whether we allow using masked interleave-groups (for dealing with
2503 // strided loads/stores that reside in predicated blocks, or for dealing
2504 // with gaps).
2505 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2506   // If an override option has been passed in for interleaved accesses, use it.
2507   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2508     return EnableMaskedInterleavedMemAccesses;
2509 
2510   return TTI.enableMaskedInterleavedAccessVectorization();
2511 }
2512 
2513 // Try to vectorize the interleave group that \p Instr belongs to.
2514 //
// E.g. Translate the following interleaved load group (factor = 3):
2516 //   for (i = 0; i < N; i+=3) {
2517 //     R = Pic[i];             // Member of index 0
2518 //     G = Pic[i+1];           // Member of index 1
2519 //     B = Pic[i+2];           // Member of index 2
2520 //     ... // do something to R, G, B
2521 //   }
2522 // To:
2523 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2524 //   %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9>   ; R elements
2525 //   %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10>  ; G elements
2526 //   %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11>  ; B elements
2527 //
// Or translate the following interleaved store group (factor = 3):
2529 //   for (i = 0; i < N; i+=3) {
2530 //     ... do something to R, G, B
2531 //     Pic[i]   = R;           // Member of index 0
2532 //     Pic[i+1] = G;           // Member of index 1
2533 //     Pic[i+2] = B;           // Member of index 2
2534 //   }
2535 // To:
2536 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2537 //   %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
2538 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2539 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2540 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2541 void InnerLoopVectorizer::vectorizeInterleaveGroup(
2542     const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs,
2543     VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
2544     VPValue *BlockInMask) {
2545   Instruction *Instr = Group->getInsertPos();
2546   const DataLayout &DL = Instr->getModule()->getDataLayout();
2547 
  // Prepare the vector type for the interleaved load/store.
2549   Type *ScalarTy = getLoadStoreType(Instr);
2550   unsigned InterleaveFactor = Group->getFactor();
2551   assert(!VF.isScalable() && "scalable vectors not yet supported.");
2552   auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
2553 
  // Prepare the new pointers.
2555   SmallVector<Value *, 2> AddrParts;
2556   unsigned Index = Group->getIndex(Instr);
2557 
2558   // TODO: extend the masked interleaved-group support to reversed access.
2559   assert((!BlockInMask || !Group->isReverse()) &&
2560          "Reversed masked interleave-group not supported.");
2561 
  // If the group is reversed, adjust the index to refer to the last vector
  // lane instead of the first. We adjust the index from the first vector lane,
2564   // rather than directly getting the pointer for lane VF - 1, because the
2565   // pointer operand of the interleaved access is supposed to be uniform. For
2566   // uniform instructions, we're only required to generate a value for the
2567   // first vector lane in each unroll iteration.
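  // E.g. (illustrative) with VF = 4 and an interleave factor of 3, a reversed
  // group adds (4 - 1) * 3 = 9 to the index, so the negated offset applied
  // below reaches back to the lowest address the group touches in this vector
  // iteration.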
2568   if (Group->isReverse())
2569     Index += (VF.getKnownMinValue() - 1) * Group->getFactor();
2570 
2571   for (unsigned Part = 0; Part < UF; Part++) {
2572     Value *AddrPart = State.get(Addr, VPIteration(Part, 0));
2573     State.setDebugLocFromInst(AddrPart);
2574 
    // Note that the current instruction could be a member of any index. We
    // need to adjust the address down to the member of index 0.
    //
    // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
    //       b = A[i];       // Member of index 0
    // The current pointer points to A[i+1]; adjust it to A[i].
    //
    // E.g.  A[i+1] = a;     // Member of index 1
    //       A[i]   = b;     // Member of index 0
    //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // The current pointer points to A[i+2]; adjust it to A[i].
2586 
2587     bool InBounds = false;
2588     if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2589       InBounds = gep->isInBounds();
2590     AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2591     cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2592 
2593     // Cast to the vector pointer type.
2594     unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2595     Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2596     AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2597   }
2598 
2599   State.setDebugLocFromInst(Instr);
2600   Value *PoisonVec = PoisonValue::get(VecTy);
2601 
2602   Value *MaskForGaps = nullptr;
2603   if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2604     MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2605     assert(MaskForGaps && "Mask for Gaps is required but it is null");
2606   }
2607 
2608   // Vectorize the interleaved load group.
2609   if (isa<LoadInst>(Instr)) {
2610     // For each unroll part, create a wide load for the group.
2611     SmallVector<Value *, 2> NewLoads;
2612     for (unsigned Part = 0; Part < UF; Part++) {
2613       Instruction *NewLoad;
2614       if (BlockInMask || MaskForGaps) {
2615         assert(useMaskedInterleavedAccesses(*TTI) &&
2616                "masked interleaved groups are not allowed.");
2617         Value *GroupMask = MaskForGaps;
2618         if (BlockInMask) {
2619           Value *BlockInMaskPart = State.get(BlockInMask, Part);
2620           Value *ShuffledMask = Builder.CreateShuffleVector(
2621               BlockInMaskPart,
2622               createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2623               "interleaved.mask");
2624           GroupMask = MaskForGaps
2625                           ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2626                                                 MaskForGaps)
2627                           : ShuffledMask;
2628         }
2629         NewLoad =
2630             Builder.CreateMaskedLoad(VecTy, AddrParts[Part], Group->getAlign(),
2631                                      GroupMask, PoisonVec, "wide.masked.vec");
2632       }
2633       else
2634         NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2635                                             Group->getAlign(), "wide.vec");
2636       Group->addMetadata(NewLoad);
2637       NewLoads.push_back(NewLoad);
2638     }
2639 
2640     // For each member in the group, shuffle out the appropriate data from the
2641     // wide loads.
2642     unsigned J = 0;
2643     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2644       Instruction *Member = Group->getMember(I);
2645 
2646       // Skip the gaps in the group.
2647       if (!Member)
2648         continue;
2649 
2650       auto StrideMask =
2651           createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
2652       for (unsigned Part = 0; Part < UF; Part++) {
2653         Value *StridedVec = Builder.CreateShuffleVector(
2654             NewLoads[Part], StrideMask, "strided.vec");
2655 
        // If this member has a different type, cast the result to that type.
        if (Member->getType() != ScalarTy) {
          assert(!VF.isScalable() && "VF is assumed to be non-scalable.");
2659           VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2660           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2661         }
2662 
2663         if (Group->isReverse())
2664           StridedVec = Builder.CreateVectorReverse(StridedVec, "reverse");
2665 
2666         State.set(VPDefs[J], StridedVec, Part);
2667       }
2668       ++J;
2669     }
2670     return;
2671   }
2672 
  // The subvector type for the current instruction.
2674   auto *SubVT = VectorType::get(ScalarTy, VF);
2675 
2676   // Vectorize the interleaved store group.
2677   MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
2678   assert((!MaskForGaps || useMaskedInterleavedAccesses(*TTI)) &&
2679          "masked interleaved groups are not allowed.");
2680   assert((!MaskForGaps || !VF.isScalable()) &&
2681          "masking gaps for scalable vectors is not yet supported.");
2682   for (unsigned Part = 0; Part < UF; Part++) {
2683     // Collect the stored vector from each member.
2684     SmallVector<Value *, 4> StoredVecs;
2685     for (unsigned i = 0; i < InterleaveFactor; i++) {
2686       assert((Group->getMember(i) || MaskForGaps) &&
2687              "Fail to get a member from an interleaved store group");
2688       Instruction *Member = Group->getMember(i);
2689 
2690       // Skip the gaps in the group.
2691       if (!Member) {
2692         Value *Undef = PoisonValue::get(SubVT);
2693         StoredVecs.push_back(Undef);
2694         continue;
2695       }
2696 
2697       Value *StoredVec = State.get(StoredValues[i], Part);
2698 
2699       if (Group->isReverse())
2700         StoredVec = Builder.CreateVectorReverse(StoredVec, "reverse");
2701 
      // If this member has a different type, cast it to the unified type.
2704       if (StoredVec->getType() != SubVT)
2705         StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2706 
2707       StoredVecs.push_back(StoredVec);
2708     }
2709 
2710     // Concatenate all vectors into a wide vector.
2711     Value *WideVec = concatenateVectors(Builder, StoredVecs);
2712 
2713     // Interleave the elements in the wide vector.
2714     Value *IVec = Builder.CreateShuffleVector(
2715         WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor),
2716         "interleaved.vec");
2717 
2718     Instruction *NewStoreInstr;
2719     if (BlockInMask || MaskForGaps) {
2720       Value *GroupMask = MaskForGaps;
2721       if (BlockInMask) {
2722         Value *BlockInMaskPart = State.get(BlockInMask, Part);
2723         Value *ShuffledMask = Builder.CreateShuffleVector(
2724             BlockInMaskPart,
2725             createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
2726             "interleaved.mask");
2727         GroupMask = MaskForGaps ? Builder.CreateBinOp(Instruction::And,
2728                                                       ShuffledMask, MaskForGaps)
2729                                 : ShuffledMask;
2730       }
2731       NewStoreInstr = Builder.CreateMaskedStore(IVec, AddrParts[Part],
2732                                                 Group->getAlign(), GroupMask);
2733     } else
2734       NewStoreInstr =
2735           Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
2736 
2737     Group->addMetadata(NewStoreInstr);
2738   }
2739 }
2740 
2741 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
2742                                                VPReplicateRecipe *RepRecipe,
2743                                                const VPIteration &Instance,
2744                                                bool IfPredicateInstr,
2745                                                VPTransformState &State) {
2746   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2747 
2748   // llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
2749   // the first lane and part.
2750   if (isa<NoAliasScopeDeclInst>(Instr))
2751     if (!Instance.isFirstIteration())
2752       return;
2753 
  // Does this instruction return a value?
2755   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2756 
2757   Instruction *Cloned = Instr->clone();
2758   if (!IsVoidRetTy)
2759     Cloned->setName(Instr->getName() + ".cloned");
2760 
  // If the scalarized instruction contributes to the address computation of a
  // widened masked load/store which was in a basic block that needed
  // predication and is not predicated after vectorization, we can't propagate
  // poison-generating flags (nuw/nsw, exact, inbounds, etc.). The scalarized
  // instruction could feed a poison value to the base address of the widened
  // load/store.
2767   if (State.MayGeneratePoisonRecipes.contains(RepRecipe))
2768     Cloned->dropPoisonGeneratingFlags();
2769 
2770   if (Instr->getDebugLoc())
2771     State.setDebugLocFromInst(Instr);
2772 
2773   // Replace the operands of the cloned instructions with their scalar
2774   // equivalents in the new loop.
2775   for (auto &I : enumerate(RepRecipe->operands())) {
2776     auto InputInstance = Instance;
2777     VPValue *Operand = I.value();
2778     VPReplicateRecipe *OperandR = dyn_cast<VPReplicateRecipe>(Operand);
2779     if (OperandR && OperandR->isUniform())
2780       InputInstance.Lane = VPLane::getFirstLane();
2781     Cloned->setOperand(I.index(), State.get(Operand, InputInstance));
2782   }
2783   State.addNewMetadata(Cloned, Instr);
2784 
2785   // Place the cloned scalar in the new loop.
2786   State.Builder.Insert(Cloned);
2787 
2788   State.set(RepRecipe, Cloned, Instance);
2789 
  // If we just cloned a new assumption, add it to the assumption cache.
2791   if (auto *II = dyn_cast<AssumeInst>(Cloned))
2792     AC->registerAssumption(II);
2793 
2794   // End if-block.
2795   if (IfPredicateInstr)
2796     PredicatedInstructions.push_back(Cloned);
2797 }
2798 
2799 Value *InnerLoopVectorizer::getOrCreateTripCount(BasicBlock *InsertBlock) {
2800   if (TripCount)
2801     return TripCount;
2802 
2803   assert(InsertBlock);
2804   IRBuilder<> Builder(InsertBlock->getTerminator());
2805   // Find the loop boundaries.
2806   ScalarEvolution *SE = PSE.getSE();
2807   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2808   assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) &&
2809          "Invalid loop count");
2810 
2811   Type *IdxTy = Legal->getWidestInductionType();
2812   assert(IdxTy && "No type for induction");
2813 
  // The exit count might have the type of i64 while the phi is i32. This can
  // happen if we have an induction variable that is sign extended before the
  // compare. The only way we can get a backedge-taken count at all is if the
  // induction variable was signed and as such will not overflow. In such a
  // case truncation is legal.
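  // E.g. an i32 induction variable that is sign extended to i64 for the
  // compare yields an i64 backedge-taken count, which is truncated back to
  // the widest induction type (i32) here.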
2819   if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
2820       IdxTy->getPrimitiveSizeInBits())
2821     BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2822   BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2823 
2824   // Get the total trip count from the count by adding 1.
2825   const SCEV *ExitCount = SE->getAddExpr(
2826       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
2827 
2828   const DataLayout &DL = InsertBlock->getModule()->getDataLayout();
2829 
2830   // Expand the trip count and place the new instructions in the preheader.
2831   // Notice that the pre-header does not change, only the loop body.
2832   SCEVExpander Exp(*SE, DL, "induction");
2833 
2834   // Count holds the overall loop count (N).
2835   TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2836                                 InsertBlock->getTerminator());
2837 
2838   if (TripCount->getType()->isPointerTy())
2839     TripCount =
2840         CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2841                                     InsertBlock->getTerminator());
2842 
2843   return TripCount;
2844 }
2845 
2846 Value *
2847 InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
2848   if (VectorTripCount)
2849     return VectorTripCount;
2850 
2851   Value *TC = getOrCreateTripCount(InsertBlock);
2852   IRBuilder<> Builder(InsertBlock->getTerminator());
2853 
2854   Type *Ty = TC->getType();
2855   // This is where we can make the step a runtime constant.
2856   Value *Step = createStepForVF(Builder, Ty, VF, UF);
2857 
2858   // If the tail is to be folded by masking, round the number of iterations N
2859   // up to a multiple of Step instead of rounding down. This is done by first
2860   // adding Step-1 and then rounding down. Note that it's ok if this addition
2861   // overflows: the vector induction variable will eventually wrap to zero given
2862   // that it starts at zero and its Step is a power of two; the loop will then
2863   // exit, with the last early-exit vector comparison also producing all-true.
2864   // For scalable vectors the VF is not guaranteed to be a power of 2, but this
2865   // is accounted for in emitIterationCountCheck that adds an overflow check.
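  // As a purely illustrative example, with N = 10 and VF * UF = 8, N is first
  // bumped to 10 + 7 = 17 below, and the remainder computation that follows
  // rounds this down to a vector trip count of 16, i.e. two masked wide
  // iterations.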
2866   if (Cost->foldTailByMasking()) {
2867     assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
2868            "VF*UF must be a power of 2 when folding tail by masking");
2869     Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF);
2870     TC = Builder.CreateAdd(
2871         TC, Builder.CreateSub(NumLanes, ConstantInt::get(Ty, 1)), "n.rnd.up");
2872   }
2873 
2874   // Now we need to generate the expression for the part of the loop that the
2875   // vectorized body will execute. This is equal to N - (N % Step) if scalar
2876   // iterations are not required for correctness, or N - Step, otherwise. Step
2877   // is equal to the vectorization factor (number of SIMD elements) times the
2878   // unroll factor (number of SIMD instructions).
2879   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2880 
2881   // There are cases where we *must* run at least one iteration in the remainder
2882   // loop.  See the cost model for when this can happen.  If the step evenly
2883   // divides the trip count, we set the remainder to be equal to the step. If
2884   // the step does not evenly divide the trip count, no adjustment is necessary
2885   // since there will already be scalar iterations. Note that the minimum
2886   // iterations check ensures that N >= Step.
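  // As a purely illustrative example, with N = 10 and Step = 8 the remainder
  // is 2 and the vector loop covers 8 iterations; with N = 16 and a required
  // scalar epilogue, the zero remainder is bumped to 8 so that the scalar
  // loop still executes the final 8 iterations.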
2887   if (Cost->requiresScalarEpilogue(VF)) {
2888     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2889     R = Builder.CreateSelect(IsZero, Step, R);
2890   }
2891 
2892   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2893 
2894   return VectorTripCount;
2895 }
2896 
2897 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2898                                                    const DataLayout &DL) {
  // Verify that V is a vector type with the same number of elements as DstVTy.
2900   auto *DstFVTy = cast<FixedVectorType>(DstVTy);
2901   unsigned VF = DstFVTy->getNumElements();
2902   auto *SrcVecTy = cast<FixedVectorType>(V->getType());
  assert((VF == SrcVecTy->getNumElements()) &&
         "Vector dimensions do not match");
2904   Type *SrcElemTy = SrcVecTy->getElementType();
2905   Type *DstElemTy = DstFVTy->getElementType();
2906   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2907          "Vector elements must have same size");
2908 
2909   // Do a direct cast if element types are castable.
2910   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2911     return Builder.CreateBitOrPointerCast(V, DstFVTy);
2912   }
  // V cannot be directly cast to the desired vector type.
  // This may happen when V is a floating point vector but DstVTy is a vector
  // of pointers or vice-versa. Handle this using a two-step bitcast with an
  // intermediate integer type, i.e. Ptr <-> Int <-> Float.
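  // E.g. (illustrative, assuming 32-bit pointers so the element sizes match
  // as asserted above) a <4 x float> value destined for a vector-of-pointers
  // type is first bitcast to <4 x i32> and then cast to the pointer type.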
2917   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2918          "Only one type should be a pointer type");
2919   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2920          "Only one type should be a floating point type");
2921   Type *IntTy =
2922       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2923   auto *VecIntTy = FixedVectorType::get(IntTy, VF);
2924   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2925   return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
2926 }
2927 
2928 void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
2929   Value *Count = getOrCreateTripCount(LoopVectorPreHeader);
  // Reuse the existing vector loop preheader for the TC checks.
  // Note that a new preheader block is generated for the vector loop.
2932   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2933   IRBuilder<> Builder(TCCheckBlock->getTerminator());
2934 
2935   // Generate code to check if the loop's trip count is less than VF * UF, or
2936   // equal to it in case a scalar epilogue is required; this implies that the
2937   // vector trip count is zero. This check also covers the case where adding one
2938   // to the backedge-taken count overflowed leading to an incorrect trip count
2939   // of zero. In this case we will also jump to the scalar loop.
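  // As a purely illustrative example, with VF = 4 and UF = 2 (and assuming
  // the minimum profitable trip count does not exceed VF * UF), the vector
  // loop is bypassed when the trip count is less than 8, or less than or
  // equal to 8 when a scalar epilogue must run.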
2940   auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE
2941                                             : ICmpInst::ICMP_ULT;
2942 
2943   // If tail is to be folded, vector loop takes care of all iterations.
2944   Type *CountTy = Count->getType();
2945   Value *CheckMinIters = Builder.getFalse();
2946   auto CreateStep = [&]() -> Value * {
    // Create a step of max(MinProfitableTripCount, UF * VF).
2948     if (UF * VF.getKnownMinValue() >= MinProfitableTripCount.getKnownMinValue())
2949       return createStepForVF(Builder, CountTy, VF, UF);
2950 
2951     Value *MinProfTC =
2952         createStepForVF(Builder, CountTy, MinProfitableTripCount, 1);
2953     if (!VF.isScalable())
2954       return MinProfTC;
2955     return Builder.CreateBinaryIntrinsic(
2956         Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF));
2957   };
2958 
2959   if (!Cost->foldTailByMasking())
2960     CheckMinIters =
2961         Builder.CreateICmp(P, Count, CreateStep(), "min.iters.check");
2962   else if (VF.isScalable()) {
2963     // vscale is not necessarily a power-of-2, which means we cannot guarantee
2964     // an overflow to zero when updating induction variables and so an
2965     // additional overflow check is required before entering the vector loop.
2966 
2967     // Get the maximum unsigned value for the type.
2968     Value *MaxUIntTripCount =
2969         ConstantInt::get(CountTy, cast<IntegerType>(CountTy)->getMask());
2970     Value *LHS = Builder.CreateSub(MaxUIntTripCount, Count);
2971 
2972     // Don't execute the vector loop if (UMax - n) < (VF * UF).
2973     CheckMinIters = Builder.CreateICmp(ICmpInst::ICMP_ULT, LHS, CreateStep());
2974   }
2975 
2976   // Create new preheader for vector loop.
2977   LoopVectorPreHeader =
2978       SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
2979                  "vector.ph");
2980 
2981   assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
2982                                DT->getNode(Bypass)->getIDom()) &&
2983          "TC check is expected to dominate Bypass");
2984 
2985   // Update dominator for Bypass & LoopExit (if needed).
2986   DT->changeImmediateDominator(Bypass, TCCheckBlock);
2987   if (!Cost->requiresScalarEpilogue(VF))
2988     // If there is an epilogue which must run, there's no edge from the
    // middle block to exit blocks and thus no need to update the immediate
2990     // dominator of the exit blocks.
2991     DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
2992 
2993   ReplaceInstWithInst(
2994       TCCheckBlock->getTerminator(),
2995       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
2996   LoopBypassBlocks.push_back(TCCheckBlock);
2997 }
2998 
2999 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) {
3000   BasicBlock *const SCEVCheckBlock =
3001       RTChecks.emitSCEVChecks(Bypass, LoopVectorPreHeader, LoopExitBlock);
3002   if (!SCEVCheckBlock)
3003     return nullptr;
3004 
3005   assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
3006            (OptForSizeBasedOnProfile &&
3007             Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
3008          "Cannot SCEV check stride or overflow when optimizing for size");
3009 
3010 
  // Update dominator only if this is the first RT check.
3012   if (LoopBypassBlocks.empty()) {
3013     DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
3014     if (!Cost->requiresScalarEpilogue(VF))
3015       // If there is an epilogue which must run, there's no edge from the
      // middle block to exit blocks and thus no need to update the immediate
3017       // dominator of the exit blocks.
3018       DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
3019   }
3020 
3021   LoopBypassBlocks.push_back(SCEVCheckBlock);
3022   AddedSafetyChecks = true;
3023   return SCEVCheckBlock;
3024 }
3025 
3026 BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) {
3027   // VPlan-native path does not do any analysis for runtime checks currently.
3028   if (EnableVPlanNativePath)
3029     return nullptr;
3030 
3031   BasicBlock *const MemCheckBlock =
3032       RTChecks.emitMemRuntimeChecks(Bypass, LoopVectorPreHeader);
3033 
  // Check if we generated code that checks at runtime if arrays overlap. We put
3035   // the checks into a separate block to make the more common case of few
3036   // elements faster.
3037   if (!MemCheckBlock)
3038     return nullptr;
3039 
3040   if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
3041     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
3042            "Cannot emit memory checks when optimizing for size, unless forced "
3043            "to vectorize.");
3044     ORE->emit([&]() {
3045       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
3046                                         OrigLoop->getStartLoc(),
3047                                         OrigLoop->getHeader())
3048              << "Code-size may be reduced by not forcing "
3049                 "vectorization, or by source-code modifications "
3050                 "eliminating the need for runtime checks "
3051                 "(e.g., adding 'restrict').";
3052     });
3053   }
3054 
3055   LoopBypassBlocks.push_back(MemCheckBlock);
3056 
3057   AddedSafetyChecks = true;
3058 
3059   return MemCheckBlock;
3060 }
3061 
3062 void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3063   LoopScalarBody = OrigLoop->getHeader();
3064   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3065   assert(LoopVectorPreHeader && "Invalid loop structure");
3066   LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
3067   assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) &&
3068          "multiple exit loop without required epilogue?");
3069 
3070   LoopMiddleBlock =
3071       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
3072                  LI, nullptr, Twine(Prefix) + "middle.block");
3073   LoopScalarPreHeader =
3074       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
3075                  nullptr, Twine(Prefix) + "scalar.ph");
3076 
3077   auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3078 
3079   // Set up the middle block terminator.  Two cases:
3080   // 1) If we know that we must execute the scalar epilogue, emit an
3081   //    unconditional branch.
3082   // 2) Otherwise, we must have a single unique exit block (due to how we
  //    implement the multiple exit case).  In this case, set up a conditional
3084   //    branch from the middle block to the loop scalar preheader, and the
3085   //    exit block.  completeLoopSkeleton will update the condition to use an
3086   //    iteration check, if required to decide whether to execute the remainder.
3087   BranchInst *BrInst = Cost->requiresScalarEpilogue(VF) ?
3088     BranchInst::Create(LoopScalarPreHeader) :
3089     BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
3090                        Builder.getTrue());
3091   BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3092   ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3093 
3094   // Update dominator for loop exit. During skeleton creation, only the vector
3095   // pre-header and the middle block are created. The vector loop is entirely
  // created during VPlan execution.
3097   if (!Cost->requiresScalarEpilogue(VF))
3098     // If there is an epilogue which must run, there's no edge from the
    // middle block to exit blocks and thus no need to update the immediate
3100     // dominator of the exit blocks.
3101     DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3102 }
3103 
3104 void InnerLoopVectorizer::createInductionResumeValues(
3105     std::pair<BasicBlock *, Value *> AdditionalBypass) {
3106   assert(((AdditionalBypass.first && AdditionalBypass.second) ||
3107           (!AdditionalBypass.first && !AdditionalBypass.second)) &&
3108          "Inconsistent information about additional bypass.");
3109 
3110   Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
3111   assert(VectorTripCount && "Expected valid arguments");
3112   // We are going to resume the execution of the scalar loop.
3113   // Go over all of the induction variables that we found and fix the
3114   // PHIs that are left in the scalar version of the loop.
3115   // The starting values of PHI nodes depend on the counter of the last
3116   // iteration in the vectorized loop.
3117   // If we come from a bypass edge then we need to start from the original
3118   // start value.
3119   Instruction *OldInduction = Legal->getPrimaryInduction();
3120   for (auto &InductionEntry : Legal->getInductionVars()) {
3121     PHINode *OrigPhi = InductionEntry.first;
3122     InductionDescriptor II = InductionEntry.second;
3123 
3124     Value *&EndValue = IVEndValues[OrigPhi];
3125     Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
3126     if (OrigPhi == OldInduction) {
3127       // We know what the end value is.
3128       EndValue = VectorTripCount;
3129     } else {
3130       IRBuilder<> B(LoopVectorPreHeader->getTerminator());
3131 
3132       // Fast-math-flags propagate from the original induction instruction.
3133       if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3134         B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3135 
3136       Type *StepType = II.getStep()->getType();
3137       Instruction::CastOps CastOp =
3138           CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
3139       Value *VTC = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.vtc");
3140       Value *Step =
3141           CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint());
3142       EndValue = emitTransformedIndex(B, VTC, II.getStartValue(), Step, II);
3143       EndValue->setName("ind.end");
3144 
3145       // Compute the end value for the additional bypass (if applicable).
3146       if (AdditionalBypass.first) {
3147         B.SetInsertPoint(&(*AdditionalBypass.first->getFirstInsertionPt()));
3148         CastOp = CastInst::getCastOpcode(AdditionalBypass.second, true,
3149                                          StepType, true);
3150         Value *Step =
3151             CreateStepValue(II.getStep(), *PSE.getSE(), &*B.GetInsertPoint());
3152         VTC =
3153             B.CreateCast(CastOp, AdditionalBypass.second, StepType, "cast.vtc");
3154         EndValueFromAdditionalBypass =
3155             emitTransformedIndex(B, VTC, II.getStartValue(), Step, II);
3156         EndValueFromAdditionalBypass->setName("ind.end");
3157       }
3158     }
3159 
    // Create phi nodes to merge from the backedge-taken check block.
3161     PHINode *BCResumeVal =
3162         PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3163                         LoopScalarPreHeader->getTerminator());
3164     // Copy original phi DL over to the new one.
3165     BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3166 
3167     // The new PHI merges the original incoming value, in case of a bypass,
3168     // or the value at the end of the vectorized loop.
3169     BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3170 
3171     // Fix the scalar body counter (PHI node).
3172     // The old induction's phi node in the scalar body needs the truncated
3173     // value.
3174     for (BasicBlock *BB : LoopBypassBlocks)
3175       BCResumeVal->addIncoming(II.getStartValue(), BB);
3176 
3177     if (AdditionalBypass.first)
3178       BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
3179                                             EndValueFromAdditionalBypass);
3180 
3181     OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3182   }
3183 }
3184 
3185 BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(MDNode *OrigLoopID) {
3186   // The trip counts should be cached by now.
3187   Value *Count = getOrCreateTripCount(LoopVectorPreHeader);
3188   Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
3189 
3190   auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3191 
3192   // Add a check in the middle block to see if we have completed
3193   // all of the iterations in the first vector loop.  Three cases:
3194   // 1) If we require a scalar epilogue, there is no conditional branch as
3195   //    we unconditionally branch to the scalar preheader.  Do nothing.
3196   // 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
3197   //    Thus if tail is to be folded, we know we don't need to run the
3198   //    remainder and we can use the previous value for the condition (true).
3199   // 3) Otherwise, construct a runtime check.
3200   if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) {
3201     Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
3202                                         Count, VectorTripCount, "cmp.n",
3203                                         LoopMiddleBlock->getTerminator());
3204 
3205     // Here we use the same DebugLoc as the scalar loop latch terminator instead
3206     // of the corresponding compare because they may have ended up with
3207     // different line numbers and we want to avoid awkward line stepping while
    // debugging. E.g. if the compare has a line number inside the loop.
3209     CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc());
3210     cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN);
3211   }
3212 
3213 #ifdef EXPENSIVE_CHECKS
3214   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3215 #endif
3216 
3217   return LoopVectorPreHeader;
3218 }
3219 
3220 std::pair<BasicBlock *, Value *>
3221 InnerLoopVectorizer::createVectorizedLoopSkeleton() {
3222   /*
3223    In this function we generate a new loop. The new loop will contain
3224    the vectorized instructions while the old loop will continue to run the
3225    scalar remainder.
3226 
3227        [ ] <-- loop iteration number check.
3228     /   |
3229    /    v
3230   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
3231   |  /  |
3232   | /   v
3233   ||   [ ]     <-- vector pre header.
3234   |/    |
3235   |     v
3236   |    [  ] \
3237   |    [  ]_|   <-- vector loop (created during VPlan execution).
3238   |     |
3239   |     v
3240   \   -[ ]   <--- middle-block.
3241    \/   |
3242    /\   v
3243    | ->[ ]     <--- new preheader.
3244    |    |
3245  (opt)  v      <-- edge from middle to exit iff epilogue is not required.
3246    |   [ ] \
3247    |   [ ]_|   <-- old scalar loop to handle remainder (scalar epilogue).
3248     \   |
3249      \  v
3250       >[ ]     <-- exit block(s).
3251    ...
3252    */
3253 
3254   // Get the metadata of the original loop before it gets modified.
3255   MDNode *OrigLoopID = OrigLoop->getLoopID();
3256 
3257   // Workaround!  Compute the trip count of the original loop and cache it
3258   // before we start modifying the CFG.  This code has a systemic problem
3259   // wherein it tries to run analysis over partially constructed IR; this is
3260   // wrong, and not simply for SCEV.  The trip count of the original loop
3261   // simply happens to be prone to hitting this in practice.  In theory, we
3262   // can hit the same issue for any SCEV, or ValueTracking query done during
3263   // mutation.  See PR49900.
3264   getOrCreateTripCount(OrigLoop->getLoopPreheader());
3265 
3266   // Create an empty vector loop, and prepare basic blocks for the runtime
3267   // checks.
3268   createVectorLoopSkeleton("");
3269 
3270   // Now, compare the new count to zero. If it is zero skip the vector loop and
3271   // jump to the scalar loop. This check also covers the case where the
3272   // backedge-taken count is uint##_max: adding one to it will overflow leading
3273   // to an incorrect trip count of zero. In this (rare) case we will also jump
3274   // to the scalar loop.
3275   emitIterationCountCheck(LoopScalarPreHeader);
3276 
3277   // Generate the code to check any assumptions that we've made for SCEV
3278   // expressions.
3279   emitSCEVChecks(LoopScalarPreHeader);
3280 
  // Generate the code that checks at runtime if arrays overlap. We put the
3282   // checks into a separate block to make the more common case of few elements
3283   // faster.
3284   emitMemRuntimeChecks(LoopScalarPreHeader);
3285 
3286   // Emit phis for the new starting index of the scalar loop.
3287   createInductionResumeValues();
3288 
3289   return {completeLoopSkeleton(OrigLoopID), nullptr};
3290 }
3291 
3292 // Fix up external users of the induction variable. At this point, we are
3293 // in LCSSA form, with all external PHIs that use the IV having one input value,
3294 // coming from the remainder loop. We need those PHIs to also have a correct
3295 // value for the IV when arriving directly from the middle block.
3296 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3297                                        const InductionDescriptor &II,
3298                                        Value *VectorTripCount, Value *EndValue,
3299                                        BasicBlock *MiddleBlock,
3300                                        BasicBlock *VectorHeader, VPlan &Plan) {
3301   // There are two kinds of external IV usages - those that use the value
3302   // computed in the last iteration (the PHI) and those that use the penultimate
3303   // value (the value that feeds into the phi from the loop latch).
3304   // We allow both, but they, obviously, have different values.
3305 
3306   assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
3307 
3308   DenseMap<Value *, Value *> MissingVals;
3309 
3310   // An external user of the last iteration's value should see the value that
3311   // the remainder loop uses to initialize its own IV.
3312   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3313   for (User *U : PostInc->users()) {
3314     Instruction *UI = cast<Instruction>(U);
3315     if (!OrigLoop->contains(UI)) {
3316       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3317       MissingVals[UI] = EndValue;
3318     }
3319   }
3320 
  // An external user of the penultimate value needs to see EndValue - Step.
3322   // The simplest way to get this is to recompute it from the constituent SCEVs,
3323   // that is Start + (Step * (CRD - 1)).
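  // As a purely illustrative example, for an induction starting at 0 with
  // step 2 and a vector trip count of 8, a user of the post-increment value
  // sees 16 while a user of the phi itself sees 0 + 2 * (8 - 1) = 14.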
3324   for (User *U : OrigPhi->users()) {
3325     auto *UI = cast<Instruction>(U);
3326     if (!OrigLoop->contains(UI)) {
3327       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3328 
3329       IRBuilder<> B(MiddleBlock->getTerminator());
3330 
3331       // Fast-math-flags propagate from the original induction instruction.
3332       if (II.getInductionBinOp() && isa<FPMathOperator>(II.getInductionBinOp()))
3333         B.setFastMathFlags(II.getInductionBinOp()->getFastMathFlags());
3334 
3335       Value *CountMinusOne = B.CreateSub(
3336           VectorTripCount, ConstantInt::get(VectorTripCount->getType(), 1));
3337       Value *CMO =
3338           !II.getStep()->getType()->isIntegerTy()
3339               ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3340                              II.getStep()->getType())
3341               : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3342       CMO->setName("cast.cmo");
3343 
3344       Value *Step = CreateStepValue(II.getStep(), *PSE.getSE(),
3345                                     VectorHeader->getTerminator());
3346       Value *Escape =
3347           emitTransformedIndex(B, CMO, II.getStartValue(), Step, II);
3348       Escape->setName("ind.escape");
3349       MissingVals[UI] = Escape;
3350     }
3351   }
3352 
3353   for (auto &I : MissingVals) {
3354     PHINode *PHI = cast<PHINode>(I.first);
    // One corner case we have to handle is two IVs "chasing" each other,
3356     // that is %IV2 = phi [...], [ %IV1, %latch ]
3357     // In this case, if IV1 has an external use, we need to avoid adding both
3358     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3359     // don't already have an incoming value for the middle block.
3360     if (PHI->getBasicBlockIndex(MiddleBlock) == -1) {
3361       PHI->addIncoming(I.second, MiddleBlock);
3362       Plan.removeLiveOut(PHI);
3363     }
3364   }
3365 }
3366 
3367 namespace {
3368 
3369 struct CSEDenseMapInfo {
3370   static bool canHandle(const Instruction *I) {
3371     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3372            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3373   }
3374 
3375   static inline Instruction *getEmptyKey() {
3376     return DenseMapInfo<Instruction *>::getEmptyKey();
3377   }
3378 
3379   static inline Instruction *getTombstoneKey() {
3380     return DenseMapInfo<Instruction *>::getTombstoneKey();
3381   }
3382 
3383   static unsigned getHashValue(const Instruction *I) {
3384     assert(canHandle(I) && "Unknown instruction!");
3385     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3386                                                            I->value_op_end()));
3387   }
3388 
3389   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3390     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3391         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3392       return LHS == RHS;
3393     return LHS->isIdenticalTo(RHS);
3394   }
3395 };
3396 
3397 } // end anonymous namespace
3398 
/// Perform CSE of induction variable instructions.
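/// For example, the per-part address computations created while unrolling can
/// yield several identical getelementptr or insertelement instructions; all
/// but the first occurrence are replaced and erased here.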
3400 static void cse(BasicBlock *BB) {
  // Perform simple CSE.
3402   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3403   for (Instruction &In : llvm::make_early_inc_range(*BB)) {
3404     if (!CSEDenseMapInfo::canHandle(&In))
3405       continue;
3406 
3407     // Check if we can replace this instruction with any of the
3408     // visited instructions.
3409     if (Instruction *V = CSEMap.lookup(&In)) {
3410       In.replaceAllUsesWith(V);
3411       In.eraseFromParent();
3412       continue;
3413     }
3414 
3415     CSEMap[&In] = &In;
3416   }
3417 }
3418 
3419 InstructionCost
3420 LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
3421                                               bool &NeedToScalarize) const {
3422   Function *F = CI->getCalledFunction();
3423   Type *ScalarRetTy = CI->getType();
3424   SmallVector<Type *, 4> Tys, ScalarTys;
3425   for (auto &ArgOp : CI->args())
3426     ScalarTys.push_back(ArgOp->getType());
3427 
3428   // Estimate cost of scalarized vector call. The source operands are assumed
  // to be vectors, so we need to extract individual elements from them,
3430   // execute VF scalar calls, and then gather the result into the vector return
3431   // value.
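  // As a purely illustrative example, at VF = 4 a call with scalar cost 10
  // and scalarization overhead 6 is estimated at 4 * 10 + 6 = 46; if a vector
  // variant of the callee exists and is cheaper, its cost is returned instead
  // and NeedToScalarize is cleared.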
3432   InstructionCost ScalarCallCost =
3433       TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput);
3434   if (VF.isScalar())
3435     return ScalarCallCost;
3436 
3437   // Compute corresponding vector type for return value and arguments.
3438   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3439   for (Type *ScalarTy : ScalarTys)
3440     Tys.push_back(ToVectorTy(ScalarTy, VF));
3441 
3442   // Compute costs of unpacking argument values for the scalar calls and
3443   // packing the return values to a vector.
3444   InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);
3445 
3446   InstructionCost Cost =
3447       ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
3448 
3449   // If we can't emit a vector call for this function, then the currently found
3450   // cost is the cost we need to return.
3451   NeedToScalarize = true;
3452   VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
3453   Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
3454 
3455   if (!TLI || CI->isNoBuiltin() || !VecFunc)
3456     return Cost;
3457 
3458   // If the corresponding vector cost is cheaper, return its cost.
3459   InstructionCost VectorCallCost =
3460       TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput);
3461   if (VectorCallCost < Cost) {
3462     NeedToScalarize = false;
3463     Cost = VectorCallCost;
3464   }
3465   return Cost;
3466 }
3467 
3468 static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) {
3469   if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
3470     return Elt;
3471   return VectorType::get(Elt, VF);
3472 }
3473 
3474 InstructionCost
3475 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3476                                                    ElementCount VF) const {
3477   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3478   assert(ID && "Expected intrinsic call!");
3479   Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
3480   FastMathFlags FMF;
3481   if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3482     FMF = FPMO->getFastMathFlags();
3483 
3484   SmallVector<const Value *> Arguments(CI->args());
3485   FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
3486   SmallVector<Type *> ParamTys;
3487   std::transform(FTy->param_begin(), FTy->param_end(),
3488                  std::back_inserter(ParamTys),
3489                  [&](Type *Ty) { return MaybeVectorizeType(Ty, VF); });
3490 
3491   IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
3492                                     dyn_cast<IntrinsicInst>(CI));
3493   return TTI.getIntrinsicInstrCost(CostAttrs,
3494                                    TargetTransformInfo::TCK_RecipThroughput);
3495 }
3496 
3497 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3498   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3499   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3500   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3501 }
3502 
3503 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3504   auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
3505   auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
3506   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3507 }
3508 
3509 void InnerLoopVectorizer::truncateToMinimalBitwidths(VPTransformState &State) {
3510   // For every instruction `I` in MinBWs, truncate the operands, create a
3511   // truncated version of `I` and reextend its result. InstCombine runs
3512   // later and will remove any ext/trunc pairs.
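  // For example (illustrative only), if MinBWs records that an i32 add only
  // needs 8 bits, its <4 x i32> operands are truncated to <4 x i8>, the add
  // is re-created on <4 x i8>, and the result is zero-extended back to
  // <4 x i32>.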
3513   SmallPtrSet<Value *, 4> Erased;
3514   for (const auto &KV : Cost->getMinimalBitwidths()) {
3515     // If the value wasn't vectorized, we must maintain the original scalar
3516     // type. The absence of the value from State indicates that it
3517     // wasn't vectorized.
3518     // FIXME: Should not rely on getVPValue at this point.
3519     VPValue *Def = State.Plan->getVPValue(KV.first, true);
3520     if (!State.hasAnyVectorValue(Def))
3521       continue;
3522     for (unsigned Part = 0; Part < UF; ++Part) {
3523       Value *I = State.get(Def, Part);
3524       if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
3525         continue;
3526       Type *OriginalTy = I->getType();
3527       Type *ScalarTruncatedTy =
3528           IntegerType::get(OriginalTy->getContext(), KV.second);
3529       auto *TruncatedTy = VectorType::get(
3530           ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getElementCount());
3531       if (TruncatedTy == OriginalTy)
3532         continue;
3533 
3534       IRBuilder<> B(cast<Instruction>(I));
3535       auto ShrinkOperand = [&](Value *V) -> Value * {
3536         if (auto *ZI = dyn_cast<ZExtInst>(V))
3537           if (ZI->getSrcTy() == TruncatedTy)
3538             return ZI->getOperand(0);
3539         return B.CreateZExtOrTrunc(V, TruncatedTy);
3540       };
3541 
3542       // The actual instruction modification depends on the instruction type,
3543       // unfortunately.
3544       Value *NewI = nullptr;
3545       if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3546         NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3547                              ShrinkOperand(BO->getOperand(1)));
3548 
3549         // Any wrapping introduced by shrinking this operation shouldn't be
3550         // considered undefined behavior. So, we can't unconditionally copy
3551         // arithmetic wrapping flags to NewI.
3552         cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3553       } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3554         NewI =
3555             B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3556                          ShrinkOperand(CI->getOperand(1)));
3557       } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3558         NewI = B.CreateSelect(SI->getCondition(),
3559                               ShrinkOperand(SI->getTrueValue()),
3560                               ShrinkOperand(SI->getFalseValue()));
3561       } else if (auto *CI = dyn_cast<CastInst>(I)) {
3562         switch (CI->getOpcode()) {
3563         default:
3564           llvm_unreachable("Unhandled cast!");
3565         case Instruction::Trunc:
3566           NewI = ShrinkOperand(CI->getOperand(0));
3567           break;
3568         case Instruction::SExt:
3569           NewI = B.CreateSExtOrTrunc(
3570               CI->getOperand(0),
3571               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3572           break;
3573         case Instruction::ZExt:
3574           NewI = B.CreateZExtOrTrunc(
3575               CI->getOperand(0),
3576               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3577           break;
3578         }
3579       } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3580         auto Elements0 =
3581             cast<VectorType>(SI->getOperand(0)->getType())->getElementCount();
3582         auto *O0 = B.CreateZExtOrTrunc(
3583             SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
3584         auto Elements1 =
3585             cast<VectorType>(SI->getOperand(1)->getType())->getElementCount();
3586         auto *O1 = B.CreateZExtOrTrunc(
3587             SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
3588 
3589         NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
3590       } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3591         // Don't do anything with the operands, just extend the result.
3592         continue;
3593       } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3594         auto Elements =
3595             cast<VectorType>(IE->getOperand(0)->getType())->getElementCount();
3596         auto *O0 = B.CreateZExtOrTrunc(
3597             IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3598         auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3599         NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3600       } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3601         auto Elements =
3602             cast<VectorType>(EE->getOperand(0)->getType())->getElementCount();
3603         auto *O0 = B.CreateZExtOrTrunc(
3604             EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3605         NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3606       } else {
3607         // If we don't know what to do, be conservative and don't do anything.
3608         continue;
3609       }
3610 
3611       // Lastly, extend the result.
3612       NewI->takeName(cast<Instruction>(I));
3613       Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3614       I->replaceAllUsesWith(Res);
3615       cast<Instruction>(I)->eraseFromParent();
3616       Erased.insert(I);
3617       State.reset(Def, Res, Part);
3618     }
3619   }
3620 
3621   // We'll have created a bunch of ZExts that are now parentless. Clean up.
3622   for (const auto &KV : Cost->getMinimalBitwidths()) {
3623     // If the value wasn't vectorized, we must maintain the original scalar
3624     // type. The absence of the value from State indicates that it
3625     // wasn't vectorized.
3626     // FIXME: Should not rely on getVPValue at this point.
3627     VPValue *Def = State.Plan->getVPValue(KV.first, true);
3628     if (!State.hasAnyVectorValue(Def))
3629       continue;
3630     for (unsigned Part = 0; Part < UF; ++Part) {
3631       Value *I = State.get(Def, Part);
3632       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3633       if (Inst && Inst->use_empty()) {
3634         Value *NewI = Inst->getOperand(0);
3635         Inst->eraseFromParent();
3636         State.reset(Def, NewI, Part);
3637       }
3638     }
3639   }
3640 }
3641 
3642 void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
3643                                             VPlan &Plan) {
3644   // Insert truncates and extends for any truncated instructions as hints to
3645   // InstCombine.
3646   if (VF.isVector())
3647     truncateToMinimalBitwidths(State);
3648 
3649   // Fix widened non-induction PHIs by setting up the PHI operands.
3650   if (EnableVPlanNativePath)
3651     fixNonInductionPHIs(Plan, State);
3652 
3653   // At this point every instruction in the original loop is widened to a
3654   // vector form. Now we need to fix the recurrences in the loop. These PHI
3655   // nodes are currently empty because we did not want to introduce cycles.
3656   // This is the second stage of vectorizing recurrences.
3657   fixCrossIterationPHIs(State);
3658 
3659   // Forget the original basic block.
3660   PSE.getSE()->forgetLoop(OrigLoop);
3661 
3662   VPBasicBlock *LatchVPBB = Plan.getVectorLoopRegion()->getExitingBasicBlock();
3663   Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]);
3664   if (Cost->requiresScalarEpilogue(VF)) {
3665     // No edge from the middle block to the unique exit block has been inserted
3666     // and there is nothing to fix from vector loop; phis should have incoming
3667     // from scalar loop only.
3668     Plan.clearLiveOuts();
3669   } else {
3670     // If we inserted an edge from the middle block to the unique exit block,
3671     // update uses outside the loop (phis) to account for the newly inserted
3672     // edge.
3673 
3674     // Fix-up external users of the induction variables.
3675     for (auto &Entry : Legal->getInductionVars())
3676       fixupIVUsers(Entry.first, Entry.second,
3677                    getOrCreateVectorTripCount(VectorLoop->getLoopPreheader()),
3678                    IVEndValues[Entry.first], LoopMiddleBlock,
3679                    VectorLoop->getHeader(), Plan);
3680   }
3681 
3682   // Fix LCSSA phis not already fixed earlier. Extracts may need to be generated
3683   // in the exit block, so update the builder.
3684   State.Builder.SetInsertPoint(State.CFG.ExitBB->getFirstNonPHI());
3685   for (auto &KV : Plan.getLiveOuts())
3686     KV.second->fixPhi(Plan, State);
3687 
3688   for (Instruction *PI : PredicatedInstructions)
3689     sinkScalarOperands(&*PI);
3690 
3691   // Remove redundant induction instructions.
3692   cse(VectorLoop->getHeader());
3693 
  // Set/update profile weights for the vector and remainder loops as original
  // loop iterations are now distributed among them. Note that the original
  // loop, represented by LoopScalarBody, becomes the remainder loop after
  // vectorization.
  //
  // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
  // end up with a slightly less accurate result, but that should be OK since
  // the profile is not inherently precise anyway. Note also that a possible
  // bypass of the vector code caused by legality checks is ignored,
  // optimistically assigning all the weight to the vector loop.
  //
  // For scalable vectorization we can't know at compile time how many
  // iterations of the loop are handled in one vector iteration, so instead
  // assume a pessimistic vscale of '1'.
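  //
  // As a rough illustration (assuming VF * UF == 8 and an original average
  // trip count of about 800): the vector loop is credited with roughly
  // 800 / 8 = 100 iterations' worth of weight, and the scalar remainder loop
  // with the few iterations left over.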
3707   setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody), VectorLoop,
3708                                LI->getLoopFor(LoopScalarBody),
3709                                VF.getKnownMinValue() * UF);
3710 }
3711 
3712 void InnerLoopVectorizer::fixCrossIterationPHIs(VPTransformState &State) {
3713   // In order to support recurrences we need to be able to vectorize Phi nodes.
3714   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3715   // stage #2: We now need to fix the recurrences by adding incoming edges to
3716   // the currently empty PHI nodes. At this point every instruction in the
3717   // original loop is widened to a vector form so we can use them to construct
3718   // the incoming edges.
3719   VPBasicBlock *Header =
3720       State.Plan->getVectorLoopRegion()->getEntryBasicBlock();
3721   for (VPRecipeBase &R : Header->phis()) {
3722     if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R))
3723       fixReduction(ReductionPhi, State);
3724     else if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R))
3725       fixFirstOrderRecurrence(FOR, State);
3726   }
3727 }
3728 
3729 void InnerLoopVectorizer::fixFirstOrderRecurrence(
3730     VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) {
3731   // This is the second phase of vectorizing first-order recurrences. An
3732   // overview of the transformation is described below. Suppose we have the
3733   // following loop.
3734   //
3735   //   for (int i = 0; i < n; ++i)
3736   //     b[i] = a[i] - a[i - 1];
3737   //
3738   // There is a first-order recurrence on "a". For this loop, the shorthand
3739   // scalar IR looks like:
3740   //
3741   //   scalar.ph:
3742   //     s_init = a[-1]
3743   //     br scalar.body
3744   //
3745   //   scalar.body:
3746   //     i = phi [0, scalar.ph], [i+1, scalar.body]
3747   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3748   //     s2 = a[i]
3749   //     b[i] = s2 - s1
3750   //     br cond, scalar.body, ...
3751   //
  // In this example, s1 is a recurrence because its value depends on the
3753   // previous iteration. In the first phase of vectorization, we created a
3754   // vector phi v1 for s1. We now complete the vectorization and produce the
3755   // shorthand vector IR shown below (for VF = 4, UF = 1).
3756   //
3757   //   vector.ph:
3758   //     v_init = vector(..., ..., ..., a[-1])
3759   //     br vector.body
3760   //
3761   //   vector.body
3762   //     i = phi [0, vector.ph], [i+4, vector.body]
3763   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
3764   //     v2 = a[i, i+1, i+2, i+3];
3765   //     v3 = vector(v1(3), v2(0, 1, 2))
3766   //     b[i, i+1, i+2, i+3] = v2 - v3
3767   //     br cond, vector.body, middle.block
3768   //
3769   //   middle.block:
3770   //     x = v2(3)
3771   //     br scalar.ph
3772   //
3773   //   scalar.ph:
3774   //     s_init = phi [x, middle.block], [a[-1], otherwise]
3775   //     br scalar.body
3776   //
  // After the vector loop completes execution, we extract the next value of
  // the recurrence (x) to use as the initial value in the scalar loop.
3779 
3780   // Extract the last vector element in the middle block. This will be the
3781   // initial value for the recurrence when jumping to the scalar loop.
3782   VPValue *PreviousDef = PhiR->getBackedgeValue();
3783   Value *Incoming = State.get(PreviousDef, UF - 1);
3784   auto *ExtractForScalar = Incoming;
3785   auto *IdxTy = Builder.getInt32Ty();
3786   if (VF.isVector()) {
3787     auto *One = ConstantInt::get(IdxTy, 1);
3788     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3789     auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
3790     auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
3791     ExtractForScalar = Builder.CreateExtractElement(ExtractForScalar, LastIdx,
3792                                                     "vector.recur.extract");
3793   }
  // Extract the second-to-last element in the middle block if the
  // Phi is used outside the loop. We need the value of the phi itself
  // and not the last element (the phi update of the current iteration). This
  // will be the value when jumping to the exit block from LoopMiddleBlock,
  // when the scalar loop is not run at all.
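  // In terms of the VF = 4 example above, ExtractForScalar corresponds to
  // x = v2(3), while ExtractForPhiUsedOutsideLoop corresponds to v2(2), i.e.
  // the value the scalar phi s1 would have held in the last iteration covered
  // by the vector loop.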
3799   Value *ExtractForPhiUsedOutsideLoop = nullptr;
3800   if (VF.isVector()) {
3801     auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, VF);
3802     auto *Idx = Builder.CreateSub(RuntimeVF, ConstantInt::get(IdxTy, 2));
3803     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3804         Incoming, Idx, "vector.recur.extract.for.phi");
3805   } else if (UF > 1)
    // When the loop is unrolled without vectorizing, initialize
    // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled
    // value of `Incoming`. This is analogous to the vectorized case above:
    // extracting the second-to-last element when VF > 1.
3810     ExtractForPhiUsedOutsideLoop = State.get(PreviousDef, UF - 2);
3811 
3812   // Fix the initial value of the original recurrence in the scalar loop.
3813   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3814   PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
3815   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3816   auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue();
3817   for (auto *BB : predecessors(LoopScalarPreHeader)) {
3818     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3819     Start->addIncoming(Incoming, BB);
3820   }
3821 
3822   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3823   Phi->setName("scalar.recur");
3824 
3825   // Finally, fix users of the recurrence outside the loop. The users will need
3826   // either the last value of the scalar recurrence or the last value of the
3827   // vector recurrence we extracted in the middle block. Since the loop is in
3828   // LCSSA form, we just need to find all the phi nodes for the original scalar
3829   // recurrence in the exit block, and then add an edge for the middle block.
  // Note that LCSSA does not imply single entry when the original scalar loop
  // had multiple exiting edges (as we always run the last iteration in the
  // scalar epilogue); in that case, there is no edge from the middle block to
  // the exit block, and thus no phis that need updating.
3834   if (!Cost->requiresScalarEpilogue(VF))
3835     for (PHINode &LCSSAPhi : LoopExitBlock->phis())
3836       if (llvm::is_contained(LCSSAPhi.incoming_values(), Phi)) {
3837         LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3838         State.Plan->removeLiveOut(&LCSSAPhi);
3839       }
3840 }
3841 
3842 void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
3843                                        VPTransformState &State) {
3844   PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
  // Get its reduction variable descriptor.
3846   assert(Legal->isReductionVariable(OrigPhi) &&
3847          "Unable to find the reduction variable");
3848   const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
3849 
3850   RecurKind RK = RdxDesc.getRecurrenceKind();
3851   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3852   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3853   State.setDebugLocFromInst(ReductionStartValue);
3854 
3855   VPValue *LoopExitInstDef = PhiR->getBackedgeValue();
3856   // This is the vector-clone of the value that leaves the loop.
3857   Type *VecTy = State.get(LoopExitInstDef, 0)->getType();
3858 
  // Wrap flags are in general invalid after vectorization, so clear them.
3860   clearReductionWrapFlags(PhiR, State);
3861 
3862   // Before each round, move the insertion point right between
3863   // the PHIs and the values we are going to write.
3864   // This allows us to write both PHINodes and the extractelement
3865   // instructions.
3866   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3867 
3868   State.setDebugLocFromInst(LoopExitInst);
3869 
3870   Type *PhiTy = OrigPhi->getType();
3871 
3872   VPBasicBlock *LatchVPBB =
3873       PhiR->getParent()->getEnclosingLoopRegion()->getExitingBasicBlock();
3874   BasicBlock *VectorLoopLatch = State.CFG.VPBB2IRBB[LatchVPBB];
3875   // If tail is folded by masking, the vector value to leave the loop should be
3876   // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
3877   // instead of the former. For an inloop reduction the reduction will already
3878   // be predicated, and does not need to be handled here.
3879   if (Cost->foldTailByMasking() && !PhiR->isInLoop()) {
3880     for (unsigned Part = 0; Part < UF; ++Part) {
3881       Value *VecLoopExitInst = State.get(LoopExitInstDef, Part);
3882       SelectInst *Sel = nullptr;
3883       for (User *U : VecLoopExitInst->users()) {
3884         if (isa<SelectInst>(U)) {
3885           assert(!Sel && "Reduction exit feeding two selects");
3886           Sel = cast<SelectInst>(U);
3887         } else
3888           assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
3889       }
3890       assert(Sel && "Reduction exit feeds no select");
3891       State.reset(LoopExitInstDef, Sel, Part);
3892 
3893       if (isa<FPMathOperator>(Sel))
3894         Sel->setFastMathFlags(RdxDesc.getFastMathFlags());
3895 
3896       // If the target can create a predicated operator for the reduction at no
3897       // extra cost in the loop (for example a predicated vadd), it can be
3898       // cheaper for the select to remain in the loop than be sunk out of it,
3899       // and so use the select value for the phi instead of the old
3900       // LoopExitValue.
3901       if (PreferPredicatedReductionSelect ||
3902           TTI->preferPredicatedReductionSelect(
3903               RdxDesc.getOpcode(), PhiTy,
3904               TargetTransformInfo::ReductionFlags())) {
        auto *VecRdxPhi = cast<PHINode>(State.get(PhiR, Part));
3907         VecRdxPhi->setIncomingValueForBlock(VectorLoopLatch, Sel);
3908       }
3909     }
3910   }
3911 
3912   // If the vector reduction can be performed in a smaller type, we truncate
3913   // then extend the loop exit value to enable InstCombine to evaluate the
3914   // entire expression in the smaller type.
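  // For illustration (a shorthand sketch, assuming an i32 add reduction that
  // fits in i8 and VF = 4): in the vector latch the exiting value is rewritten
  // as
  //   %rdx.trunc = trunc <4 x i32> %rdx to <4 x i8>
  //   %rdx.extnd = zext <4 x i8> %rdx.trunc to <4 x i32>
  // and its users are redirected to %rdx.extnd, while the middle block below
  // reduces the re-truncated <4 x i8> value and finally extends the scalar
  // result back to i32.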
3915   if (VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
3916     assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
3917     Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
3918     Builder.SetInsertPoint(VectorLoopLatch->getTerminator());
3919     VectorParts RdxParts(UF);
3920     for (unsigned Part = 0; Part < UF; ++Part) {
3921       RdxParts[Part] = State.get(LoopExitInstDef, Part);
3922       Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3923       Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
3924                                         : Builder.CreateZExt(Trunc, VecTy);
3925       for (User *U : llvm::make_early_inc_range(RdxParts[Part]->users()))
3926         if (U != Trunc) {
3927           U->replaceUsesOfWith(RdxParts[Part], Extnd);
3928           RdxParts[Part] = Extnd;
3929         }
3930     }
3931     Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3932     for (unsigned Part = 0; Part < UF; ++Part) {
3933       RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3934       State.reset(LoopExitInstDef, RdxParts[Part], Part);
3935     }
3936   }
3937 
3938   // Reduce all of the unrolled parts into a single vector.
3939   Value *ReducedPartRdx = State.get(LoopExitInstDef, 0);
3940   unsigned Op = RecurrenceDescriptor::getOpcode(RK);
3941 
3942   // The middle block terminator has already been assigned a DebugLoc here (the
3943   // OrigLoop's single latch terminator). We want the whole middle block to
3944   // appear to execute on this line because: (a) it is all compiler generated,
3945   // (b) these instructions are always executed after evaluating the latch
3946   // conditional branch, and (c) other passes may add new predecessors which
3947   // terminate on this line. This is the easiest way to ensure we don't
3948   // accidentally cause an extra step back into the loop while debugging.
3949   State.setDebugLocFromInst(LoopMiddleBlock->getTerminator());
3950   if (PhiR->isOrdered())
3951     ReducedPartRdx = State.get(LoopExitInstDef, UF - 1);
3952   else {
3953     // Floating-point operations should have some FMF to enable the reduction.
3954     IRBuilderBase::FastMathFlagGuard FMFG(Builder);
3955     Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
3956     for (unsigned Part = 1; Part < UF; ++Part) {
3957       Value *RdxPart = State.get(LoopExitInstDef, Part);
3958       if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
3959         ReducedPartRdx = Builder.CreateBinOp(
3960             (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
3961       } else if (RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK))
3962         ReducedPartRdx = createSelectCmpOp(Builder, ReductionStartValue, RK,
3963                                            ReducedPartRdx, RdxPart);
3964       else
3965         ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
3966     }
3967   }
3968 
3969   // Create the reduction after the loop. Note that inloop reductions create the
3970   // target reduction in the loop using a Reduction recipe.
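  // For illustration (a shorthand sketch of the middle block for an integer
  // add reduction with VF = 4 and UF = 2):
  //   %bin.rdx = add <4 x i32> %rdx.part1, %rdx.part0
  //   %rdx = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %bin.rdx)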
3971   if (VF.isVector() && !PhiR->isInLoop()) {
3972     ReducedPartRdx =
3973         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, OrigPhi);
3974     // If the reduction can be performed in a smaller type, we need to extend
3975     // the reduction to the wider type before we branch to the original loop.
3976     if (PhiTy != RdxDesc.getRecurrenceType())
3977       ReducedPartRdx = RdxDesc.isSigned()
3978                            ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
3979                            : Builder.CreateZExt(ReducedPartRdx, PhiTy);
3980   }
3981 
3982   PHINode *ResumePhi =
3983       dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());
3984 
3985   // Create a phi node that merges control-flow from the backedge-taken check
3986   // block and the middle block.
3987   PHINode *BCBlockPhi = PHINode::Create(PhiTy, 2, "bc.merge.rdx",
3988                                         LoopScalarPreHeader->getTerminator());
3989 
3990   // If we are fixing reductions in the epilogue loop then we should already
3991   // have created a bc.merge.rdx Phi after the main vector body. Ensure that
3992   // we carry over the incoming values correctly.
3993   for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
3994     if (Incoming == LoopMiddleBlock)
3995       BCBlockPhi->addIncoming(ReducedPartRdx, Incoming);
3996     else if (ResumePhi && llvm::is_contained(ResumePhi->blocks(), Incoming))
3997       BCBlockPhi->addIncoming(ResumePhi->getIncomingValueForBlock(Incoming),
3998                               Incoming);
3999     else
4000       BCBlockPhi->addIncoming(ReductionStartValue, Incoming);
4001   }
4002 
4003   // Set the resume value for this reduction
4004   ReductionResumeValues.insert({&RdxDesc, BCBlockPhi});
4005 
4006   // If there were stores of the reduction value to a uniform memory address
4007   // inside the loop, create the final store here.
4008   if (StoreInst *SI = RdxDesc.IntermediateStore) {
4009     StoreInst *NewSI =
4010         Builder.CreateStore(ReducedPartRdx, SI->getPointerOperand());
4011     propagateMetadata(NewSI, SI);
4012 
    // If the reduction value is used in other places,
    // then let the code below create PHIs for that.
4015   }
4016 
4017   // Now, we need to fix the users of the reduction variable
4018   // inside and outside of the scalar remainder loop.
4019 
4020   // We know that the loop is in LCSSA form. We need to update the PHI nodes
4021   // in the exit blocks.  See comment on analogous loop in
  // fixFirstOrderRecurrence for a more complete explanation of the logic.
4023   if (!Cost->requiresScalarEpilogue(VF))
4024     for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4025       if (llvm::is_contained(LCSSAPhi.incoming_values(), LoopExitInst)) {
4026         LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4027         State.Plan->removeLiveOut(&LCSSAPhi);
4028       }
4029 
4030   // Fix the scalar loop reduction variable with the incoming reduction sum
4031   // from the vector body and from the backedge value.
4032   int IncomingEdgeBlockIdx =
4033       OrigPhi->getBasicBlockIndex(OrigLoop->getLoopLatch());
4034   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
4035   // Pick the other block.
4036   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
4037   OrigPhi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
4038   OrigPhi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
4039 }
4040 
4041 void InnerLoopVectorizer::clearReductionWrapFlags(VPReductionPHIRecipe *PhiR,
4042                                                   VPTransformState &State) {
4043   const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
4044   RecurKind RK = RdxDesc.getRecurrenceKind();
4045   if (RK != RecurKind::Add && RK != RecurKind::Mul)
4046     return;
4047 
4048   SmallVector<VPValue *, 8> Worklist;
4049   SmallPtrSet<VPValue *, 8> Visited;
4050   Worklist.push_back(PhiR);
4051   Visited.insert(PhiR);
4052 
4053   while (!Worklist.empty()) {
4054     VPValue *Cur = Worklist.pop_back_val();
4055     for (unsigned Part = 0; Part < UF; ++Part) {
4056       Value *V = State.get(Cur, Part);
4057       if (!isa<OverflowingBinaryOperator>(V))
4058         break;
4059       cast<Instruction>(V)->dropPoisonGeneratingFlags();
    }

    for (VPUser *U : Cur->users()) {
      auto *UserRecipe = dyn_cast<VPRecipeBase>(U);
      if (!UserRecipe)
        continue;
      for (VPValue *V : UserRecipe->definedValues())
        if (Visited.insert(V).second)
          Worklist.push_back(V);
    }
4070   }
4071 }
4072 
4073 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
4074   // The basic block and loop containing the predicated instruction.
4075   auto *PredBB = PredInst->getParent();
4076   auto *VectorLoop = LI->getLoopFor(PredBB);
4077 
4078   // Initialize a worklist with the operands of the predicated instruction.
4079   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
4080 
4081   // Holds instructions that we need to analyze again. An instruction may be
4082   // reanalyzed if we don't yet know if we can sink it or not.
4083   SmallVector<Instruction *, 8> InstsToReanalyze;
4084 
4085   // Returns true if a given use occurs in the predicated block. Phi nodes use
4086   // their operands in their corresponding predecessor blocks.
4087   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
4088     auto *I = cast<Instruction>(U.getUser());
4089     BasicBlock *BB = I->getParent();
4090     if (auto *Phi = dyn_cast<PHINode>(I))
4091       BB = Phi->getIncomingBlock(
4092           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
4093     return BB == PredBB;
4094   };
4095 
4096   // Iteratively sink the scalarized operands of the predicated instruction
  // into the block we created for it. When an instruction is sunk, its
  // operands are then added to the worklist. The algorithm ends when a full
  // pass over the worklist fails to sink any instruction.
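  // For illustration (a hypothetical sketch): if a scalarized address
  // computation
  //   %addr = getelementptr inbounds i32, ptr %base, i64 %idx
  // feeds only a store that already sits in the predicated block (e.g. a
  // pred.store.if block), the getelementptr is moved into that block too, so
  // it only executes when the predicate holds.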
4100   bool Changed;
4101   do {
4102     // Add the instructions that need to be reanalyzed to the worklist, and
4103     // reset the changed indicator.
4104     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
4105     InstsToReanalyze.clear();
4106     Changed = false;
4107 
4108     while (!Worklist.empty()) {
4109       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
4110 
4111       // We can't sink an instruction if it is a phi node, is not in the loop,
4112       // or may have side effects.
4113       if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
4114           I->mayHaveSideEffects())
4115         continue;
4116 
4117       // If the instruction is already in PredBB, check if we can sink its
4118       // operands. In that case, VPlan's sinkScalarOperands() succeeded in
4119       // sinking the scalar instruction I, hence it appears in PredBB; but it
4120       // may have failed to sink I's operands (recursively), which we try
4121       // (again) here.
4122       if (I->getParent() == PredBB) {
4123         Worklist.insert(I->op_begin(), I->op_end());
4124         continue;
4125       }
4126 
4127       // It's legal to sink the instruction if all its uses occur in the
4128       // predicated block. Otherwise, there's nothing to do yet, and we may
4129       // need to reanalyze the instruction.
4130       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
4131         InstsToReanalyze.push_back(I);
4132         continue;
4133       }
4134 
4135       // Move the instruction to the beginning of the predicated block, and add
      // its operands to the worklist.
4137       I->moveBefore(&*PredBB->getFirstInsertionPt());
4138       Worklist.insert(I->op_begin(), I->op_end());
4139 
4140       // The sinking may have enabled other instructions to be sunk, so we will
4141       // need to iterate.
4142       Changed = true;
4143     }
4144   } while (Changed);
4145 }
4146 
4147 void InnerLoopVectorizer::fixNonInductionPHIs(VPlan &Plan,
4148                                               VPTransformState &State) {
4149   auto Iter = depth_first(
4150       VPBlockRecursiveTraversalWrapper<VPBlockBase *>(Plan.getEntry()));
4151   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
4152     for (VPRecipeBase &P : VPBB->phis()) {
4153       VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(&P);
4154       if (!VPPhi)
4155         continue;
4156       PHINode *NewPhi = cast<PHINode>(State.get(VPPhi, 0));
4157       // Make sure the builder has a valid insert point.
4158       Builder.SetInsertPoint(NewPhi);
4159       for (unsigned i = 0; i < VPPhi->getNumOperands(); ++i) {
4160         VPValue *Inc = VPPhi->getIncomingValue(i);
4161         VPBasicBlock *VPBB = VPPhi->getIncomingBlock(i);
4162         NewPhi->addIncoming(State.get(Inc, 0), State.CFG.VPBB2IRBB[VPBB]);
4163       }
4164     }
4165   }
4166 }
4167 
4168 bool InnerLoopVectorizer::useOrderedReductions(
4169     const RecurrenceDescriptor &RdxDesc) {
4170   return Cost->useOrderedReductions(RdxDesc);
4171 }
4172 
4173 void InnerLoopVectorizer::widenCallInstruction(CallInst &CI, VPValue *Def,
4174                                                VPUser &ArgOperands,
4175                                                VPTransformState &State) {
4176   assert(!isa<DbgInfoIntrinsic>(CI) &&
4177          "DbgInfoIntrinsic should have been dropped during VPlan construction");
4178   State.setDebugLocFromInst(&CI);
4179 
4180   SmallVector<Type *, 4> Tys;
4181   for (Value *ArgOperand : CI.args())
4182     Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue()));
4183 
4184   Intrinsic::ID ID = getVectorIntrinsicIDForCall(&CI, TLI);
4185 
  // This flag indicates whether we use an intrinsic or a plain call for the
  // vectorized version of the instruction, i.e. whether calling the intrinsic
  // is more beneficial than calling a vector library function.
4189   bool NeedToScalarize = false;
4190   InstructionCost CallCost = Cost->getVectorCallCost(&CI, VF, NeedToScalarize);
4191   InstructionCost IntrinsicCost =
4192       ID ? Cost->getVectorIntrinsicCost(&CI, VF) : 0;
4193   bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
4194   assert((UseVectorIntrinsic || !NeedToScalarize) &&
4195          "Instruction should be scalarized elsewhere.");
4196   assert((IntrinsicCost.isValid() || CallCost.isValid()) &&
4197          "Either the intrinsic cost or vector call cost must be valid");
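  // For illustration: a scalar call such as
  //   %r = call float @llvm.sqrt.f32(float %x)
  // is widened either to the vector intrinsic @llvm.sqrt.v4f32 (when
  // UseVectorIntrinsic is true) or, otherwise, to a vector library function
  // that VFDatabase reports as available for this VF.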
4198 
4199   for (unsigned Part = 0; Part < UF; ++Part) {
4200     SmallVector<Type *, 2> TysForDecl = {CI.getType()};
4201     SmallVector<Value *, 4> Args;
4202     for (auto &I : enumerate(ArgOperands.operands())) {
4203       // Some intrinsics have a scalar argument - don't replace it with a
4204       // vector.
4205       Value *Arg;
4206       if (!UseVectorIntrinsic ||
4207           !isVectorIntrinsicWithScalarOpAtArg(ID, I.index()))
4208         Arg = State.get(I.value(), Part);
4209       else
4210         Arg = State.get(I.value(), VPIteration(0, 0));
4211       if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I.index()))
4212         TysForDecl.push_back(Arg->getType());
4213       Args.push_back(Arg);
4214     }
4215 
4216     Function *VectorF;
4217     if (UseVectorIntrinsic) {
4218       // Use vector version of the intrinsic.
4219       if (VF.isVector())
4220         TysForDecl[0] = VectorType::get(CI.getType()->getScalarType(), VF);
4221       Module *M = State.Builder.GetInsertBlock()->getModule();
4222       VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4223       assert(VectorF && "Can't retrieve vector intrinsic.");
4224     } else {
4225       // Use vector version of the function call.
4226       const VFShape Shape = VFShape::get(CI, VF, false /*HasGlobalPred*/);
4227 #ifndef NDEBUG
4228       assert(VFDatabase(CI).getVectorizedFunction(Shape) != nullptr &&
4229              "Can't create vector function.");
4230 #endif
4231       VectorF = VFDatabase(CI).getVectorizedFunction(Shape);
4232     }
    SmallVector<OperandBundleDef, 1> OpBundles;
    CI.getOperandBundlesAsDefs(OpBundles);
    CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);

    if (isa<FPMathOperator>(V))
      V->copyFastMathFlags(&CI);

    State.set(Def, V, Part);
    State.addMetadata(V, &CI);
4242   }
4243 }
4244 
4245 void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
4246   // We should not collect Scalars more than once per VF. Right now, this
4247   // function is called from collectUniformsAndScalars(), which already does
4248   // this check. Collecting Scalars for VF=1 does not make any sense.
4249   assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
4250          "This function should not be visited twice for the same VF");
4251 
4252   // This avoids any chances of creating a REPLICATE recipe during planning
4253   // since that would result in generation of scalarized code during execution,
4254   // which is not supported for scalable vectors.
4255   if (VF.isScalable()) {
4256     Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end());
4257     return;
4258   }
4259 
4260   SmallSetVector<Instruction *, 8> Worklist;
4261 
4262   // These sets are used to seed the analysis with pointers used by memory
4263   // accesses that will remain scalar.
4264   SmallSetVector<Instruction *, 8> ScalarPtrs;
4265   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4266   auto *Latch = TheLoop->getLoopLatch();
4267 
4268   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4269   // The pointer operands of loads and stores will be scalar as long as the
4270   // memory access is not a gather or scatter operation. The value operand of a
4271   // store will remain scalar if the store is scalarized.
4272   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4273     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4274     assert(WideningDecision != CM_Unknown &&
4275            "Widening decision should be ready at this moment");
4276     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4277       if (Ptr == Store->getValueOperand())
4278         return WideningDecision == CM_Scalarize;
4279     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
           "Ptr is neither a value nor a pointer operand");
4281     return WideningDecision != CM_GatherScatter;
4282   };
4283 
4284   // A helper that returns true if the given value is a bitcast or
4285   // getelementptr instruction contained in the loop.
4286   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4287     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4288             isa<GetElementPtrInst>(V)) &&
4289            !TheLoop->isLoopInvariant(V);
4290   };
4291 
4292   // A helper that evaluates a memory access's use of a pointer. If the use will
4293   // be a scalar use and the pointer is only used by memory accesses, we place
4294   // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
4295   // PossibleNonScalarPtrs.
4296   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4297     // We only care about bitcast and getelementptr instructions contained in
4298     // the loop.
4299     if (!isLoopVaryingBitCastOrGEP(Ptr))
4300       return;
4301 
4302     // If the pointer has already been identified as scalar (e.g., if it was
4303     // also identified as uniform), there's nothing to do.
4304     auto *I = cast<Instruction>(Ptr);
4305     if (Worklist.count(I))
4306       return;
4307 
4308     // If the use of the pointer will be a scalar use, and all users of the
4309     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4310     // place the pointer in PossibleNonScalarPtrs.
4311     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4312           return isa<LoadInst>(U) || isa<StoreInst>(U);
4313         }))
4314       ScalarPtrs.insert(I);
4315     else
4316       PossibleNonScalarPtrs.insert(I);
4317   };
4318 
  // We seed the scalars analysis with three classes of instructions: (1)
  // instructions marked uniform-after-vectorization, (2) bitcast and
  // getelementptr instructions used by memory accesses requiring a scalar use,
  // and (3) instructions explicitly forced to be scalar for this VF.
4323   //
4324   // (1) Add to the worklist all instructions that have been identified as
4325   // uniform-after-vectorization.
4326   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4327 
4328   // (2) Add to the worklist all bitcast and getelementptr instructions used by
4329   // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar as long as the memory access is not a gather or
4331   // scatter operation. The value operand of a store will remain scalar if the
4332   // store is scalarized.
4333   for (auto *BB : TheLoop->blocks())
4334     for (auto &I : *BB) {
4335       if (auto *Load = dyn_cast<LoadInst>(&I)) {
4336         evaluatePtrUse(Load, Load->getPointerOperand());
4337       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4338         evaluatePtrUse(Store, Store->getPointerOperand());
4339         evaluatePtrUse(Store, Store->getValueOperand());
4340       }
4341     }
4342   for (auto *I : ScalarPtrs)
4343     if (!PossibleNonScalarPtrs.count(I)) {
4344       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4345       Worklist.insert(I);
4346     }
4347 
4348   // Insert the forced scalars.
4349   // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector
4350   // induction variable when the PHI user is scalarized.
4351   auto ForcedScalar = ForcedScalars.find(VF);
4352   if (ForcedScalar != ForcedScalars.end())
4353     for (auto *I : ForcedScalar->second)
4354       Worklist.insert(I);
4355 
4356   // Expand the worklist by looking through any bitcasts and getelementptr
4357   // instructions we've already identified as scalar. This is similar to the
4358   // expansion step in collectLoopUniforms(); however, here we're only
4359   // expanding to include additional bitcasts and getelementptr instructions.
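  // For example (a hypothetical sketch): if a getelementptr feeding a
  // scalarized store is already in the worklist, and the bitcast it is based
  // on is used only by that getelementptr, the bitcast is added to the
  // worklist as well.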
4360   unsigned Idx = 0;
4361   while (Idx != Worklist.size()) {
4362     Instruction *Dst = Worklist[Idx++];
4363     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4364       continue;
4365     auto *Src = cast<Instruction>(Dst->getOperand(0));
4366     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4367           auto *J = cast<Instruction>(U);
4368           return !TheLoop->contains(J) || Worklist.count(J) ||
4369                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4370                   isScalarUse(J, Src));
4371         })) {
4372       Worklist.insert(Src);
4373       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4374     }
4375   }
4376 
4377   // An induction variable will remain scalar if all users of the induction
4378   // variable and induction variable update remain scalar.
4379   for (auto &Induction : Legal->getInductionVars()) {
4380     auto *Ind = Induction.first;
4381     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4382 
4383     // If tail-folding is applied, the primary induction variable will be used
4384     // to feed a vector compare.
4385     if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
4386       continue;
4387 
4388     // Returns true if \p Indvar is a pointer induction that is used directly by
4389     // load/store instruction \p I.
4390     auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
4391                                               Instruction *I) {
4392       return Induction.second.getKind() ==
4393                  InductionDescriptor::IK_PtrInduction &&
4394              (isa<LoadInst>(I) || isa<StoreInst>(I)) &&
4395              Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar);
4396     };
4397 
4398     // Determine if all users of the induction variable are scalar after
4399     // vectorization.
4400     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4401       auto *I = cast<Instruction>(U);
4402       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4403              IsDirectLoadStoreFromPtrIndvar(Ind, I);
4404     });
4405     if (!ScalarInd)
4406       continue;
4407 
4408     // Determine if all users of the induction variable update instruction are
4409     // scalar after vectorization.
4410     auto ScalarIndUpdate =
4411         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4412           auto *I = cast<Instruction>(U);
4413           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4414                  IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
4415         });
4416     if (!ScalarIndUpdate)
4417       continue;
4418 
4419     // The induction variable and its update instruction will remain scalar.
4420     Worklist.insert(Ind);
4421     Worklist.insert(IndUpdate);
4422     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4423     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4424                       << "\n");
4425   }
4426 
4427   Scalars[VF].insert(Worklist.begin(), Worklist.end());
4428 }
4429 
4430 bool LoopVectorizationCostModel::isScalarWithPredication(
4431     Instruction *I, ElementCount VF) const {
4432   if (!blockNeedsPredicationForAnyReason(I->getParent()))
4433     return false;
  switch (I->getOpcode()) {
4435   default:
4436     break;
4437   case Instruction::Load:
4438   case Instruction::Store: {
4439     if (!Legal->isMaskRequired(I))
4440       return false;
4441     auto *Ptr = getLoadStorePointerOperand(I);
4442     auto *Ty = getLoadStoreType(I);
4443     Type *VTy = Ty;
4444     if (VF.isVector())
4445       VTy = VectorType::get(Ty, VF);
4446     const Align Alignment = getLoadStoreAlignment(I);
4447     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
4448                                 TTI.isLegalMaskedGather(VTy, Alignment))
4449                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
4450                                 TTI.isLegalMaskedScatter(VTy, Alignment));
4451   }
4452   case Instruction::UDiv:
4453   case Instruction::SDiv:
4454   case Instruction::SRem:
4455   case Instruction::URem:
4456     // TODO: We can use the loop-preheader as context point here and get
    // context-sensitive reasoning.
4458     return !isSafeToSpeculativelyExecute(I);
4459   }
4460   return false;
4461 }
4462 
4463 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
4464     Instruction *I, ElementCount VF) {
4465   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4466   assert(getWideningDecision(I, VF) == CM_Unknown &&
4467          "Decision should not be set yet.");
4468   auto *Group = getInterleavedAccessGroup(I);
4469   assert(Group && "Must have a group.");
4470 
  // If the instruction's allocated size doesn't equal its type size, it
4472   // requires padding and will be scalarized.
4473   auto &DL = I->getModule()->getDataLayout();
4474   auto *ScalarTy = getLoadStoreType(I);
4475   if (hasIrregularType(ScalarTy, DL))
4476     return false;
4477 
4478   // If the group involves a non-integral pointer, we may not be able to
4479   // losslessly cast all values to a common type.
4480   unsigned InterleaveFactor = Group->getFactor();
4481   bool ScalarNI = DL.isNonIntegralPointerType(ScalarTy);
4482   for (unsigned i = 0; i < InterleaveFactor; i++) {
4483     Instruction *Member = Group->getMember(i);
4484     if (!Member)
4485       continue;
4486     auto *MemberTy = getLoadStoreType(Member);
4487     bool MemberNI = DL.isNonIntegralPointerType(MemberTy);
4488     // Don't coerce non-integral pointers to integers or vice versa.
4489     if (MemberNI != ScalarNI) {
4490       // TODO: Consider adding special nullptr value case here
4491       return false;
4492     } else if (MemberNI && ScalarNI &&
4493                ScalarTy->getPointerAddressSpace() !=
4494                MemberTy->getPointerAddressSpace()) {
4495       return false;
4496     }
4497   }
4498 
4499   // Check if masking is required.
4500   // A Group may need masking for one of two reasons: it resides in a block that
4501   // needs predication, or it was decided to use masking to deal with gaps
4502   // (either a gap at the end of a load-access that may result in a speculative
4503   // load, or any gaps in a store-access).
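  // For example (a hypothetical sketch): a load group with factor 3 whose
  // members only access A[3*i] and A[3*i + 1] has a gap; the wide load that
  // covers the group may touch A[3*i + 2] past the last original access, so it
  // needs either a scalar epilogue or masking to be safe.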
4504   bool PredicatedAccessRequiresMasking =
4505       blockNeedsPredicationForAnyReason(I->getParent()) &&
4506       Legal->isMaskRequired(I);
4507   bool LoadAccessWithGapsRequiresEpilogMasking =
4508       isa<LoadInst>(I) && Group->requiresScalarEpilogue() &&
4509       !isScalarEpilogueAllowed();
4510   bool StoreAccessWithGapsRequiresMasking =
4511       isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor());
4512   if (!PredicatedAccessRequiresMasking &&
4513       !LoadAccessWithGapsRequiresEpilogMasking &&
4514       !StoreAccessWithGapsRequiresMasking)
4515     return true;
4516 
4517   // If masked interleaving is required, we expect that the user/target had
4518   // enabled it, because otherwise it either wouldn't have been created or
4519   // it should have been invalidated by the CostModel.
4520   assert(useMaskedInterleavedAccesses(TTI) &&
4521          "Masked interleave-groups for predicated accesses are not enabled.");
4522 
4523   if (Group->isReverse())
4524     return false;
4525 
4526   auto *Ty = getLoadStoreType(I);
4527   const Align Alignment = getLoadStoreAlignment(I);
4528   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
4529                           : TTI.isLegalMaskedStore(Ty, Alignment);
4530 }
4531 
4532 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
4533     Instruction *I, ElementCount VF) {
4534   // Get and ensure we have a valid memory instruction.
4535   assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
4536 
4537   auto *Ptr = getLoadStorePointerOperand(I);
4538   auto *ScalarTy = getLoadStoreType(I);
4539 
4540   // In order to be widened, the pointer should be consecutive, first of all.
4541   if (!Legal->isConsecutivePtr(ScalarTy, Ptr))
4542     return false;
4543 
4544   // If the instruction is a store located in a predicated block, it will be
4545   // scalarized.
4546   if (isScalarWithPredication(I, VF))
4547     return false;
4548 
  // If the instruction's allocated size doesn't equal its type size, it
4550   // requires padding and will be scalarized.
4551   auto &DL = I->getModule()->getDataLayout();
4552   if (hasIrregularType(ScalarTy, DL))
4553     return false;
4554 
4555   return true;
4556 }
4557 
4558 void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
4559   // We should not collect Uniforms more than once per VF. Right now,
4560   // this function is called from collectUniformsAndScalars(), which
4561   // already does this check. Collecting Uniforms for VF=1 does not make any
4562   // sense.
4563 
4564   assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
4565          "This function should not be visited twice for the same VF");
4566 
  // Visit the list of Uniforms. If we do not find any uniform value, we will
  // not analyze it again; Uniforms.count(VF) will still return 1.
4569   Uniforms[VF].clear();
4570 
4571   // We now know that the loop is vectorizable!
4572   // Collect instructions inside the loop that will remain uniform after
4573   // vectorization.
4574 
4575   // Global values, params and instructions outside of current loop are out of
4576   // scope.
4577   auto isOutOfScope = [&](Value *V) -> bool {
4578     Instruction *I = dyn_cast<Instruction>(V);
4579     return (!I || !TheLoop->contains(I));
4580   };
4581 
4582   // Worklist containing uniform instructions demanding lane 0.
4583   SetVector<Instruction *> Worklist;
4584   BasicBlock *Latch = TheLoop->getLoopLatch();
4585 
4586   // Add uniform instructions demanding lane 0 to the worklist. Instructions
4587   // that are scalar with predication must not be considered uniform after
4588   // vectorization, because that would create an erroneous replicating region
4589   // where only a single instance out of VF should be formed.
4590   // TODO: optimize such seldom cases if found important, see PR40816.
4591   auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
4592     if (isOutOfScope(I)) {
4593       LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
4594                         << *I << "\n");
4595       return;
4596     }
4597     if (isScalarWithPredication(I, VF)) {
4598       LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
4599                         << *I << "\n");
4600       return;
4601     }
4602     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
4603     Worklist.insert(I);
4604   };
4605 
4606   // Start with the conditional branch. If the branch condition is an
4607   // instruction contained in the loop that is only used by the branch, it is
4608   // uniform.
4609   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4610   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
4611     addToWorklistIfAllowed(Cmp);
4612 
4613   auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
4614     InstWidening WideningDecision = getWideningDecision(I, VF);
4615     assert(WideningDecision != CM_Unknown &&
4616            "Widening decision should be ready at this moment");
4617 
4618     // A uniform memory op is itself uniform.  We exclude uniform stores
4619     // here as they demand the last lane, not the first one.
4620     if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) {
4621       assert(WideningDecision == CM_Scalarize);
4622       return true;
4623     }
4624 
4625     return (WideningDecision == CM_Widen ||
4626             WideningDecision == CM_Widen_Reverse ||
4627             WideningDecision == CM_Interleave);
4628   };
4629 
4630 
4631   // Returns true if Ptr is the pointer operand of a memory access instruction
4632   // I, and I is known to not require scalarization.
4633   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4634     return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
4635   };
4636 
4637   // Holds a list of values which are known to have at least one uniform use.
4638   // Note that there may be other uses which aren't uniform.  A "uniform use"
4639   // here is something which only demands lane 0 of the unrolled iterations;
  // it does not imply that all lanes produce the same value (i.e. this is not
  // the usual meaning of uniform).
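  // For example, the address of a consecutive, widened load has a uniform use:
  // only the lane-0 pointer is needed to form the wide load, even though the
  // lanes of the pointer would otherwise differ.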
4642   SetVector<Value *> HasUniformUse;
4643 
4644   // Scan the loop for instructions which are either a) known to have only
4645   // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
4646   for (auto *BB : TheLoop->blocks())
4647     for (auto &I : *BB) {
4648       if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I)) {
4649         switch (II->getIntrinsicID()) {
4650         case Intrinsic::sideeffect:
4651         case Intrinsic::experimental_noalias_scope_decl:
4652         case Intrinsic::assume:
4653         case Intrinsic::lifetime_start:
4654         case Intrinsic::lifetime_end:
4655           if (TheLoop->hasLoopInvariantOperands(&I))
4656             addToWorklistIfAllowed(&I);
4657           break;
4658         default:
4659           break;
4660         }
4661       }
4662 
4663       // ExtractValue instructions must be uniform, because the operands are
4664       // known to be loop-invariant.
4665       if (auto *EVI = dyn_cast<ExtractValueInst>(&I)) {
4666         assert(isOutOfScope(EVI->getAggregateOperand()) &&
4667                "Expected aggregate value to be loop invariant");
4668         addToWorklistIfAllowed(EVI);
4669         continue;
4670       }
4671 
4672       // If there's no pointer operand, there's nothing to do.
4673       auto *Ptr = getLoadStorePointerOperand(&I);
4674       if (!Ptr)
4675         continue;
4676 
4677       // A uniform memory op is itself uniform.  We exclude uniform stores
4678       // here as they demand the last lane, not the first one.
4679       if (isa<LoadInst>(I) && Legal->isUniformMemOp(I))
4680         addToWorklistIfAllowed(&I);
4681 
4682       if (isUniformDecision(&I, VF)) {
4683         assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check");
4684         HasUniformUse.insert(Ptr);
4685       }
4686     }
4687 
4688   // Add to the worklist any operands which have *only* uniform (e.g. lane 0
4689   // demanding) users.  Since loops are assumed to be in LCSSA form, this
4690   // disallows uses outside the loop as well.
4691   for (auto *V : HasUniformUse) {
4692     if (isOutOfScope(V))
4693       continue;
4694     auto *I = cast<Instruction>(V);
4695     auto UsersAreMemAccesses =
4696       llvm::all_of(I->users(), [&](User *U) -> bool {
4697         return isVectorizedMemAccessUse(cast<Instruction>(U), V);
4698       });
4699     if (UsersAreMemAccesses)
4700       addToWorklistIfAllowed(I);
4701   }
4702 
  // Expand the Worklist in topological order: whenever a new instruction is
  // added, its users should already be inside the Worklist. This ensures that
  // a uniform instruction will only be used by uniform instructions.
4706   unsigned idx = 0;
4707   while (idx != Worklist.size()) {
4708     Instruction *I = Worklist[idx++];
4709 
4710     for (auto OV : I->operand_values()) {
4711       // isOutOfScope operands cannot be uniform instructions.
4712       if (isOutOfScope(OV))
4713         continue;
      // First-order recurrence phis should typically be considered
      // non-uniform.
4716       auto *OP = dyn_cast<PHINode>(OV);
4717       if (OP && Legal->isFirstOrderRecurrence(OP))
4718         continue;
4719       // If all the users of the operand are uniform, then add the
4720       // operand into the uniform worklist.
4721       auto *OI = cast<Instruction>(OV);
4722       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
4723             auto *J = cast<Instruction>(U);
4724             return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
4725           }))
4726         addToWorklistIfAllowed(OI);
4727     }
4728   }
4729 
4730   // For an instruction to be added into Worklist above, all its users inside
4731   // the loop should also be in Worklist. However, this condition cannot be
4732   // true for phi nodes that form a cyclic dependence. We must process phi
4733   // nodes separately. An induction variable will remain uniform if all users
4734   // of the induction variable and induction variable update remain uniform.
4735   // The code below handles both pointer and non-pointer induction variables.
4736   for (auto &Induction : Legal->getInductionVars()) {
4737     auto *Ind = Induction.first;
4738     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4739 
4740     // Determine if all users of the induction variable are uniform after
4741     // vectorization.
4742     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4743       auto *I = cast<Instruction>(U);
4744       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4745              isVectorizedMemAccessUse(I, Ind);
4746     });
4747     if (!UniformInd)
4748       continue;
4749 
4750     // Determine if all users of the induction variable update instruction are
4751     // uniform after vectorization.
4752     auto UniformIndUpdate =
4753         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4754           auto *I = cast<Instruction>(U);
4755           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4756                  isVectorizedMemAccessUse(I, IndUpdate);
4757         });
4758     if (!UniformIndUpdate)
4759       continue;
4760 
4761     // The induction variable and its update instruction will remain uniform.
4762     addToWorklistIfAllowed(Ind);
4763     addToWorklistIfAllowed(IndUpdate);
4764   }
4765 
4766   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4767 }
4768 
4769 bool LoopVectorizationCostModel::runtimeChecksRequired() {
4770   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
4771 
4772   if (Legal->getRuntimePointerChecking()->Need) {
4773     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
4774         "runtime pointer checks needed. Enable vectorization of this "
4775         "loop with '#pragma clang loop vectorize(enable)' when "
4776         "compiling with -Os/-Oz",
4777         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4778     return true;
4779   }
4780 
4781   if (!PSE.getPredicate().isAlwaysTrue()) {
4782     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
4783         "runtime SCEV checks needed. Enable vectorization of this "
4784         "loop with '#pragma clang loop vectorize(enable)' when "
4785         "compiling with -Os/-Oz",
4786         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4787     return true;
4788   }
4789 
4790   // FIXME: Avoid specializing for stride==1 instead of bailing out.
4791   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
4792     reportVectorizationFailure("Runtime stride check for small trip count",
4793         "runtime stride == 1 checks needed. Enable vectorization of "
4794         "this loop without such check by compiling with -Os/-Oz",
4795         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4796     return true;
4797   }
4798 
4799   return false;
4800 }
4801 
4802 ElementCount
4803 LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
4804   if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
4805     return ElementCount::getScalable(0);
4806 
4807   if (Hints->isScalableVectorizationDisabled()) {
4808     reportVectorizationInfo("Scalable vectorization is explicitly disabled",
4809                             "ScalableVectorizationDisabled", ORE, TheLoop);
4810     return ElementCount::getScalable(0);
4811   }
4812 
4813   LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
4814 
4815   auto MaxScalableVF = ElementCount::getScalable(
4816       std::numeric_limits<ElementCount::ScalarTy>::max());
4817 
4818   // Test that the loop-vectorizer can legalize all operations for this MaxVF.
4819   // FIXME: While for scalable vectors this is currently sufficient, this should
4820   // be replaced by a more detailed mechanism that filters out specific VFs,
4821   // instead of invalidating vectorization for a whole set of VFs based on the
4822   // MaxVF.
4823 
4824   // Disable scalable vectorization if the loop contains unsupported reductions.
4825   if (!canVectorizeReductions(MaxScalableVF)) {
4826     reportVectorizationInfo(
4827         "Scalable vectorization not supported for the reduction "
4828         "operations found in this loop.",
4829         "ScalableVFUnfeasible", ORE, TheLoop);
4830     return ElementCount::getScalable(0);
4831   }
4832 
4833   // Disable scalable vectorization if the loop contains any instructions
4834   // with element types not supported for scalable vectors.
4835   if (any_of(ElementTypesInLoop, [&](Type *Ty) {
4836         return !Ty->isVoidTy() &&
4837                !this->TTI.isElementTypeLegalForScalableVector(Ty);
4838       })) {
4839     reportVectorizationInfo("Scalable vectorization is not supported "
4840                             "for all element types found in this loop.",
4841                             "ScalableVFUnfeasible", ORE, TheLoop);
4842     return ElementCount::getScalable(0);
4843   }
4844 
4845   if (Legal->isSafeForAnyVectorWidth())
4846     return MaxScalableVF;
4847 
4848   // Limit MaxScalableVF by the maximum safe dependence distance.
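  // For example, with MaxSafeElements == 32 and a maximum vscale of 4, the
  // result below is vscale x 8, i.e. at most 32 lanes at runtime.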
4849   Optional<unsigned> MaxVScale = TTI.getMaxVScale();
4850   if (!MaxVScale && TheFunction->hasFnAttribute(Attribute::VScaleRange))
4851     MaxVScale =
4852         TheFunction->getFnAttribute(Attribute::VScaleRange).getVScaleRangeMax();
4853   MaxScalableVF = ElementCount::getScalable(
4854       MaxVScale ? (MaxSafeElements / MaxVScale.value()) : 0);
4855   if (!MaxScalableVF)
4856     reportVectorizationInfo(
4857         "Max legal vector width too small, scalable vectorization "
4858         "unfeasible.",
4859         "ScalableVFUnfeasible", ORE, TheLoop);
4860 
4861   return MaxScalableVF;
4862 }
4863 
4864 FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
4865     unsigned ConstTripCount, ElementCount UserVF, bool FoldTailByMasking) {
4866   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
4867   unsigned SmallestType, WidestType;
4868   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
4869 
4870   // Get the maximum safe dependence distance in bits computed by LAA.
4871   // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
  // the memory access that is most restrictive (involved in the smallest
4873   // dependence distance).
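  // For example, a max safe vector width of 384 bits with a widest type of
  // 32 bits yields MaxSafeElements = PowerOf2Floor(384 / 32) = 8.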
4874   unsigned MaxSafeElements =
4875       PowerOf2Floor(Legal->getMaxSafeVectorWidthInBits() / WidestType);
4876 
4877   auto MaxSafeFixedVF = ElementCount::getFixed(MaxSafeElements);
4878   auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements);
4879 
4880   LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
4881                     << ".\n");
4882   LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
4883                     << ".\n");
4884 
4885   // First analyze the UserVF, fall back if the UserVF should be ignored.
4886   if (UserVF) {
4887     auto MaxSafeUserVF =
4888         UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;
4889 
4890     if (ElementCount::isKnownLE(UserVF, MaxSafeUserVF)) {
4891       // If `VF=vscale x N` is safe, then so is `VF=N`
4892       if (UserVF.isScalable())
4893         return FixedScalableVFPair(
4894             ElementCount::getFixed(UserVF.getKnownMinValue()), UserVF);
4895       else
4896         return UserVF;
4897     }
4898 
4899     assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));
4900 
4901     // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
4902     // is better to ignore the hint and let the compiler choose a suitable VF.
4903     if (!UserVF.isScalable()) {
4904       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4905                         << " is unsafe, clamping to max safe VF="
4906                         << MaxSafeFixedVF << ".\n");
4907       ORE->emit([&]() {
4908         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4909                                           TheLoop->getStartLoc(),
4910                                           TheLoop->getHeader())
4911                << "User-specified vectorization factor "
4912                << ore::NV("UserVectorizationFactor", UserVF)
4913                << " is unsafe, clamping to maximum safe vectorization factor "
4914                << ore::NV("VectorizationFactor", MaxSafeFixedVF);
4915       });
4916       return MaxSafeFixedVF;
4917     }
4918 
4919     if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
4920       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4921                         << " is ignored because scalable vectors are not "
4922                            "available.\n");
4923       ORE->emit([&]() {
4924         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4925                                           TheLoop->getStartLoc(),
4926                                           TheLoop->getHeader())
4927                << "User-specified vectorization factor "
4928                << ore::NV("UserVectorizationFactor", UserVF)
4929                << " is ignored because the target does not support scalable "
4930                   "vectors. The compiler will pick a more suitable value.";
4931       });
4932     } else {
4933       LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
4934                         << " is unsafe. Ignoring scalable UserVF.\n");
4935       ORE->emit([&]() {
4936         return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
4937                                           TheLoop->getStartLoc(),
4938                                           TheLoop->getHeader())
4939                << "User-specified vectorization factor "
4940                << ore::NV("UserVectorizationFactor", UserVF)
4941                << " is unsafe. Ignoring the hint to let the compiler pick a "
4942                   "more suitable value.";
4943       });
4944     }
4945   }
4946 
4947   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
4948                     << " / " << WidestType << " bits.\n");
4949 
4950   FixedScalableVFPair Result(ElementCount::getFixed(1),
4951                              ElementCount::getScalable(0));
4952   if (auto MaxVF =
4953           getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType,
4954                                   MaxSafeFixedVF, FoldTailByMasking))
4955     Result.FixedVF = MaxVF;
4956 
4957   if (auto MaxVF =
4958           getMaximizedVFForTarget(ConstTripCount, SmallestType, WidestType,
4959                                   MaxSafeScalableVF, FoldTailByMasking))
4960     if (MaxVF.isScalable()) {
4961       Result.ScalableVF = MaxVF;
4962       LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
4963                         << "\n");
4964     }
4965 
4966   return Result;
4967 }
4968 
4969 FixedScalableVFPair
4970 LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
4971   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
    // TODO: It may be useful to do this anyway, since the check is still
    // likely to be dynamically uniform if the target can skip it.
4974     reportVectorizationFailure(
4975         "Not inserting runtime ptr check for divergent target",
4976         "runtime pointer checks needed. Not enabled for divergent target",
4977         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
4978     return FixedScalableVFPair::getNone();
4979   }
4980 
4981   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4982   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
4983   if (TC == 1) {
4984     reportVectorizationFailure("Single iteration (non) loop",
4985         "loop trip count is one, irrelevant for vectorization",
4986         "SingleIterationLoop", ORE, TheLoop);
4987     return FixedScalableVFPair::getNone();
4988   }
4989 
4990   switch (ScalarEpilogueStatus) {
4991   case CM_ScalarEpilogueAllowed:
4992     return computeFeasibleMaxVF(TC, UserVF, false);
4993   case CM_ScalarEpilogueNotAllowedUsePredicate:
4994     LLVM_FALLTHROUGH;
4995   case CM_ScalarEpilogueNotNeededUsePredicate:
4996     LLVM_DEBUG(
4997         dbgs() << "LV: vector predicate hint/switch found.\n"
4998                << "LV: Not allowing scalar epilogue, creating predicated "
4999                << "vector loop.\n");
5000     break;
5001   case CM_ScalarEpilogueNotAllowedLowTripLoop:
5002     // fallthrough as a special case of OptForSize
5003   case CM_ScalarEpilogueNotAllowedOptSize:
5004     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
5005       LLVM_DEBUG(
5006           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
5007     else
5008       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
5009                         << "count.\n");
5010 
5011     // Bail if runtime checks are required, which are not good when optimising
5012     // for size.
5013     if (runtimeChecksRequired())
5014       return FixedScalableVFPair::getNone();
5015 
5016     break;
5017   }
5018 
  // The only loops we can vectorize without a scalar epilogue are loops with
5020   // a bottom-test and a single exiting block. We'd have to handle the fact
5021   // that not every instruction executes on the last iteration.  This will
5022   // require a lane mask which varies through the vector loop body.  (TODO)
5023   if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
5024     // If there was a tail-folding hint/switch, but we can't fold the tail by
5025     // masking, fallback to a vectorization with a scalar epilogue.
5026     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5027       LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5028                            "scalar epilogue instead.\n");
5029       ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5030       return computeFeasibleMaxVF(TC, UserVF, false);
5031     }
5032     return FixedScalableVFPair::getNone();
5033   }
5034 
  // Now try to fold the tail by masking.
5036 
5037   // Invalidate interleave groups that require an epilogue if we can't mask
5038   // the interleave-group.
5039   if (!useMaskedInterleavedAccesses(TTI)) {
5040     assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
5041            "No decisions should have been taken at this point");
    // Note: There is no need to invalidate any cost modeling decisions here,
    // as none were taken so far.
5044     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
5045   }
5046 
5047   FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(TC, UserVF, true);
5048   // Avoid tail folding if the trip count is known to be a multiple of any VF
5049   // we chose.
5050   // FIXME: The condition below pessimises the case for fixed-width vectors,
5051   // when scalable VFs are also candidates for vectorization.
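  // For example, if TC == 64, MaxFixedVF == 8 and UserIC == 2, then
  // 64 % (8 * 2) == 0, so no tail remains and tail folding is unnecessary.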
5052   if (MaxFactors.FixedVF.isVector() && !MaxFactors.ScalableVF) {
5053     ElementCount MaxFixedVF = MaxFactors.FixedVF;
5054     assert((UserVF.isNonZero() || isPowerOf2_32(MaxFixedVF.getFixedValue())) &&
5055            "MaxFixedVF must be a power of 2");
5056     unsigned MaxVFtimesIC = UserIC ? MaxFixedVF.getFixedValue() * UserIC
5057                                    : MaxFixedVF.getFixedValue();
5058     ScalarEvolution *SE = PSE.getSE();
5059     const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
5060     const SCEV *ExitCount = SE->getAddExpr(
5061         BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
5062     const SCEV *Rem = SE->getURemExpr(
5063         SE->applyLoopGuards(ExitCount, TheLoop),
5064         SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
5065     if (Rem->isZero()) {
5066       // Accept MaxFixedVF if we do not have a tail.
5067       LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
5068       return MaxFactors;
5069     }
5070   }
5071 
5072   // If we don't know the precise trip count, or if the trip count that we
5073   // found modulo the vectorization factor is not zero, try to fold the tail
5074   // by masking.
5075   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
5076   if (Legal->prepareToFoldTailByMasking()) {
5077     FoldTailByMasking = true;
5078     return MaxFactors;
5079   }
5080 
5081   // If there was a tail-folding hint/switch, but we can't fold the tail by
5082   // masking, fallback to a vectorization with a scalar epilogue.
5083   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
5084     LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
5085                          "scalar epilogue instead.\n");
5086     ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
5087     return MaxFactors;
5088   }
5089 
5090   if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
5091     LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
5092     return FixedScalableVFPair::getNone();
5093   }
5094 
5095   if (TC == 0) {
5096     reportVectorizationFailure(
5097         "Unable to calculate the loop count due to complex control flow",
5098         "unable to calculate the loop count due to complex control flow",
5099         "UnknownLoopCountComplexCFG", ORE, TheLoop);
5100     return FixedScalableVFPair::getNone();
5101   }
5102 
5103   reportVectorizationFailure(
5104       "Cannot optimize for size and vectorize at the same time.",
5105       "cannot optimize for size and vectorize at the same time. "
5106       "Enable vectorization of this loop with '#pragma clang loop "
5107       "vectorize(enable)' when compiling with -Os/-Oz",
5108       "NoTailLoopWithOptForSize", ORE, TheLoop);
5109   return FixedScalableVFPair::getNone();
5110 }
5111 
5112 ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
5113     unsigned ConstTripCount, unsigned SmallestType, unsigned WidestType,
5114     ElementCount MaxSafeVF, bool FoldTailByMasking) {
5115   bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
5116   TypeSize WidestRegister = TTI.getRegisterBitWidth(
5117       ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
5118                            : TargetTransformInfo::RGK_FixedWidthVector);
5119 
5120   // Convenience function to return the minimum of two ElementCounts.
5121   auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
5122     assert((LHS.isScalable() == RHS.isScalable()) &&
5123            "Scalable flags must match");
5124     return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
5125   };
5126 
5127   // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
  // Note that both WidestRegister and WidestType may not be powers of 2.
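  // For example, a 128-bit widest register with a 32-bit widest type gives
  // PowerOf2Floor(128 / 32) = 4 lanes (vscale x 4 for the scalable case).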
5129   auto MaxVectorElementCount = ElementCount::get(
5130       PowerOf2Floor(WidestRegister.getKnownMinSize() / WidestType),
5131       ComputeScalableMaxVF);
5132   MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
5133   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
5134                     << (MaxVectorElementCount * WidestType) << " bits.\n");
5135 
5136   if (!MaxVectorElementCount) {
5137     LLVM_DEBUG(dbgs() << "LV: The target has no "
5138                       << (ComputeScalableMaxVF ? "scalable" : "fixed")
5139                       << " vector registers.\n");
5140     return ElementCount::getFixed(1);
5141   }
5142 
5143   const auto TripCountEC = ElementCount::getFixed(ConstTripCount);
5144   if (ConstTripCount &&
5145       ElementCount::isKnownLE(TripCountEC, MaxVectorElementCount) &&
5146       (!FoldTailByMasking || isPowerOf2_32(ConstTripCount))) {
    // If the loop trip count (TC) is known at compile time, there is no point
    // in choosing a VF greater than TC (as done in the loop below). Select the
    // maximum power of two which doesn't exceed TC.
5150     // If MaxVectorElementCount is scalable, we only fall back on a fixed VF
5151     // when the TC is less than or equal to the known number of lanes.
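    // For example, with ConstTripCount == 6 and MaxVectorElementCount == 8,
    // the VF is clamped to PowerOf2Floor(6) == 4.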
5152     auto ClampedConstTripCount = PowerOf2Floor(ConstTripCount);
5153     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
5154                          "exceeding the constant trip count: "
5155                       << ClampedConstTripCount << "\n");
5156     return ElementCount::getFixed(ClampedConstTripCount);
5157   }
5158 
5159   TargetTransformInfo::RegisterKind RegKind =
5160       ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
5161                            : TargetTransformInfo::RGK_FixedWidthVector;
5162   ElementCount MaxVF = MaxVectorElementCount;
5163   if (MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 &&
5164                             TTI.shouldMaximizeVectorBandwidth(RegKind))) {
5165     auto MaxVectorElementCountMaxBW = ElementCount::get(
5166         PowerOf2Floor(WidestRegister.getKnownMinSize() / SmallestType),
5167         ComputeScalableMaxVF);
5168     MaxVectorElementCountMaxBW = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
5169 
5170     // Collect all viable vectorization factors larger than the default MaxVF
5171     // (i.e. MaxVectorElementCount).
5172     SmallVector<ElementCount, 8> VFs;
5173     for (ElementCount VS = MaxVectorElementCount * 2;
5174          ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2)
5175       VFs.push_back(VS);
5176 
5177     // For each VF calculate its register usage.
5178     auto RUs = calculateRegisterUsage(VFs);
5179 
5180     // Select the largest VF which doesn't require more registers than existing
5181     // ones.
5182     for (int i = RUs.size() - 1; i >= 0; --i) {
5183       bool Selected = true;
5184       for (auto &pair : RUs[i].MaxLocalUsers) {
5185         unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5186         if (pair.second > TargetNumRegisters)
5187           Selected = false;
5188       }
5189       if (Selected) {
5190         MaxVF = VFs[i];
5191         break;
5192       }
5193     }
5194     if (ElementCount MinVF =
5195             TTI.getMinimumVF(SmallestType, ComputeScalableMaxVF)) {
5196       if (ElementCount::isKnownLT(MaxVF, MinVF)) {
5197         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
5198                           << ") with target's minimum: " << MinVF << '\n');
5199         MaxVF = MinVF;
5200       }
5201     }
5202 
5203     // Invalidate any widening decisions we might have made, in case the loop
5204     // requires prediction (decided later), but we have already made some
5205     // load/store widening decisions.
5206     invalidateCostModelingDecisions();
5207   }
5208   return MaxVF;
5209 }
5210 
5211 Optional<unsigned> LoopVectorizationCostModel::getVScaleForTuning() const {
5212   if (TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
5213     auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
5214     auto Min = Attr.getVScaleRangeMin();
5215     auto Max = Attr.getVScaleRangeMax();
5216     if (Max && Min == Max)
5217       return Max;
5218   }
5219 
5220   return TTI.getVScaleForTuning();
5221 }
5222 
5223 bool LoopVectorizationCostModel::isMoreProfitable(
5224     const VectorizationFactor &A, const VectorizationFactor &B) const {
5225   InstructionCost CostA = A.Cost;
5226   InstructionCost CostB = B.Cost;
5227 
5228   unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop);
5229 
5230   if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking &&
5231       MaxTripCount) {
5232     // If we are folding the tail and the trip count is a known (possibly small)
5233     // constant, the trip count will be rounded up to an integer number of
5234     // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF),
5235     // which we compare directly. When not folding the tail, the total cost will
5236     // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is
    // approximated with the per-lane cost below instead of using the trip
    // count as here.
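    // For example, with MaxTripCount == 10, a VF=4 plan costing 8 per
    // iteration needs ceil(10/4) = 3 iterations (total 24), while a VF=8 plan
    // costing 14 needs ceil(10/8) = 2 iterations (total 28), so VF=4 wins.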
5239     auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue());
5240     auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue());
5241     return RTCostA < RTCostB;
5242   }
5243 
5244   // Improve estimate for the vector width if it is scalable.
5245   unsigned EstimatedWidthA = A.Width.getKnownMinValue();
5246   unsigned EstimatedWidthB = B.Width.getKnownMinValue();
5247   if (Optional<unsigned> VScale = getVScaleForTuning()) {
5248     if (A.Width.isScalable())
5249       EstimatedWidthA *= VScale.value();
5250     if (B.Width.isScalable())
5251       EstimatedWidthB *= VScale.value();
5252   }
5253 
5254   // Assume vscale may be larger than 1 (or the value being tuned for),
5255   // so that scalable vectorization is slightly favorable over fixed-width
5256   // vectorization.
5257   if (A.Width.isScalable() && !B.Width.isScalable())
5258     return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA);
5259 
5260   // To avoid the need for FP division:
5261   //      (CostA / A.Width) < (CostB / B.Width)
5262   // <=>  (CostA * B.Width) < (CostB * A.Width)
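  // For example, CostA == 10 at estimated width 8 vs. CostB == 6 at width 4
  // compares 10 * 4 = 40 against 6 * 8 = 48; 40 < 48, so A (1.25 per lane)
  // is more profitable than B (1.5 per lane).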
5263   return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA);
5264 }
5265 
5266 VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
5267     const ElementCountSet &VFCandidates) {
5268   InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first;
5269   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
5270   assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
5271   assert(VFCandidates.count(ElementCount::getFixed(1)) &&
5272          "Expected Scalar VF to be a candidate");
5273 
5274   const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost,
5275                                        ExpectedCost);
5276   VectorizationFactor ChosenFactor = ScalarCost;
5277 
5278   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
5279   if (ForceVectorization && VFCandidates.size() > 1) {
5280     // Ignore scalar width, because the user explicitly wants vectorization.
5281     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5282     // evaluation.
5283     ChosenFactor.Cost = InstructionCost::getMax();
5284   }
5285 
5286   SmallVector<InstructionVFPair> InvalidCosts;
5287   for (const auto &i : VFCandidates) {
5288     // The cost for scalar VF=1 is already calculated, so ignore it.
5289     if (i.isScalar())
5290       continue;
5291 
5292     VectorizationCostTy C = expectedCost(i, &InvalidCosts);
5293     VectorizationFactor Candidate(i, C.first, ScalarCost.ScalarCost);
5294 
5295 #ifndef NDEBUG
5296     unsigned AssumedMinimumVscale = 1;
5297     if (Optional<unsigned> VScale = getVScaleForTuning())
5298       AssumedMinimumVscale = *VScale;
5299     unsigned Width =
5300         Candidate.Width.isScalable()
5301             ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
5302             : Candidate.Width.getFixedValue();
5303     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5304                       << " costs: " << (Candidate.Cost / Width));
5305     if (i.isScalable())
5306       LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
5307                         << AssumedMinimumVscale << ")");
5308     LLVM_DEBUG(dbgs() << ".\n");
5309 #endif
5310 
5311     if (!C.second && !ForceVectorization) {
5312       LLVM_DEBUG(
5313           dbgs() << "LV: Not considering vector loop of width " << i
5314                  << " because it will not generate any vector instructions.\n");
5315       continue;
5316     }
5317 
    // If profitable, add it to the ProfitableVFs list.
5319     if (isMoreProfitable(Candidate, ScalarCost))
5320       ProfitableVFs.push_back(Candidate);
5321 
5322     if (isMoreProfitable(Candidate, ChosenFactor))
5323       ChosenFactor = Candidate;
5324   }
5325 
5326   // Emit a report of VFs with invalid costs in the loop.
5327   if (!InvalidCosts.empty()) {
5328     // Group the remarks per instruction, keeping the instruction order from
5329     // InvalidCosts.
5330     std::map<Instruction *, unsigned> Numbering;
5331     unsigned I = 0;
5332     for (auto &Pair : InvalidCosts)
5333       if (!Numbering.count(Pair.first))
5334         Numbering[Pair.first] = I++;
5335 
5336     // Sort the list, first on instruction(number) then on VF.
5337     llvm::sort(InvalidCosts,
5338                [&Numbering](InstructionVFPair &A, InstructionVFPair &B) {
5339                  if (Numbering[A.first] != Numbering[B.first])
5340                    return Numbering[A.first] < Numbering[B.first];
5341                  ElementCountComparator ECC;
5342                  return ECC(A.second, B.second);
5343                });
5344 
5345     // For a list of ordered instruction-vf pairs:
5346     //   [(load, vf1), (load, vf2), (store, vf1)]
5347     // Group the instructions together to emit separate remarks for:
5348     //   load  (vf1, vf2)
5349     //   store (vf1)
5350     auto Tail = ArrayRef<InstructionVFPair>(InvalidCosts);
5351     auto Subset = ArrayRef<InstructionVFPair>();
5352     do {
5353       if (Subset.empty())
5354         Subset = Tail.take_front(1);
5355 
5356       Instruction *I = Subset.front().first;
5357 
5358       // If the next instruction is different, or if there are no other pairs,
5359       // emit a remark for the collated subset. e.g.
      //   [(load, vf1), (load, vf2)]
      // to emit:
      //  remark: invalid costs for 'load' at VF=(vf1, vf2)
5363       if (Subset == Tail || Tail[Subset.size()].first != I) {
5364         std::string OutString;
5365         raw_string_ostream OS(OutString);
5366         assert(!Subset.empty() && "Unexpected empty range");
5367         OS << "Instruction with invalid costs prevented vectorization at VF=(";
5368         for (auto &Pair : Subset)
5369           OS << (Pair.second == Subset.front().second ? "" : ", ")
5370              << Pair.second;
5371         OS << "):";
5372         if (auto *CI = dyn_cast<CallInst>(I))
5373           OS << " call to " << CI->getCalledFunction()->getName();
5374         else
5375           OS << " " << I->getOpcodeName();
5376         OS.flush();
5377         reportVectorizationInfo(OutString, "InvalidCost", ORE, TheLoop, I);
5378         Tail = Tail.drop_front(Subset.size());
5379         Subset = {};
5380       } else
5381         // Grow the subset by one element
5382         Subset = Tail.take_front(Subset.size() + 1);
5383     } while (!Tail.empty());
5384   }
5385 
5386   if (!EnableCondStoresVectorization && NumPredStores) {
5387     reportVectorizationFailure("There are conditional stores.",
5388         "store that is conditionally executed prevents vectorization",
5389         "ConditionalStore", ORE, TheLoop);
5390     ChosenFactor = ScalarCost;
5391   }
5392 
5393   LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
5394                  !isMoreProfitable(ChosenFactor, ScalarCost)) dbgs()
5395              << "LV: Vectorization seems to be not beneficial, "
5396              << "but was forced by a user.\n");
5397   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n");
5398   return ChosenFactor;
5399 }
5400 
5401 bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization(
5402     const Loop &L, ElementCount VF) const {
5403   // Cross iteration phis such as reductions need special handling and are
5404   // currently unsupported.
5405   if (any_of(L.getHeader()->phis(),
5406              [&](PHINode &Phi) { return Legal->isFirstOrderRecurrence(&Phi); }))
5407     return false;
5408 
5409   // Phis with uses outside of the loop require special handling and are
5410   // currently unsupported.
5411   for (auto &Entry : Legal->getInductionVars()) {
5412     // Look for uses of the value of the induction at the last iteration.
5413     Value *PostInc = Entry.first->getIncomingValueForBlock(L.getLoopLatch());
5414     for (User *U : PostInc->users())
5415       if (!L.contains(cast<Instruction>(U)))
5416         return false;
5417     // Look for uses of penultimate value of the induction.
5418     for (User *U : Entry.first->users())
5419       if (!L.contains(cast<Instruction>(U)))
5420         return false;
5421   }
5422 
5423   // Induction variables that are widened require special handling that is
5424   // currently not supported.
5425   if (any_of(Legal->getInductionVars(), [&](auto &Entry) {
5426         return !(this->isScalarAfterVectorization(Entry.first, VF) ||
5427                  this->isProfitableToScalarize(Entry.first, VF));
5428       }))
5429     return false;
5430 
  // Epilogue vectorization code has not been audited to ensure it handles
  // non-latch exits properly. It may be fine, but it needs to be audited and
  // tested.
5434   if (L.getExitingBlock() != L.getLoopLatch())
5435     return false;
5436 
5437   return true;
5438 }
5439 
5440 bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
5441     const ElementCount VF) const {
5442   // FIXME: We need a much better cost-model to take different parameters such
5443   // as register pressure, code size increase and cost of extra branches into
5444   // account. For now we apply a very crude heuristic and only consider loops
5445   // with vectorization factors larger than a certain value.
5446   // We also consider epilogue vectorization unprofitable for targets that don't
  // consider interleaving beneficial (e.g. MVE).
5448   if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1)
5449     return false;
5450   // FIXME: We should consider changing the threshold for scalable
5451   // vectors to take VScaleForTuning into account.
5452   if (VF.getKnownMinValue() >= EpilogueVectorizationMinVF)
5453     return true;
5454   return false;
5455 }
5456 
5457 VectorizationFactor
5458 LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
5459     const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) {
5460   VectorizationFactor Result = VectorizationFactor::Disabled();
5461   if (!EnableEpilogueVectorization) {
5462     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";);
5463     return Result;
5464   }
5465 
5466   if (!isScalarEpilogueAllowed()) {
5467     LLVM_DEBUG(
5468         dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is "
5469                   "allowed.\n";);
5470     return Result;
5471   }
5472 
5473   // Not really a cost consideration, but check for unsupported cases here to
5474   // simplify the logic.
5475   if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) {
5476     LLVM_DEBUG(
5477         dbgs() << "LEV: Unable to vectorize epilogue because the loop is "
5478                   "not a supported candidate.\n";);
5479     return Result;
5480   }
5481 
5482   if (EpilogueVectorizationForceVF > 1) {
5483     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";);
5484     ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF);
5485     if (LVP.hasPlanWithVF(ForcedEC))
5486       return {ForcedEC, 0, 0};
5487     else {
5488       LLVM_DEBUG(
5489           dbgs()
5490               << "LEV: Epilogue vectorization forced factor is not viable.\n";);
5491       return Result;
5492     }
5493   }
5494 
5495   if (TheLoop->getHeader()->getParent()->hasOptSize() ||
5496       TheLoop->getHeader()->getParent()->hasMinSize()) {
5497     LLVM_DEBUG(
5498         dbgs()
5499             << "LEV: Epilogue vectorization skipped due to opt for size.\n";);
5500     return Result;
5501   }
5502 
5503   if (!isEpilogueVectorizationProfitable(MainLoopVF)) {
5504     LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
5505                          "this loop\n");
5506     return Result;
5507   }
5508 
5509   // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
5510   // the main loop handles 8 lanes per iteration. We could still benefit from
5511   // vectorizing the epilogue loop with VF=4.
5512   ElementCount EstimatedRuntimeVF = MainLoopVF;
5513   if (MainLoopVF.isScalable()) {
5514     EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue());
5515     if (Optional<unsigned> VScale = getVScaleForTuning())
5516       EstimatedRuntimeVF *= *VScale;
5517   }
5518 
5519   for (auto &NextVF : ProfitableVFs)
5520     if (((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
5521           ElementCount::isKnownLT(NextVF.Width, EstimatedRuntimeVF)) ||
5522          ElementCount::isKnownLT(NextVF.Width, MainLoopVF)) &&
5523         (Result.Width.isScalar() || isMoreProfitable(NextVF, Result)) &&
5524         LVP.hasPlanWithVF(NextVF.Width))
5525       Result = NextVF;
5526 
5527   if (Result != VectorizationFactor::Disabled())
5528     LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
5529                       << Result.Width << "\n";);
5530   return Result;
5531 }
5532 
5533 std::pair<unsigned, unsigned>
5534 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5535   unsigned MinWidth = -1U;
5536   unsigned MaxWidth = 8;
5537   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5538   // For in-loop reductions, no element types are added to ElementTypesInLoop
5539   // if there are no loads/stores in the loop. In this case, check through the
5540   // reduction variables to determine the maximum width.
5541   if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
5542     // Reset MaxWidth so that we can find the smallest type used by recurrences
5543     // in the loop.
5544     MaxWidth = -1U;
5545     for (auto &PhiDescriptorPair : Legal->getReductionVars()) {
5546       const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
5547       // When finding the min width used by the recurrence we need to account
5548       // for casts on the input operands of the recurrence.
5549       MaxWidth = std::min<unsigned>(
5550           MaxWidth, std::min<unsigned>(
5551                         RdxDesc.getMinWidthCastToRecurrenceTypeInBits(),
5552                         RdxDesc.getRecurrenceType()->getScalarSizeInBits()));
5553     }
5554   } else {
5555     for (Type *T : ElementTypesInLoop) {
5556       MinWidth = std::min<unsigned>(
5557           MinWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
5558       MaxWidth = std::max<unsigned>(
5559           MaxWidth, DL.getTypeSizeInBits(T->getScalarType()).getFixedSize());
5560     }
5561   }
5562   return {MinWidth, MaxWidth};
5563 }
5564 
5565 void LoopVectorizationCostModel::collectElementTypesForWidening() {
5566   ElementTypesInLoop.clear();
5567   // For each block.
5568   for (BasicBlock *BB : TheLoop->blocks()) {
5569     // For each instruction in the loop.
5570     for (Instruction &I : BB->instructionsWithoutDebug()) {
5571       Type *T = I.getType();
5572 
5573       // Skip ignored values.
5574       if (ValuesToIgnore.count(&I))
5575         continue;
5576 
5577       // Only examine Loads, Stores and PHINodes.
5578       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5579         continue;
5580 
5581       // Examine PHI nodes that are reduction variables. Update the type to
5582       // account for the recurrence type.
5583       if (auto *PN = dyn_cast<PHINode>(&I)) {
5584         if (!Legal->isReductionVariable(PN))
5585           continue;
5586         const RecurrenceDescriptor &RdxDesc =
5587             Legal->getReductionVars().find(PN)->second;
5588         if (PreferInLoopReductions || useOrderedReductions(RdxDesc) ||
5589             TTI.preferInLoopReduction(RdxDesc.getOpcode(),
5590                                       RdxDesc.getRecurrenceType(),
5591                                       TargetTransformInfo::ReductionFlags()))
5592           continue;
5593         T = RdxDesc.getRecurrenceType();
5594       }
5595 
5596       // Examine the stored values.
5597       if (auto *ST = dyn_cast<StoreInst>(&I))
5598         T = ST->getValueOperand()->getType();
5599 
5600       assert(T->isSized() &&
5601              "Expected the load/store/recurrence type to be sized");
5602 
5603       ElementTypesInLoop.insert(T);
5604     }
5605   }
5606 }
5607 
5608 unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
5609                                                            unsigned LoopCost) {
5610   // -- The interleave heuristics --
5611   // We interleave the loop in order to expose ILP and reduce the loop overhead.
5612   // There are many micro-architectural considerations that we can't predict
5613   // at this level. For example, frontend pressure (on decode or fetch) due to
5614   // code size, or the number and capabilities of the execution ports.
5615   //
5616   // We use the following heuristics to select the interleave count:
5617   // 1. If the code has reductions, then we interleave to break the cross
5618   // iteration dependency.
5619   // 2. If the loop is really small, then we interleave to reduce the loop
5620   // overhead.
5621   // 3. We don't interleave if we think that we will spill registers to memory
5622   // due to the increased register pressure.
5623 
5624   if (!isScalarEpilogueAllowed())
5625     return 1;
5626 
  // The maximum safe dependence distance has already been used to limit the
  // vectorization factor; do not interleave such loops.
5628   if (Legal->getMaxSafeDepDistBytes() != -1U)
5629     return 1;
5630 
5631   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
5632   const bool HasReductions = !Legal->getReductionVars().empty();
5633   // Do not interleave loops with a relatively small known or estimated trip
5634   // count. But we will interleave when InterleaveSmallLoopScalarReduction is
  // enabled, and the code has scalar reductions (HasReductions && VF == 1),
5636   // because with the above conditions interleaving can expose ILP and break
5637   // cross iteration dependences for reductions.
5638   if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
5639       !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
5640     return 1;
5641 
5642   // If we did not calculate the cost for VF (because the user selected the VF)
5643   // then we calculate the cost of VF here.
5644   if (LoopCost == 0) {
5645     InstructionCost C = expectedCost(VF).first;
5646     assert(C.isValid() && "Expected to have chosen a VF with valid cost");
5647     LoopCost = *C.getValue();
5648 
5649     // Loop body is free and there is no need for interleaving.
5650     if (LoopCost == 0)
5651       return 1;
5652   }
5653 
5654   RegisterUsage R = calculateRegisterUsage({VF})[0];
  // We divide by these values below, so assume that we have at least one
  // instruction that uses at least one register.
5657   for (auto& pair : R.MaxLocalUsers) {
5658     pair.second = std::max(pair.second, 1U);
5659   }
5660 
5661   // We calculate the interleave count using the following formula.
5662   // Subtract the number of loop invariants from the number of available
5663   // registers. These registers are used by all of the interleaved instances.
5664   // Next, divide the remaining registers by the number of registers that is
5665   // required by the loop, in order to estimate how many parallel instances
5666   // fit without causing spills. All of this is rounded down if necessary to be
5667   // a power of two. We want power of two interleave count to simplify any
5668   // addressing operations or alignment considerations.
5669   // We also want power of two interleave counts to ensure that the induction
5670   // variable of the vector loop wraps to zero, when tail is folded by masking;
  // this currently happens when optimizing for size, in which case IC is set
  // to 1 above.
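  // For example, with 32 registers in a class, 2 of them holding
  // loop-invariant values and a maximum local usage of 5, the base estimate is
  // PowerOf2Floor((32 - 2) / 5) = 4 (the induction-variable heuristic below
  // adjusts this slightly).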
5672   unsigned IC = UINT_MAX;
5673 
5674   for (auto& pair : R.MaxLocalUsers) {
5675     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5676     LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5677                       << " registers of "
5678                       << TTI.getRegisterClassName(pair.first) << " register class\n");
5679     if (VF.isScalar()) {
5680       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5681         TargetNumRegisters = ForceTargetNumScalarRegs;
5682     } else {
5683       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5684         TargetNumRegisters = ForceTargetNumVectorRegs;
5685     }
5686     unsigned MaxLocalUsers = pair.second;
5687     unsigned LoopInvariantRegs = 0;
5688     if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
5689       LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
5690 
5691     unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
5692     // Don't count the induction variable as interleaved.
5693     if (EnableIndVarRegisterHeur) {
5694       TmpIC =
5695           PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
5696                         std::max(1U, (MaxLocalUsers - 1)));
5697     }
5698 
5699     IC = std::min(IC, TmpIC);
5700   }
5701 
5702   // Clamp the interleave ranges to reasonable counts.
5703   unsigned MaxInterleaveCount =
5704       TTI.getMaxInterleaveFactor(VF.getKnownMinValue());
5705 
5706   // Check if the user has overridden the max.
5707   if (VF.isScalar()) {
5708     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5709       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5710   } else {
5711     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5712       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5713   }
5714 
  // If the trip count is a known or estimated compile-time constant, limit the
  // interleave count to at most the trip count divided by VF, ensuring the
  // result is at least 1.
5718   //
5719   // For scalable vectors we can't know if interleaving is beneficial. It may
  // not be beneficial for small loops if none of the lanes in the second
  // vector iteration is enabled. However, for larger loops, there is likely to
  // be a similar benefit as for fixed-width vectors. For now, we choose to
  // leave the InterleaveCount as if vscale is '1', although if some
  // information about the vector is known (e.g. min vector size), we can make
  // a better decision.
5725   if (BestKnownTC) {
5726     MaxInterleaveCount =
5727         std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
5728     // Make sure MaxInterleaveCount is greater than 0.
5729     MaxInterleaveCount = std::max(1u, MaxInterleaveCount);
5730   }
5731 
5732   assert(MaxInterleaveCount > 0 &&
5733          "Maximum interleave count must be greater than 0");
5734 
5735   // Clamp the calculated IC to be between the 1 and the max interleave count
5736   // that the target and trip count allows.
5737   if (IC > MaxInterleaveCount)
5738     IC = MaxInterleaveCount;
5739   else
5740     // Make sure IC is greater than 0.
5741     IC = std::max(1u, IC);
5742 
5743   assert(IC > 0 && "Interleave count must be greater than 0.");
5744 
5745   // Interleave if we vectorized this loop and there is a reduction that could
5746   // benefit from interleaving.
5747   if (VF.isVector() && HasReductions) {
5748     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5749     return IC;
5750   }
5751 
5752   // For any scalar loop that either requires runtime checks or predication we
5753   // are better off leaving this to the unroller. Note that if we've already
5754   // vectorized the loop we will have done the runtime check and so interleaving
5755   // won't require further checks.
5756   bool ScalarInterleavingRequiresPredication =
5757       (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) {
5758          return Legal->blockNeedsPredication(BB);
5759        }));
5760   bool ScalarInterleavingRequiresRuntimePointerCheck =
5761       (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
5762 
5763   // We want to interleave small loops in order to reduce the loop overhead and
5764   // potentially expose ILP opportunities.
5765   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
5766                     << "LV: IC is " << IC << '\n'
5767                     << "LV: VF is " << VF << '\n');
5768   const bool AggressivelyInterleaveReductions =
5769       TTI.enableAggressiveInterleaving(HasReductions);
5770   if (!ScalarInterleavingRequiresRuntimePointerCheck &&
5771       !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) {
5772     // We assume that the cost overhead is 1 and we use the cost model
5773     // to estimate the cost of the loop and interleave until the cost of the
5774     // loop overhead is about 5% of the cost of the loop.
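    // For example, with SmallLoopCost == 20 and LoopCost == 5, SmallIC is
    // capped at PowerOf2Floor(20 / 5) = 4.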
5775     unsigned SmallIC =
5776         std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
5777 
5778     // Interleave until store/load ports (estimated by max interleave count) are
5779     // saturated.
5780     unsigned NumStores = Legal->getNumStores();
5781     unsigned NumLoads = Legal->getNumLoads();
5782     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5783     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
5784 
5785     // There is little point in interleaving for reductions containing selects
5786     // and compares when VF=1 since it may just create more overhead than it's
5787     // worth for loops with small trip counts. This is because we still have to
5788     // do the final reduction after the loop.
5789     bool HasSelectCmpReductions =
5790         HasReductions &&
5791         any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5792           const RecurrenceDescriptor &RdxDesc = Reduction.second;
5793           return RecurrenceDescriptor::isSelectCmpRecurrenceKind(
5794               RdxDesc.getRecurrenceKind());
5795         });
5796     if (HasSelectCmpReductions) {
5797       LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
5798       return 1;
5799     }
5800 
5801     // If we have a scalar reduction (vector reductions are already dealt with
5802     // by this point), we can increase the critical path length if the loop
5803     // we're interleaving is inside another loop. For tree-wise reductions
5804     // set the limit to 2, and for ordered reductions it's best to disable
5805     // interleaving entirely.
5806     if (HasReductions && TheLoop->getLoopDepth() > 1) {
5807       bool HasOrderedReductions =
5808           any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
5809             const RecurrenceDescriptor &RdxDesc = Reduction.second;
5810             return RdxDesc.isOrdered();
5811           });
5812       if (HasOrderedReductions) {
5813         LLVM_DEBUG(
5814             dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
5815         return 1;
5816       }
5817 
5818       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5819       SmallIC = std::min(SmallIC, F);
5820       StoresIC = std::min(StoresIC, F);
5821       LoadsIC = std::min(LoadsIC, F);
5822     }
5823 
5824     if (EnableLoadStoreRuntimeInterleave &&
5825         std::max(StoresIC, LoadsIC) > SmallIC) {
5826       LLVM_DEBUG(
5827           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5828       return std::max(StoresIC, LoadsIC);
5829     }
5830 
5831     // If there are scalar reductions and TTI has enabled aggressive
5832     // interleaving for reductions, we will interleave to expose ILP.
5833     if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
5834         AggressivelyInterleaveReductions) {
5835       LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5836       // Interleave no less than SmallIC but not as aggressive as the normal IC
5837       // to satisfy the rare situation when resources are too limited.
5838       return std::max(IC / 2, SmallIC);
5839     } else {
5840       LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5841       return SmallIC;
5842     }
5843   }
5844 
5845   // Interleave if this is a large loop (small loops are already dealt with by
5846   // this point) that could benefit from interleaving.
5847   if (AggressivelyInterleaveReductions) {
5848     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5849     return IC;
5850   }
5851 
5852   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5853   return 1;
5854 }
5855 
5856 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5857 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
5858   // This function calculates the register usage by measuring the highest number
5859   // of values that are alive at a single location. Obviously, this is a very
  // rough estimation. We scan the loop in topological order and assign a
  // number to each instruction. We use RPO to ensure that defs are
5862   // met before their users. We assume that each instruction that has in-loop
5863   // users starts an interval. We record every time that an in-loop value is
5864   // used, so we have a list of the first and last occurrences of each
5865   // instruction. Next, we transpose this data structure into a multi map that
5866   // holds the list of intervals that *end* at a specific location. This multi
5867   // map allows us to perform a linear search. We scan the instructions linearly
5868   // and record each time that a new interval starts, by placing it in a set.
5869   // If we find this value in the multi-map then we remove it from the set.
5870   // The max register usage is the maximum size of the set.
5871   // We also search for instructions that are defined outside the loop, but are
5872   // used inside the loop. We need this number separately from the max-interval
5873   // usage number because when we unroll, loop-invariant values do not take
  // more registers.
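  // A small illustration: in the straight-line sequence
  //   %a = ...
  //   %b = ...
  //   %c = add %a, %b   ; last use of both %a and %b
  // the intervals of %a and %b both end at %c, so at most two values are live
  // at once and the estimated usage for their register class is two.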
5875   LoopBlocksDFS DFS(TheLoop);
5876   DFS.perform(LI);
5877 
5878   RegisterUsage RU;
5879 
5880   // Each 'key' in the map opens a new interval. The values
5881   // of the map are the index of the 'last seen' usage of the
5882   // instruction that is the key.
5883   using IntervalMap = DenseMap<Instruction *, unsigned>;
5884 
5885   // Maps instruction to its index.
5886   SmallVector<Instruction *, 64> IdxToInstr;
5887   // Marks the end of each interval.
5888   IntervalMap EndPoint;
5889   // Saves the list of instruction indices that are used in the loop.
5890   SmallPtrSet<Instruction *, 8> Ends;
5891   // Saves the list of values that are used in the loop but are
5892   // defined outside the loop, such as arguments and constants.
5893   SmallPtrSet<Value *, 8> LoopInvariants;
5894 
5895   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5896     for (Instruction &I : BB->instructionsWithoutDebug()) {
5897       IdxToInstr.push_back(&I);
5898 
5899       // Save the end location of each USE.
5900       for (Value *U : I.operands()) {
5901         auto *Instr = dyn_cast<Instruction>(U);
5902 
5903         // Ignore non-instruction values such as arguments, constants, etc.
5904         if (!Instr)
5905           continue;
5906 
5907         // If this instruction is outside the loop then record it and continue.
5908         if (!TheLoop->contains(Instr)) {
5909           LoopInvariants.insert(Instr);
5910           continue;
5911         }
5912 
5913         // Overwrite previous end points.
5914         EndPoint[Instr] = IdxToInstr.size();
5915         Ends.insert(Instr);
5916       }
5917     }
5918   }
5919 
5920   // Saves the list of intervals that end with the index in 'key'.
5921   using InstrList = SmallVector<Instruction *, 2>;
5922   DenseMap<unsigned, InstrList> TransposeEnds;
5923 
5924   // Transpose the EndPoints to a list of values that end at each index.
5925   for (auto &Interval : EndPoint)
5926     TransposeEnds[Interval.second].push_back(Interval.first);
5927 
5928   SmallPtrSet<Instruction *, 8> OpenIntervals;
5929   SmallVector<RegisterUsage, 8> RUs(VFs.size());
5930   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
5931 
5932   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5933 
5934   const auto &TTICapture = TTI;
5935   auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
5936     if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
5937       return 0;
5938     return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
5939   };
5940 
5941   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
5942     Instruction *I = IdxToInstr[i];
5943 
5944     // Remove all of the instructions that end at this location.
5945     InstrList &List = TransposeEnds[i];
5946     for (Instruction *ToRemove : List)
5947       OpenIntervals.erase(ToRemove);
5948 
5949     // Ignore instructions that are never used within the loop.
5950     if (!Ends.count(I))
5951       continue;
5952 
5953     // Skip ignored values.
5954     if (ValuesToIgnore.count(I))
5955       continue;
5956 
5957     // For each VF find the maximum usage of registers.
5958     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
5959       // Count the number of live intervals.
5960       SmallMapVector<unsigned, unsigned, 4> RegUsage;
5961 
      if (VFs[j].isScalar()) {
        for (auto *Inst : OpenIntervals) {
          unsigned ClassID =
              TTI.getRegisterClassForType(false, Inst->getType());
          // Entries absent from a SmallMapVector are value-initialized to
          // zero on first access, so a plain increment covers both first and
          // repeated occurrences of a register class.
          RegUsage[ClassID] += 1;
        }
      } else {
        collectUniformsAndScalars(VFs[j]);
        for (auto *Inst : OpenIntervals) {
          // Skip ignored values for VF > 1.
          if (VecValuesToIgnore.count(Inst))
            continue;
          if (isScalarAfterVectorization(Inst, VFs[j])) {
            unsigned ClassID =
                TTI.getRegisterClassForType(false, Inst->getType());
            RegUsage[ClassID] += 1;
          } else {
            unsigned ClassID =
                TTI.getRegisterClassForType(true, Inst->getType());
            RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
          }
        }
      }
5991 
      for (const auto &Pair : RegUsage) {
        unsigned &Entry = MaxUsages[j][Pair.first];
        Entry = std::max(Entry, Pair.second);
      }
5998     }
5999 
6000     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
6001                       << OpenIntervals.size() << '\n');
6002 
6003     // Add the current instruction to the list of open intervals.
6004     OpenIntervals.insert(I);
6005   }
6006 
6007   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
6008     SmallMapVector<unsigned, unsigned, 4> Invariant;
6009 
6010     for (auto Inst : LoopInvariants) {
6011       unsigned Usage =
6012           VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
6013       unsigned ClassID =
6014           TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType());
      Invariant[ClassID] += Usage;
6019     }
6020 
6021     LLVM_DEBUG({
6022       dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
6023       dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
6024              << " item\n";
6025       for (const auto &pair : MaxUsages[i]) {
6026         dbgs() << "LV(REG): RegisterClass: "
6027                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6028                << " registers\n";
6029       }
6030       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
6031              << " item\n";
6032       for (const auto &pair : Invariant) {
6033         dbgs() << "LV(REG): RegisterClass: "
6034                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
6035                << " registers\n";
6036       }
6037     });
6038 
6039     RU.LoopInvariantRegs = Invariant;
6040     RU.MaxLocalUsers = MaxUsages[i];
6041     RUs[i] = RU;
6042   }
6043 
6044   return RUs;
6045 }
6046 
6047 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
6048                                                            ElementCount VF) {
  // TODO: The cost model for emulated masked load/store is completely
  // broken. This hack guides the cost model to use an artificially
  // high enough value to practically disable vectorization with such
  // operations, except where the previously deployed legality hack allowed
  // using very low cost values. This is to avoid regressions coming simply
  // from moving the "masked load/store" check from legality to the cost model.
  // Masked load/gather emulation was previously never allowed.
  // A limited number of emulated masked stores/scatters was allowed.
6057   assert((isPredicatedInst(I, VF) || Legal->isUniformMemOp(*I)) &&
6058          "Expecting a scalar emulated instruction");
6059   return isa<LoadInst>(I) ||
6060          (isa<StoreInst>(I) &&
6061           NumPredStores > NumberOfStoresToPredicate);
6062 }
6063 
6064 void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
6065   // If we aren't vectorizing the loop, or if we've already collected the
6066   // instructions to scalarize, there's nothing to do. Collection may already
6067   // have occurred if we have a user-selected VF and are now computing the
6068   // expected cost for interleaving.
6069   if (VF.isScalar() || VF.isZero() ||
6070       InstsToScalarize.find(VF) != InstsToScalarize.end())
6071     return;
6072 
  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
6074   // not profitable to scalarize any instructions, the presence of VF in the
6075   // map will indicate that we've analyzed it already.
6076   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
6077 
6078   PredicatedBBsAfterVectorization[VF].clear();
6079 
  // Find all the instructions that are scalar with predication in the loop and
  // determine whether it would be better not to if-convert the blocks they are
  // in. If so, we also record the instructions to scalarize.
6083   for (BasicBlock *BB : TheLoop->blocks()) {
6084     if (!blockNeedsPredicationForAnyReason(BB))
6085       continue;
6086     for (Instruction &I : *BB)
6087       if (isScalarWithPredication(&I, VF)) {
6088         ScalarCostsTy ScalarCosts;
        // Do not apply the discount if the VF is scalable, because that would
        // lead to invalid scalarization costs. Also do not apply the discount
        // logic if the hacked cost is needed for emulated masked memrefs.
6093         if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I, VF) &&
6094             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
6095           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
6096         // Remember that BB will remain after vectorization.
6097         PredicatedBBsAfterVectorization[VF].insert(BB);
6098       }
6099   }
6100 }
6101 
6102 int LoopVectorizationCostModel::computePredInstDiscount(
6103     Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
6104   assert(!isUniformAfterVectorization(PredInst, VF) &&
6105          "Instruction marked uniform-after-vectorization will be predicated");
6106 
6107   // Initialize the discount to zero, meaning that the scalar version and the
6108   // vector version cost the same.
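  // For example, if the vector form of an instruction in the chain would cost
  // 10 and its scalarized, probability-scaled form would cost 6, the discount
  // grows by 4; a non-negative final discount tells the caller that
  // scalarizing the chain is at least as cheap as vectorizing it.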
6109   InstructionCost Discount = 0;
6110 
6111   // Holds instructions to analyze. The instructions we visit are mapped in
6112   // ScalarCosts. Those instructions are the ones that would be scalarized if
6113   // we find that the scalar version costs less.
6114   SmallVector<Instruction *, 8> Worklist;
6115 
6116   // Returns true if the given instruction can be scalarized.
6117   auto canBeScalarized = [&](Instruction *I) -> bool {
6118     // We only attempt to scalarize instructions forming a single-use chain
6119     // from the original predicated block that would otherwise be vectorized.
6120     // Although not strictly necessary, we give up on instructions we know will
6121     // already be scalar to avoid traversing chains that are unlikely to be
6122     // beneficial.
6123     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
6124         isScalarAfterVectorization(I, VF))
6125       return false;
6126 
6127     // If the instruction is scalar with predication, it will be analyzed
6128     // separately. We ignore it within the context of PredInst.
6129     if (isScalarWithPredication(I, VF))
6130       return false;
6131 
6132     // If any of the instruction's operands are uniform after vectorization,
6133     // the instruction cannot be scalarized. This prevents, for example, a
6134     // masked load from being scalarized.
6135     //
6136     // We assume we will only emit a value for lane zero of an instruction
6137     // marked uniform after vectorization, rather than VF identical values.
6138     // Thus, if we scalarize an instruction that uses a uniform, we would
6139     // create uses of values corresponding to the lanes we aren't emitting code
6140     // for. This behavior can be changed by allowing getScalarValue to clone
6141     // the lane zero values for uniforms rather than asserting.
6142     for (Use &U : I->operands())
6143       if (auto *J = dyn_cast<Instruction>(U.get()))
6144         if (isUniformAfterVectorization(J, VF))
6145           return false;
6146 
6147     // Otherwise, we can scalarize the instruction.
6148     return true;
6149   };
6150 
6151   // Compute the expected cost discount from scalarizing the entire expression
6152   // feeding the predicated instruction. We currently only consider expressions
6153   // that are single-use instruction chains.
6154   Worklist.push_back(PredInst);
6155   while (!Worklist.empty()) {
6156     Instruction *I = Worklist.pop_back_val();
6157 
6158     // If we've already analyzed the instruction, there's nothing to do.
6159     if (ScalarCosts.find(I) != ScalarCosts.end())
6160       continue;
6161 
6162     // Compute the cost of the vector instruction. Note that this cost already
6163     // includes the scalarization overhead of the predicated instruction.
6164     InstructionCost VectorCost = getInstructionCost(I, VF).first;
6165 
6166     // Compute the cost of the scalarized instruction. This cost is the cost of
6167     // the instruction as if it wasn't if-converted and instead remained in the
6168     // predicated block. We will scale this cost by block probability after
6169     // computing the scalarization overhead.
6170     InstructionCost ScalarCost =
6171         VF.getFixedValue() *
6172         getInstructionCost(I, ElementCount::getFixed(1)).first;
6173 
6174     // Compute the scalarization overhead of needed insertelement instructions
6175     // and phi nodes.
6176     if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
6177       ScalarCost += TTI.getScalarizationOverhead(
6178           cast<VectorType>(ToVectorTy(I->getType(), VF)),
6179           APInt::getAllOnes(VF.getFixedValue()), true, false);
6180       ScalarCost +=
6181           VF.getFixedValue() *
6182           TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput);
6183     }
6184 
6185     // Compute the scalarization overhead of needed extractelement
6186     // instructions. For each of the instruction's operands, if the operand can
6187     // be scalarized, add it to the worklist; otherwise, account for the
6188     // overhead.
6189     for (Use &U : I->operands())
6190       if (auto *J = dyn_cast<Instruction>(U.get())) {
6191         assert(VectorType::isValidElementType(J->getType()) &&
6192                "Instruction has non-scalar type");
6193         if (canBeScalarized(J))
6194           Worklist.push_back(J);
6195         else if (needsExtract(J, VF)) {
6196           ScalarCost += TTI.getScalarizationOverhead(
6197               cast<VectorType>(ToVectorTy(J->getType(), VF)),
6198               APInt::getAllOnes(VF.getFixedValue()), false, true);
6199         }
6200       }
6201 
6202     // Scale the total scalar cost by block probability.
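    // For example, assuming getReciprocalPredBlockProb() returns 2 (i.e. the
    // predicated block is expected to execute on roughly half of the
    // iterations), a raw scalar cost of 8 is accounted as 4 here.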
6203     ScalarCost /= getReciprocalPredBlockProb();
6204 
6205     // Compute the discount. A non-negative discount means the vector version
6206     // of the instruction costs more, and scalarizing would be beneficial.
6207     Discount += VectorCost - ScalarCost;
6208     ScalarCosts[I] = ScalarCost;
6209   }
6210 
6211   return *Discount.getValue();
6212 }
6213 
6214 LoopVectorizationCostModel::VectorizationCostTy
6215 LoopVectorizationCostModel::expectedCost(
6216     ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) {
6217   VectorizationCostTy Cost;
6218 
6219   // For each block.
6220   for (BasicBlock *BB : TheLoop->blocks()) {
6221     VectorizationCostTy BlockCost;
6222 
6223     // For each instruction in the old loop.
6224     for (Instruction &I : BB->instructionsWithoutDebug()) {
6225       // Skip ignored values.
6226       if (ValuesToIgnore.count(&I) ||
6227           (VF.isVector() && VecValuesToIgnore.count(&I)))
6228         continue;
6229 
6230       VectorizationCostTy C = getInstructionCost(&I, VF);
6231 
6232       // Check if we should override the cost.
6233       if (C.first.isValid() &&
6234           ForceTargetInstructionCost.getNumOccurrences() > 0)
6235         C.first = InstructionCost(ForceTargetInstructionCost);
6236 
6237       // Keep a list of instructions with invalid costs.
6238       if (Invalid && !C.first.isValid())
6239         Invalid->emplace_back(&I, VF);
6240 
6241       BlockCost.first += C.first;
6242       BlockCost.second |= C.second;
6243       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
6244                         << " for VF " << VF << " For instruction: " << I
6245                         << '\n');
6246     }
6247 
6248     // If we are vectorizing a predicated block, it will have been
6249     // if-converted. This means that the block's instructions (aside from
6250     // stores and instructions that may divide by zero) will now be
6251     // unconditionally executed. For the scalar case, we may not always execute
6252     // the predicated block, if it is an if-else block. Thus, scale the block's
6253     // cost by the probability of executing it. blockNeedsPredication from
    // Legal is used so as not to include all blocks in tail-folded loops.
6255     if (VF.isScalar() && Legal->blockNeedsPredication(BB))
6256       BlockCost.first /= getReciprocalPredBlockProb();
6257 
6258     Cost.first += BlockCost.first;
6259     Cost.second |= BlockCost.second;
6260   }
6261 
6262   return Cost;
6263 }
6264 
/// Gets the address access SCEV after verifying that the access pattern
/// is loop invariant except for the induction variable dependence.
6267 ///
6268 /// This SCEV can be sent to the Target in order to estimate the address
6269 /// calculation cost.
6270 static const SCEV *getAddressAccessSCEV(
6271               Value *Ptr,
6272               LoopVectorizationLegality *Legal,
6273               PredicatedScalarEvolution &PSE,
6274               const Loop *TheLoop) {
6275 
6276   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
6277   if (!Gep)
6278     return nullptr;
6279 
6280   // We are looking for a gep with all loop invariant indices except for one
6281   // which should be an induction variable.
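  // For example, a GEP computing &A[i] with a loop-invariant base A and
  // induction variable i qualifies, whereas a GEP whose index is itself a
  // loop-varying load does not.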
6282   auto SE = PSE.getSE();
6283   unsigned NumOperands = Gep->getNumOperands();
6284   for (unsigned i = 1; i < NumOperands; ++i) {
6285     Value *Opd = Gep->getOperand(i);
6286     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
6287         !Legal->isInductionVariable(Opd))
6288       return nullptr;
6289   }
6290 
  // Now we know we have a GEP of the form ptr, %inv, %ind, %inv.
  // Return the pointer SCEV.
6292   return PSE.getSCEV(Ptr);
6293 }
6294 
6295 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
6296   return Legal->hasStride(I->getOperand(0)) ||
6297          Legal->hasStride(I->getOperand(1));
6298 }
6299 
6300 InstructionCost
6301 LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
6302                                                         ElementCount VF) {
6303   assert(VF.isVector() &&
6304          "Scalarization cost of instruction implies vectorization.");
6305   if (VF.isScalable())
6306     return InstructionCost::getInvalid();
6307 
6308   Type *ValTy = getLoadStoreType(I);
6309   auto SE = PSE.getSE();
6310 
6311   unsigned AS = getLoadStoreAddressSpace(I);
6312   Value *Ptr = getLoadStorePointerOperand(I);
6313   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
6314   // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
6315   //       that it is being called from this specific place.
6316 
  // Figure out whether the access is strided and get the stride value
  // if it's known at compile time.
6319   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
6320 
6321   // Get the cost of the scalar memory instruction and address computation.
6322   InstructionCost Cost =
6323       VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
6324 
6325   // Don't pass *I here, since it is scalar but will actually be part of a
6326   // vectorized loop where the user of it is a vectorized instruction.
6327   const Align Alignment = getLoadStoreAlignment(I);
6328   Cost += VF.getKnownMinValue() *
6329           TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
6330                               AS, TTI::TCK_RecipThroughput);
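  // So far the estimate is roughly VF * (address computation + scalar memory
  // op); e.g. with VF = 4 and unit TTI costs this would be 4 * (1 + 1). The
  // insert/extract overhead and, for predicated accesses, the branch costs
  // are added below.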
6331 
6332   // Get the overhead of the extractelement and insertelement instructions
6333   // we might create due to scalarization.
6334   Cost += getScalarizationOverhead(I, VF);
6335 
6336   // If we have a predicated load/store, it will need extra i1 extracts and
6337   // conditional branches, but may not be executed for each vector lane. Scale
6338   // the cost by the probability of executing the predicated block.
6339   if (isPredicatedInst(I, VF)) {
6340     Cost /= getReciprocalPredBlockProb();
6341 
6342     // Add the cost of an i1 extract and a branch
6343     auto *Vec_i1Ty =
6344         VectorType::get(IntegerType::getInt1Ty(ValTy->getContext()), VF);
6345     Cost += TTI.getScalarizationOverhead(
6346         Vec_i1Ty, APInt::getAllOnes(VF.getKnownMinValue()),
6347         /*Insert=*/false, /*Extract=*/true);
6348     Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput);
6349 
6350     if (useEmulatedMaskMemRefHack(I, VF))
6351       // Artificially setting to a high enough value to practically disable
6352       // vectorization with such operations.
6353       Cost = 3000000;
6354   }
6355 
6356   return Cost;
6357 }
6358 
6359 InstructionCost
6360 LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
6361                                                     ElementCount VF) {
6362   Type *ValTy = getLoadStoreType(I);
6363   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6364   Value *Ptr = getLoadStorePointerOperand(I);
6365   unsigned AS = getLoadStoreAddressSpace(I);
6366   int ConsecutiveStride = Legal->isConsecutivePtr(ValTy, Ptr);
6367   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6368 
6369   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6370          "Stride should be 1 or -1 for consecutive memory access");
6371   const Align Alignment = getLoadStoreAlignment(I);
6372   InstructionCost Cost = 0;
6373   if (Legal->isMaskRequired(I))
6374     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6375                                       CostKind);
6376   else
6377     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
6378                                 CostKind, I);
6379 
6380   bool Reverse = ConsecutiveStride < 0;
6381   if (Reverse)
6382     Cost +=
6383         TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
6384   return Cost;
6385 }
6386 
6387 InstructionCost
6388 LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
6389                                                 ElementCount VF) {
6390   assert(Legal->isUniformMemOp(*I));
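  // A uniform memory op is modelled below as a single scalar access per
  // iteration: a load becomes a scalar load plus a broadcast of the loaded
  // value, and a store becomes a scalar store plus, when the stored value is
  // not loop invariant, an extract of the last vector lane.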
6391 
6392   Type *ValTy = getLoadStoreType(I);
6393   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6394   const Align Alignment = getLoadStoreAlignment(I);
6395   unsigned AS = getLoadStoreAddressSpace(I);
6396   enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6397   if (isa<LoadInst>(I)) {
6398     return TTI.getAddressComputationCost(ValTy) +
6399            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
6400                                CostKind) +
6401            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
6402   }
6403   StoreInst *SI = cast<StoreInst>(I);
6404 
6405   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
6406   return TTI.getAddressComputationCost(ValTy) +
6407          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
6408                              CostKind) +
6409          (isLoopInvariantStoreValue
6410               ? 0
6411               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
6412                                        VF.getKnownMinValue() - 1));
6413 }
6414 
6415 InstructionCost
6416 LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
6417                                                  ElementCount VF) {
6418   Type *ValTy = getLoadStoreType(I);
6419   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6420   const Align Alignment = getLoadStoreAlignment(I);
6421   const Value *Ptr = getLoadStorePointerOperand(I);
6422 
6423   return TTI.getAddressComputationCost(VectorTy) +
6424          TTI.getGatherScatterOpCost(
6425              I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
6426              TargetTransformInfo::TCK_RecipThroughput, I);
6427 }
6428 
6429 InstructionCost
6430 LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
6431                                                    ElementCount VF) {
6432   // TODO: Once we have support for interleaving with scalable vectors
6433   // we can calculate the cost properly here.
6434   if (VF.isScalable())
6435     return InstructionCost::getInvalid();
6436 
6437   Type *ValTy = getLoadStoreType(I);
6438   auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
6439   unsigned AS = getLoadStoreAddressSpace(I);
6440 
6441   auto Group = getInterleavedAccessGroup(I);
6442   assert(Group && "Fail to get an interleaved access group.");
6443 
6444   unsigned InterleaveFactor = Group->getFactor();
6445   auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
6446 
6447   // Holds the indices of existing members in the interleaved group.
6448   SmallVector<unsigned, 4> Indices;
6449   for (unsigned IF = 0; IF < InterleaveFactor; IF++)
6450     if (Group->getMember(IF))
6451       Indices.push_back(IF);
6452 
6453   // Calculate the cost of the whole interleaved group.
6454   bool UseMaskForGaps =
6455       (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
6456       (isa<StoreInst>(I) && (Group->getNumMembers() < Group->getFactor()));
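  // For example, with VF = 4 and an interleave factor of 2 where only member 0
  // exists, WideVecTy has 8 elements, Indices is {0}, and a store with such a
  // gap sets UseMaskForGaps.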
6457   InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
6458       I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
6459       AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);
6460 
6461   if (Group->isReverse()) {
6462     // TODO: Add support for reversed masked interleaved access.
6463     assert(!Legal->isMaskRequired(I) &&
6464            "Reverse masked interleaved access not supported.");
6465     Cost +=
6466         Group->getNumMembers() *
6467         TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, None, 0);
6468   }
6469   return Cost;
6470 }
6471 
6472 Optional<InstructionCost> LoopVectorizationCostModel::getReductionPatternCost(
6473     Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) {
6474   using namespace llvm::PatternMatch;
  // Early exit if there are no in-loop reductions.
6476   if (InLoopReductionChains.empty() || VF.isScalar() || !isa<VectorType>(Ty))
6477     return None;
6478   auto *VectorTy = cast<VectorType>(Ty);
6479 
  // We are looking for one of the following patterns, and for its minimal
  // acceptable cost:
6481   //  reduce(mul(ext(A), ext(B))) or
6482   //  reduce(mul(A, B)) or
6483   //  reduce(ext(A)) or
6484   //  reduce(A).
6485   // The basic idea is that we walk down the tree to do that, finding the root
6486   // reduction instruction in InLoopReductionImmediateChains. From there we find
6487   // the pattern of mul/ext and test the cost of the entire pattern vs the cost
  // of the components. If the reduction cost is lower, we return it for the
  // reduction instruction and 0 for the other instructions in the pattern. If
  // it is not, we return an invalid cost specifying that the original cost
  // method should be used.
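  // For example, a chain such as
  //   %a.ext = sext i8 %a to i32
  //   %b.ext = sext i8 %b to i32
  //   %mul = mul i32 %a.ext, %b.ext
  //   %sum = add i32 %phi, %mul
  // (with %sum being the in-loop add reduction) is matched as
  // reduce(mul(ext(A), ext(B))).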
6492   Instruction *RetI = I;
6493   if (match(RetI, m_ZExtOrSExt(m_Value()))) {
6494     if (!RetI->hasOneUser())
6495       return None;
6496     RetI = RetI->user_back();
6497   }
6498   if (match(RetI, m_Mul(m_Value(), m_Value())) &&
6499       RetI->user_back()->getOpcode() == Instruction::Add) {
6500     if (!RetI->hasOneUser())
6501       return None;
6502     RetI = RetI->user_back();
6503   }
6504 
6505   // Test if the found instruction is a reduction, and if not return an invalid
6506   // cost specifying the parent to use the original cost modelling.
6507   if (!InLoopReductionImmediateChains.count(RetI))
6508     return None;
6509 
6510   // Find the reduction this chain is a part of and calculate the basic cost of
6511   // the reduction on its own.
6512   Instruction *LastChain = InLoopReductionImmediateChains[RetI];
6513   Instruction *ReductionPhi = LastChain;
6514   while (!isa<PHINode>(ReductionPhi))
6515     ReductionPhi = InLoopReductionImmediateChains[ReductionPhi];
6516 
6517   const RecurrenceDescriptor &RdxDesc =
6518       Legal->getReductionVars().find(cast<PHINode>(ReductionPhi))->second;
6519 
6520   InstructionCost BaseCost = TTI.getArithmeticReductionCost(
6521       RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);
6522 
6523   // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
6524   // normal fmul instruction to the cost of the fadd reduction.
6525   if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd)
6526     BaseCost +=
6527         TTI.getArithmeticInstrCost(Instruction::FMul, VectorTy, CostKind);
6528 
6529   // If we're using ordered reductions then we can just return the base cost
6530   // here, since getArithmeticReductionCost calculates the full ordered
6531   // reduction cost when FP reassociation is not allowed.
6532   if (useOrderedReductions(RdxDesc))
6533     return BaseCost;
6534 
6535   // Get the operand that was not the reduction chain and match it to one of the
6536   // patterns, returning the better cost if it is found.
6537   Instruction *RedOp = RetI->getOperand(1) == LastChain
6538                            ? dyn_cast<Instruction>(RetI->getOperand(0))
6539                            : dyn_cast<Instruction>(RetI->getOperand(1));
6540 
6541   VectorTy = VectorType::get(I->getOperand(0)->getType(), VectorTy);
6542 
6543   Instruction *Op0, *Op1;
6544   if (RedOp &&
6545       match(RedOp,
6546             m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
6547       match(Op0, m_ZExtOrSExt(m_Value())) &&
6548       Op0->getOpcode() == Op1->getOpcode() &&
6549       Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
6550       !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1) &&
6551       (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
6552 
    // Matched reduce(ext(mul(ext(A), ext(B))))
6554     // Note that the extend opcodes need to all match, or if A==B they will have
6555     // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
6556     // which is equally fine.
6557     bool IsUnsigned = isa<ZExtInst>(Op0);
6558     auto *ExtType = VectorType::get(Op0->getOperand(0)->getType(), VectorTy);
6559     auto *MulType = VectorType::get(Op0->getType(), VectorTy);
6560 
6561     InstructionCost ExtCost =
6562         TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
6563                              TTI::CastContextHint::None, CostKind, Op0);
6564     InstructionCost MulCost =
6565         TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
6566     InstructionCost Ext2Cost =
6567         TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, MulType,
6568                              TTI::CastContextHint::None, CostKind, RedOp);
6569 
6570     InstructionCost RedCost = TTI.getExtendedAddReductionCost(
6571         /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
6572         CostKind);
6573 
6574     if (RedCost.isValid() &&
6575         RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
6576       return I == RetI ? RedCost : 0;
6577   } else if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value())) &&
6578              !TheLoop->isLoopInvariant(RedOp)) {
6579     // Matched reduce(ext(A))
6580     bool IsUnsigned = isa<ZExtInst>(RedOp);
6581     auto *ExtType = VectorType::get(RedOp->getOperand(0)->getType(), VectorTy);
6582     InstructionCost RedCost = TTI.getExtendedAddReductionCost(
6583         /*IsMLA=*/false, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
6584         CostKind);
6585 
6586     InstructionCost ExtCost =
6587         TTI.getCastInstrCost(RedOp->getOpcode(), VectorTy, ExtType,
6588                              TTI::CastContextHint::None, CostKind, RedOp);
6589     if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
6590       return I == RetI ? RedCost : 0;
6591   } else if (RedOp &&
6592              match(RedOp, m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) {
6593     if (match(Op0, m_ZExtOrSExt(m_Value())) &&
6594         Op0->getOpcode() == Op1->getOpcode() &&
6595         !TheLoop->isLoopInvariant(Op0) && !TheLoop->isLoopInvariant(Op1)) {
6596       bool IsUnsigned = isa<ZExtInst>(Op0);
6597       Type *Op0Ty = Op0->getOperand(0)->getType();
6598       Type *Op1Ty = Op1->getOperand(0)->getType();
6599       Type *LargestOpTy =
6600           Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
6601                                                                     : Op0Ty;
6602       auto *ExtType = VectorType::get(LargestOpTy, VectorTy);
6603 
      // Matched reduce(mul(ext(A), ext(B))), where the two exts may be of
      // different sizes. We take the largest type as the ext to reduce, and add
      // the remaining cost as, for example, reduce(mul(ext(ext(A)), ext(B))).
6607       InstructionCost ExtCost0 = TTI.getCastInstrCost(
6608           Op0->getOpcode(), VectorTy, VectorType::get(Op0Ty, VectorTy),
6609           TTI::CastContextHint::None, CostKind, Op0);
6610       InstructionCost ExtCost1 = TTI.getCastInstrCost(
6611           Op1->getOpcode(), VectorTy, VectorType::get(Op1Ty, VectorTy),
6612           TTI::CastContextHint::None, CostKind, Op1);
6613       InstructionCost MulCost =
6614           TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6615 
6616       InstructionCost RedCost = TTI.getExtendedAddReductionCost(
6617           /*IsMLA=*/true, IsUnsigned, RdxDesc.getRecurrenceType(), ExtType,
6618           CostKind);
6619       InstructionCost ExtraExtCost = 0;
6620       if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
6621         Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
6622         ExtraExtCost = TTI.getCastInstrCost(
6623             ExtraExtOp->getOpcode(), ExtType,
6624             VectorType::get(ExtraExtOp->getOperand(0)->getType(), VectorTy),
6625             TTI::CastContextHint::None, CostKind, ExtraExtOp);
6626       }
6627 
6628       if (RedCost.isValid() &&
6629           (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
6630         return I == RetI ? RedCost : 0;
6631     } else if (!match(I, m_ZExtOrSExt(m_Value()))) {
6632       // Matched reduce(mul())
6633       InstructionCost MulCost =
6634           TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
6635 
6636       InstructionCost RedCost = TTI.getExtendedAddReductionCost(
6637           /*IsMLA=*/true, true, RdxDesc.getRecurrenceType(), VectorTy,
6638           CostKind);
6639 
6640       if (RedCost.isValid() && RedCost < MulCost + BaseCost)
6641         return I == RetI ? RedCost : 0;
6642     }
6643   }
6644 
6645   return I == RetI ? Optional<InstructionCost>(BaseCost) : None;
6646 }
6647 
6648 InstructionCost
6649 LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
6650                                                      ElementCount VF) {
6651   // Calculate scalar cost only. Vectorization cost should be ready at this
6652   // moment.
6653   if (VF.isScalar()) {
6654     Type *ValTy = getLoadStoreType(I);
6655     const Align Alignment = getLoadStoreAlignment(I);
6656     unsigned AS = getLoadStoreAddressSpace(I);
6657 
6658     return TTI.getAddressComputationCost(ValTy) +
6659            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
6660                                TTI::TCK_RecipThroughput, I);
6661   }
6662   return getWideningCost(I, VF);
6663 }
6664 
6665 LoopVectorizationCostModel::VectorizationCostTy
6666 LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6667                                                ElementCount VF) {
6668   // If we know that this instruction will remain uniform, check the cost of
6669   // the scalar version.
6670   if (isUniformAfterVectorization(I, VF))
6671     VF = ElementCount::getFixed(1);
6672 
6673   if (VF.isVector() && isProfitableToScalarize(I, VF))
6674     return VectorizationCostTy(InstsToScalarize[VF][I], false);
6675 
6676   // Forced scalars do not have any scalarization overhead.
6677   auto ForcedScalar = ForcedScalars.find(VF);
6678   if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
6679     auto InstSet = ForcedScalar->second;
6680     if (InstSet.count(I))
6681       return VectorizationCostTy(
6682           (getInstructionCost(I, ElementCount::getFixed(1)).first *
6683            VF.getKnownMinValue()),
6684           false);
6685   }
6686 
6687   Type *VectorTy;
6688   InstructionCost C = getInstructionCost(I, VF, VectorTy);
6689 
6690   bool TypeNotScalarized = false;
6691   if (VF.isVector() && VectorTy->isVectorTy()) {
6692     if (unsigned NumParts = TTI.getNumberOfParts(VectorTy)) {
6693       if (VF.isScalable())
6694         // <vscale x 1 x iN> is assumed to be profitable over iN because
6695         // scalable registers are a distinct register class from scalar ones.
6696         // If we ever find a target which wants to lower scalable vectors
6697         // back to scalars, we'll need to update this code to explicitly
6698         // ask TTI about the register class uses for each part.
6699         TypeNotScalarized = NumParts <= VF.getKnownMinValue();
6700       else
6701         TypeNotScalarized = NumParts < VF.getKnownMinValue();
6702     } else
6703       C = InstructionCost::getInvalid();
6704   }
6705   return VectorizationCostTy(C, TypeNotScalarized);
6706 }
6707 
6708 InstructionCost
6709 LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
6710                                                      ElementCount VF) const {
6711 
6712   // There is no mechanism yet to create a scalable scalarization loop,
6713   // so this is currently Invalid.
6714   if (VF.isScalable())
6715     return InstructionCost::getInvalid();
6716 
6717   if (VF.isScalar())
6718     return 0;
6719 
6720   InstructionCost Cost = 0;
6721   Type *RetTy = ToVectorTy(I->getType(), VF);
6722   if (!RetTy->isVoidTy() &&
6723       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
6724     Cost += TTI.getScalarizationOverhead(
6725         cast<VectorType>(RetTy), APInt::getAllOnes(VF.getKnownMinValue()), true,
6726         false);
6727 
6728   // Some targets keep addresses scalar.
6729   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
6730     return Cost;
6731 
6732   // Some targets support efficient element stores.
6733   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
6734     return Cost;
6735 
6736   // Collect operands to consider.
6737   CallInst *CI = dyn_cast<CallInst>(I);
6738   Instruction::op_range Ops = CI ? CI->args() : I->operands();
6739 
6740   // Skip operands that do not require extraction/scalarization and do not incur
6741   // any overhead.
6742   SmallVector<Type *> Tys;
6743   for (auto *V : filterExtractingOperands(Ops, VF))
6744     Tys.push_back(MaybeVectorizeType(V->getType(), VF));
6745   return Cost + TTI.getOperandsScalarizationOverhead(
6746                     filterExtractingOperands(Ops, VF), Tys);
6747 }
6748 
6749 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
6750   if (VF.isScalar())
6751     return;
6752   NumPredStores = 0;
6753   for (BasicBlock *BB : TheLoop->blocks()) {
6754     // For each instruction in the old loop.
6755     for (Instruction &I : *BB) {
      Value *Ptr = getLoadStorePointerOperand(&I);
6757       if (!Ptr)
6758         continue;
6759 
6760       // TODO: We should generate better code and update the cost model for
6761       // predicated uniform stores. Today they are treated as any other
6762       // predicated store (see added test cases in
6763       // invariant-store-vectorization.ll).
6764       if (isa<StoreInst>(&I) && isScalarWithPredication(&I, VF))
6765         NumPredStores++;
6766 
6767       if (Legal->isUniformMemOp(I)) {
        // The lowering story for uniform memory ops is currently a bit
        // complicated. Scalarization works for everything which isn't a store
        // with a scalable VF. Fixed-length VFs just scalarize and then DCE
        // later; scalarization knows how to handle uniform-per-part values
        // (i.e. the first lane in each unrolled VF) and can thus handle
        // scalable loads too. For scalable stores, we use a scatter if legal.
        // If not, we have no way to lower (currently) and thus have to abort
        // vectorization.
6775         if (isa<StoreInst>(&I) && VF.isScalable()) {
6776           if (isLegalGatherOrScatter(&I, VF))
6777             setWideningDecision(&I, VF, CM_GatherScatter,
6778                                 getGatherScatterCost(&I, VF));
6779           else
6780             // Error case, abort vectorization
6781             setWideningDecision(&I, VF, CM_Scalarize,
6782                                 InstructionCost::getInvalid());
6783           continue;
6784         }
6785         // Load: Scalar load + broadcast
6786         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
6787         // TODO: Avoid replicating loads and stores instead of relying on
6788         // instcombine to remove them.
6789         setWideningDecision(&I, VF, CM_Scalarize,
6790                             getUniformMemOpCost(&I, VF));
6791         continue;
6792       }
6793 
6794       // We assume that widening is the best solution when possible.
6795       if (memoryInstructionCanBeWidened(&I, VF)) {
6796         InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
6797         int ConsecutiveStride = Legal->isConsecutivePtr(
6798             getLoadStoreType(&I), getLoadStorePointerOperand(&I));
6799         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6800                "Expected consecutive stride.");
6801         InstWidening Decision =
6802             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6803         setWideningDecision(&I, VF, Decision, Cost);
6804         continue;
6805       }
6806 
6807       // Choose between Interleaving, Gather/Scatter or Scalarization.
6808       InstructionCost InterleaveCost = InstructionCost::getInvalid();
6809       unsigned NumAccesses = 1;
6810       if (isAccessInterleaved(&I)) {
6811         auto Group = getInterleavedAccessGroup(&I);
6812         assert(Group && "Fail to get an interleaved access group.");
6813 
6814         // Make one decision for the whole group.
6815         if (getWideningDecision(&I, VF) != CM_Unknown)
6816           continue;
6817 
6818         NumAccesses = Group->getNumMembers();
6819         if (interleavedAccessCanBeWidened(&I, VF))
6820           InterleaveCost = getInterleaveGroupCost(&I, VF);
6821       }
6822 
6823       InstructionCost GatherScatterCost =
6824           isLegalGatherOrScatter(&I, VF)
6825               ? getGatherScatterCost(&I, VF) * NumAccesses
6826               : InstructionCost::getInvalid();
6827 
6828       InstructionCost ScalarizationCost =
6829           getMemInstScalarizationCost(&I, VF) * NumAccesses;
6830 
6831       // Choose better solution for the current VF,
6832       // write down this decision and use it during vectorization.
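      // For example, with InterleaveCost = 8, GatherScatterCost = 12 and
      // ScalarizationCost = 20 the accesses are interleaved; if interleaving
      // was not possible (its cost stayed invalid), the choice falls to the
      // cheaper of gather/scatter and scalarization.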
6833       InstructionCost Cost;
6834       InstWidening Decision;
6835       if (InterleaveCost <= GatherScatterCost &&
6836           InterleaveCost < ScalarizationCost) {
6837         Decision = CM_Interleave;
6838         Cost = InterleaveCost;
6839       } else if (GatherScatterCost < ScalarizationCost) {
6840         Decision = CM_GatherScatter;
6841         Cost = GatherScatterCost;
6842       } else {
6843         Decision = CM_Scalarize;
6844         Cost = ScalarizationCost;
6845       }
      // If the instruction belongs to an interleave group, the whole group
      // receives the same decision. The whole group receives the cost, but
      // the cost will actually be assigned to one instruction.
6849       if (auto Group = getInterleavedAccessGroup(&I))
6850         setWideningDecision(Group, VF, Decision, Cost);
6851       else
6852         setWideningDecision(&I, VF, Decision, Cost);
6853     }
6854   }
6855 
6856   // Make sure that any load of address and any other address computation
6857   // remains scalar unless there is gather/scatter support. This avoids
6858   // inevitable extracts into address registers, and also has the benefit of
6859   // activating LSR more, since that pass can't optimize vectorized
6860   // addresses.
6861   if (TTI.prefersVectorizedAddressing())
6862     return;
6863 
6864   // Start with all scalar pointer uses.
6865   SmallPtrSet<Instruction *, 8> AddrDefs;
6866   for (BasicBlock *BB : TheLoop->blocks())
6867     for (Instruction &I : *BB) {
6868       Instruction *PtrDef =
6869         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6870       if (PtrDef && TheLoop->contains(PtrDef) &&
6871           getWideningDecision(&I, VF) != CM_GatherScatter)
6872         AddrDefs.insert(PtrDef);
6873     }
6874 
6875   // Add all instructions used to generate the addresses.
6876   SmallVector<Instruction *, 4> Worklist;
6877   append_range(Worklist, AddrDefs);
6878   while (!Worklist.empty()) {
6879     Instruction *I = Worklist.pop_back_val();
6880     for (auto &Op : I->operands())
6881       if (auto *InstOp = dyn_cast<Instruction>(Op))
6882         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6883             AddrDefs.insert(InstOp).second)
6884           Worklist.push_back(InstOp);
6885   }
6886 
6887   for (auto *I : AddrDefs) {
6888     if (isa<LoadInst>(I)) {
      // Setting the desired widening decision should ideally be handled by
      // cost functions, but since this involves the task of finding out
      // whether the loaded register is involved in an address computation,
      // it is instead changed here when we know this is the case.
6893       InstWidening Decision = getWideningDecision(I, VF);
6894       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6895         // Scalarize a widened load of address.
6896         setWideningDecision(
6897             I, VF, CM_Scalarize,
6898             (VF.getKnownMinValue() *
6899              getMemoryInstructionCost(I, ElementCount::getFixed(1))));
6900       else if (auto Group = getInterleavedAccessGroup(I)) {
6901         // Scalarize an interleave group of address loads.
6902         for (unsigned I = 0; I < Group->getFactor(); ++I) {
6903           if (Instruction *Member = Group->getMember(I))
6904             setWideningDecision(
6905                 Member, VF, CM_Scalarize,
6906                 (VF.getKnownMinValue() *
6907                  getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
6908         }
6909       }
6910     } else
6911       // Make sure I gets scalarized and a cost estimate without
6912       // scalarization overhead.
6913       ForcedScalars[VF].insert(I);
6914   }
6915 }
6916 
6917 InstructionCost
6918 LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
6919                                                Type *&VectorTy) {
6920   Type *RetTy = I->getType();
6921   if (canTruncateToMinimalBitwidth(I, VF))
6922     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6923   auto SE = PSE.getSE();
6924   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
6925 
6926   auto hasSingleCopyAfterVectorization = [this](Instruction *I,
6927                                                 ElementCount VF) -> bool {
6928     if (VF.isScalar())
6929       return true;
6930 
6931     auto Scalarized = InstsToScalarize.find(VF);
6932     assert(Scalarized != InstsToScalarize.end() &&
6933            "VF not yet analyzed for scalarization profitability");
6934     return !Scalarized->second.count(I) &&
6935            llvm::all_of(I->users(), [&](User *U) {
6936              auto *UI = cast<Instruction>(U);
6937              return !Scalarized->second.count(UI);
6938            });
6939   };
6940   (void) hasSingleCopyAfterVectorization;
6941 
6942   if (isScalarAfterVectorization(I, VF)) {
6943     // With the exception of GEPs and PHIs, after scalarization there should
6944     // only be one copy of the instruction generated in the loop. This is
6945     // because the VF is either 1, or any instructions that need scalarizing
    // have already been dealt with by the time we get here. As a result,
    // we don't have to multiply the instruction cost by VF.
6948     assert(I->getOpcode() == Instruction::GetElementPtr ||
6949            I->getOpcode() == Instruction::PHI ||
6950            (I->getOpcode() == Instruction::BitCast &&
6951             I->getType()->isPointerTy()) ||
6952            hasSingleCopyAfterVectorization(I, VF));
6953     VectorTy = RetTy;
6954   } else
6955     VectorTy = ToVectorTy(RetTy, VF);
6956 
6957   // TODO: We need to estimate the cost of intrinsic calls.
6958   switch (I->getOpcode()) {
6959   case Instruction::GetElementPtr:
6960     // We mark this instruction as zero-cost because the cost of GEPs in
6961     // vectorized code depends on whether the corresponding memory instruction
6962     // is scalarized or not. Therefore, we handle GEPs with the memory
6963     // instruction cost.
6964     return 0;
6965   case Instruction::Br: {
    // In cases of scalarized and predicated instructions, there will be VF
    // predicated blocks in the vectorized loop. Each branch around these
    // blocks also requires an extract of its vector compare i1 element.
6969     bool ScalarPredicatedBB = false;
6970     BranchInst *BI = cast<BranchInst>(I);
6971     if (VF.isVector() && BI->isConditional() &&
6972         (PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(0)) ||
6973          PredicatedBBsAfterVectorization[VF].count(BI->getSuccessor(1))))
6974       ScalarPredicatedBB = true;
6975 
6976     if (ScalarPredicatedBB) {
6977       // Not possible to scalarize scalable vector with predicated instructions.
6978       if (VF.isScalable())
6979         return InstructionCost::getInvalid();
6980       // Return cost for branches around scalarized and predicated blocks.
6981       auto *Vec_i1Ty =
6982           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6983       return (
6984           TTI.getScalarizationOverhead(
6985               Vec_i1Ty, APInt::getAllOnes(VF.getFixedValue()), false, true) +
6986           (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF.getFixedValue()));
6987     } else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
6988       // The back-edge branch will remain, as will all scalar branches.
6989       return TTI.getCFInstrCost(Instruction::Br, CostKind);
6990     else
6991       // This branch will be eliminated by if-conversion.
6992       return 0;
6993     // Note: We currently assume zero cost for an unconditional branch inside
6994     // a predicated block since it will become a fall-through, although we
6995     // may decide in the future to call TTI for all branches.
6996   }
6997   case Instruction::PHI: {
6998     auto *Phi = cast<PHINode>(I);
6999 
7000     // First-order recurrences are replaced by vector shuffles inside the loop.
7001     // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
7002     if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi))
7003       return TTI.getShuffleCost(
7004           TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy),
7005           None, VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1));
7006 
7007     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
7008     // converted into select instructions. We require N - 1 selects per phi
7009     // node, where N is the number of incoming values.
7010     if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
7011       return (Phi->getNumIncomingValues() - 1) *
7012              TTI.getCmpSelInstrCost(
7013                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
7014                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
7015                  CmpInst::BAD_ICMP_PREDICATE, CostKind);
7016 
7017     return TTI.getCFInstrCost(Instruction::PHI, CostKind);
7018   }
7019   case Instruction::UDiv:
7020   case Instruction::SDiv:
7021   case Instruction::URem:
7022   case Instruction::SRem:
7023     // If we have a predicated instruction, it may not be executed for each
7024     // vector lane. Get the scalarization cost and scale this amount by the
7025     // probability of executing the predicated block. If the instruction is not
7026     // predicated, we fall through to the next case.
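    // As a rough sketch with VF = 4 and unit TTI costs: 4 PHI copies plus 4
    // scalar divides plus the insert/extract overhead, all divided by the
    // block probability factor from getReciprocalPredBlockProb().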
7027     if (VF.isVector() && isScalarWithPredication(I, VF)) {
7028       InstructionCost Cost = 0;
7029 
7030       // These instructions have a non-void type, so account for the phi nodes
7031       // that we will create. This cost is likely to be zero. The phi node
7032       // cost, if any, should be scaled by the block probability because it
7033       // models a copy at the end of each predicated block.
7034       Cost += VF.getKnownMinValue() *
7035               TTI.getCFInstrCost(Instruction::PHI, CostKind);
7036 
7037       // The cost of the non-predicated instruction.
7038       Cost += VF.getKnownMinValue() *
7039               TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind);
7040 
7041       // The cost of insertelement and extractelement instructions needed for
7042       // scalarization.
7043       Cost += getScalarizationOverhead(I, VF);
7044 
7045       // Scale the cost by the probability of executing the predicated blocks.
7046       // This assumes the predicated block for each vector lane is equally
7047       // likely.
7048       return Cost / getReciprocalPredBlockProb();
7049     }
7050     LLVM_FALLTHROUGH;
7051   case Instruction::Add:
7052   case Instruction::FAdd:
7053   case Instruction::Sub:
7054   case Instruction::FSub:
7055   case Instruction::Mul:
7056   case Instruction::FMul:
7057   case Instruction::FDiv:
7058   case Instruction::FRem:
7059   case Instruction::Shl:
7060   case Instruction::LShr:
7061   case Instruction::AShr:
7062   case Instruction::And:
7063   case Instruction::Or:
7064   case Instruction::Xor: {
    // Since we will replace the stride by 1, the multiplication should go
    // away.
7066     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
7067       return 0;
7068 
7069     // Detect reduction patterns
7070     if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7071       return *RedCost;
7072 
    // Certain instructions can be cheaper to vectorize if they have a constant
    // second vector operand. One example of this is shifts on x86.
7075     Value *Op2 = I->getOperand(1);
7076     TargetTransformInfo::OperandValueProperties Op2VP;
7077     TargetTransformInfo::OperandValueKind Op2VK =
7078         TTI.getOperandInfo(Op2, Op2VP);
7079     if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
7080       Op2VK = TargetTransformInfo::OK_UniformValue;
7081 
7082     SmallVector<const Value *, 4> Operands(I->operand_values());
7083     return TTI.getArithmeticInstrCost(
7084         I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue,
7085         Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
7086   }
7087   case Instruction::FNeg: {
7088     return TTI.getArithmeticInstrCost(
7089         I->getOpcode(), VectorTy, CostKind, TargetTransformInfo::OK_AnyValue,
7090         TargetTransformInfo::OK_AnyValue, TargetTransformInfo::OP_None,
7091         TargetTransformInfo::OP_None, I->getOperand(0), I);
7092   }
7093   case Instruction::Select: {
7094     SelectInst *SI = cast<SelectInst>(I);
7095     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
7096     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
7097 
7098     const Value *Op0, *Op1;
7099     using namespace llvm::PatternMatch;
7100     if (!ScalarCond && (match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1))) ||
7101                         match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1))))) {
7102       // select x, y, false --> x & y
7103       // select x, true, y --> x | y
7104       TTI::OperandValueProperties Op1VP = TTI::OP_None;
7105       TTI::OperandValueProperties Op2VP = TTI::OP_None;
7106       TTI::OperandValueKind Op1VK = TTI::getOperandInfo(Op0, Op1VP);
7107       TTI::OperandValueKind Op2VK = TTI::getOperandInfo(Op1, Op2VP);
7108       assert(Op0->getType()->getScalarSizeInBits() == 1 &&
7109               Op1->getType()->getScalarSizeInBits() == 1);
7110 
7111       SmallVector<const Value *, 2> Operands{Op0, Op1};
7112       return TTI.getArithmeticInstrCost(
7113           match(I, m_LogicalOr()) ? Instruction::Or : Instruction::And, VectorTy,
7114           CostKind, Op1VK, Op2VK, Op1VP, Op2VP, Operands, I);
7115     }
7116 
7117     Type *CondTy = SI->getCondition()->getType();
7118     if (!ScalarCond)
7119       CondTy = VectorType::get(CondTy, VF);
7120 
7121     CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
7122     if (auto *Cmp = dyn_cast<CmpInst>(SI->getCondition()))
7123       Pred = Cmp->getPredicate();
7124     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, Pred,
7125                                   CostKind, I);
7126   }
7127   case Instruction::ICmp:
7128   case Instruction::FCmp: {
7129     Type *ValTy = I->getOperand(0)->getType();
7130     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
7131     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
7132       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
7133     VectorTy = ToVectorTy(ValTy, VF);
7134     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
7135                                   cast<CmpInst>(I)->getPredicate(), CostKind,
7136                                   I);
7137   }
7138   case Instruction::Store:
7139   case Instruction::Load: {
7140     ElementCount Width = VF;
7141     if (Width.isVector()) {
7142       InstWidening Decision = getWideningDecision(I, Width);
7143       assert(Decision != CM_Unknown &&
7144              "CM decision should be taken at this point");
7145       if (getWideningCost(I, VF) == InstructionCost::getInvalid())
7146         return InstructionCost::getInvalid();
7147       if (Decision == CM_Scalarize)
7148         Width = ElementCount::getFixed(1);
7149     }
7150     VectorTy = ToVectorTy(getLoadStoreType(I), Width);
7151     return getMemoryInstructionCost(I, VF);
7152   }
7153   case Instruction::BitCast:
7154     if (I->getType()->isPointerTy())
7155       return 0;
7156     LLVM_FALLTHROUGH;
7157   case Instruction::ZExt:
7158   case Instruction::SExt:
7159   case Instruction::FPToUI:
7160   case Instruction::FPToSI:
7161   case Instruction::FPExt:
7162   case Instruction::PtrToInt:
7163   case Instruction::IntToPtr:
7164   case Instruction::SIToFP:
7165   case Instruction::UIToFP:
7166   case Instruction::Trunc:
7167   case Instruction::FPTrunc: {
7168     // Computes the CastContextHint from a Load/Store instruction.
7169     auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
7170       assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
7171              "Expected a load or a store!");
7172 
7173       if (VF.isScalar() || !TheLoop->contains(I))
7174         return TTI::CastContextHint::Normal;
7175 
7176       switch (getWideningDecision(I, VF)) {
7177       case LoopVectorizationCostModel::CM_GatherScatter:
7178         return TTI::CastContextHint::GatherScatter;
7179       case LoopVectorizationCostModel::CM_Interleave:
7180         return TTI::CastContextHint::Interleave;
7181       case LoopVectorizationCostModel::CM_Scalarize:
7182       case LoopVectorizationCostModel::CM_Widen:
7183         return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked
7184                                         : TTI::CastContextHint::Normal;
7185       case LoopVectorizationCostModel::CM_Widen_Reverse:
7186         return TTI::CastContextHint::Reversed;
7187       case LoopVectorizationCostModel::CM_Unknown:
7188         llvm_unreachable("Instr did not go through cost modelling?");
7189       }
7190 
7191       llvm_unreachable("Unhandled case!");
7192     };
7193 
7194     unsigned Opcode = I->getOpcode();
7195     TTI::CastContextHint CCH = TTI::CastContextHint::None;
7196     // For Trunc, the context is the only user, which must be a StoreInst.
7197     if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
7198       if (I->hasOneUse())
7199         if (StoreInst *Store = dyn_cast<StoreInst>(*I->user_begin()))
7200           CCH = ComputeCCH(Store);
7201     }
7202     // For ZExt/SExt/FPExt, the context is the operand, a LoadInst.
7203     else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
7204              Opcode == Instruction::FPExt) {
7205       if (LoadInst *Load = dyn_cast<LoadInst>(I->getOperand(0)))
7206         CCH = ComputeCCH(Load);
7207     }
7208 
7209     // We optimize the truncation of induction variables having constant
7210     // integer steps. The cost of these truncations is the same as the scalar
7211     // operation.
7212     if (isOptimizableIVTruncate(I, VF)) {
7213       auto *Trunc = cast<TruncInst>(I);
7214       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
7215                                   Trunc->getSrcTy(), CCH, CostKind, Trunc);
7216     }
7217 
7218     // Detect reduction patterns
7219     if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7220       return *RedCost;
7221 
7222     Type *SrcScalarTy = I->getOperand(0)->getType();
7223     Type *SrcVecTy =
7224         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
7225     if (canTruncateToMinimalBitwidth(I, VF)) {
7226       // This cast is going to be shrunk. This may remove the cast or turn it
7227       // into a slightly different cast. For example, if MinBW == 16,
7228       // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
7229       //
7230       // Calculate the modified src and dest types.
7231       Type *MinVecTy = VectorTy;
7232       if (Opcode == Instruction::Trunc) {
7233         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
7234         VectorTy =
7235             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7236       } else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
7237         SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
7238         VectorTy =
7239             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
7240       }
7241     }
7242 
7243     return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
7244   }
7245   case Instruction::Call: {
7246     if (RecurrenceDescriptor::isFMulAddIntrinsic(I))
7247       if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
7248         return *RedCost;
7249     bool NeedToScalarize;
7250     CallInst *CI = cast<CallInst>(I);
7251     InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
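         // If the call can be lowered to a vector intrinsic, return the cheaper
         // of the vectorized call cost and the intrinsic cost.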
7252     if (getVectorIntrinsicIDForCall(CI, TLI)) {
7253       InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
7254       return std::min(CallCost, IntrinsicCost);
7255     }
7256     return CallCost;
7257   }
7258   case Instruction::ExtractValue:
7259     return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
7260   case Instruction::Alloca:
7261     // We cannot easily widen alloca to a scalable alloca, as
7262     // the result would need to be a vector of pointers.
7263     if (VF.isScalable())
7264       return InstructionCost::getInvalid();
7265     LLVM_FALLTHROUGH;
7266   default:
7267     // This opcode is unknown. Assume that it is the same as 'mul'.
7268     return TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy, CostKind);
7269   } // end of switch.
7270 }
7271 
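     // Legacy pass manager registration for the loop vectorizer and the
     // analyses it depends on.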
7272 char LoopVectorize::ID = 0;
7273 
7274 static const char lv_name[] = "Loop Vectorization";
7275 
7276 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
7277 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
7278 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
7279 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
7280 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
7281 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
7282 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
7283 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
7284 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
7285 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
7286 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
7287 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
7288 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
7289 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
7290 INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
7291 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
7292 
7293 namespace llvm {
7294 
7295 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
7296 
7297 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
7298                               bool VectorizeOnlyWhenForced) {
7299   return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
7300 }
7301 
7302 } // end namespace llvm
7303 
7304 void LoopVectorizationCostModel::collectValuesToIgnore() {
7305   // Ignore ephemeral values.
7306   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
7307 
7308   // Find all stores to invariant variables. Since they are going to sink
7309   // outside the loop, we do not need to calculate their cost.
7310   for (BasicBlock *BB : TheLoop->blocks())
7311     for (Instruction &I : *BB) {
7312       StoreInst *SI;
7313       if ((SI = dyn_cast<StoreInst>(&I)) &&
7314           Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
7315         ValuesToIgnore.insert(&I);
7316     }
7317 
7318   // Ignore type-promoting instructions we identified during reduction
7319   // detection.
7320   for (auto &Reduction : Legal->getReductionVars()) {
7321     const RecurrenceDescriptor &RedDes = Reduction.second;
7322     const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
7323     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7324   }
7325   // Ignore type-casting instructions we identified during induction
7326   // detection.
7327   for (auto &Induction : Legal->getInductionVars()) {
7328     const InductionDescriptor &IndDes = Induction.second;
7329     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
7330     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
7331   }
7332 }
7333 
7334 void LoopVectorizationCostModel::collectInLoopReductions() {
7335   for (auto &Reduction : Legal->getReductionVars()) {
7336     PHINode *Phi = Reduction.first;
7337     const RecurrenceDescriptor &RdxDesc = Reduction.second;
7338 
7339     // We don't collect reductions that are type promoted (yet).
7340     if (RdxDesc.getRecurrenceType() != Phi->getType())
7341       continue;
7342 
7343     // If the target would prefer this reduction to happen "in-loop", then we
7344     // want to record it as such.
7345     unsigned Opcode = RdxDesc.getOpcode();
7346     if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
7347         !TTI.preferInLoopReduction(Opcode, Phi->getType(),
7348                                    TargetTransformInfo::ReductionFlags()))
7349       continue;
7350 
7351     // Check that we can correctly put the reductions into the loop, by
7352     // finding the chain of operations that leads from the phi to the loop
7353     // exit value.
7354     SmallVector<Instruction *, 4> ReductionOperations =
7355         RdxDesc.getReductionOpChain(Phi, TheLoop);
7356     bool InLoop = !ReductionOperations.empty();
7357     if (InLoop) {
7358       InLoopReductionChains[Phi] = ReductionOperations;
7359       // Add the elements to InLoopReductionImmediateChains for cost modelling.
7360       Instruction *LastChain = Phi;
7361       for (auto *I : ReductionOperations) {
7362         InLoopReductionImmediateChains[I] = LastChain;
7363         LastChain = I;
7364       }
7365     }
7366     LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
7367                       << " reduction for phi: " << *Phi << "\n");
7368   }
7369 }
7370 
7371 // TODO: we could return a pair of values that specify the max VF and
7372 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
7373 // `buildVPlans(VF, VF)`. We cannot do it because VPlan at the moment
7374 // doesn't have a cost model that can choose which plan to execute if
7375 // more than one is generated.
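     // Determine a VF for the VPlan-native path as the number of lanes of the
     // widest scalar type in the loop that fit in the widest vector register.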
7376 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
7377                                  LoopVectorizationCostModel &CM) {
7378   unsigned WidestType;
7379   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
7380   return WidestVectorRegBits / WidestType;
7381 }
7382 
7383 VectorizationFactor
7384 LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
7385   assert(!UserVF.isScalable() && "scalable vectors not yet supported");
7386   ElementCount VF = UserVF;
7387   // Outer loop handling: outer loops may require CFG and instruction-level
7388   // transformations before their vectorization profitability can be evaluated.
7389   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7390   // the vectorization pipeline.
7391   if (!OrigLoop->isInnermost()) {
7392     // If the user doesn't provide a vectorization factor, determine a
7393     // reasonable one.
7394     if (UserVF.isZero()) {
7395       VF = ElementCount::getFixed(determineVPlanVF(
7396           TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
7397               .getFixedSize(),
7398           CM));
7399       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
7400 
7401       // Make sure we have a VF > 1 for stress testing.
7402       if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
7403         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
7404                           << "overriding computed VF.\n");
7405         VF = ElementCount::getFixed(4);
7406       }
7407     }
7408     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7409     assert(isPowerOf2_32(VF.getKnownMinValue()) &&
7410            "VF needs to be a power of two");
7411     LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
7412                       << "VF " << VF << " to build VPlans.\n");
7413     buildVPlans(VF, VF);
7414 
7415     // For VPlan build stress testing, we bail out after VPlan construction.
7416     if (VPlanBuildStressTest)
7417       return VectorizationFactor::Disabled();
7418 
7419     return {VF, 0 /*Cost*/, 0 /* ScalarCost */};
7420   }
7421 
7422   LLVM_DEBUG(
7423       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
7424                 "VPlan-native path.\n");
7425   return VectorizationFactor::Disabled();
7426 }
7427 
7428 Optional<VectorizationFactor>
7429 LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
7430   assert(OrigLoop->isInnermost() && "Inner loop expected.");
7431   FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
7432   if (!MaxFactors) // Cases that should not be vectorized nor interleaved.
7433     return None;
7434 
7435   // Invalidate interleave groups if all blocks of loop will be predicated.
7436   if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
7437       !useMaskedInterleavedAccesses(*TTI)) {
7438     LLVM_DEBUG(
7439         dbgs()
7440         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
7441            "which requires masked-interleaved support.\n");
7442     if (CM.InterleaveInfo.invalidateGroups())
7443       // Invalidating interleave groups also requires invalidating all decisions
7444       // based on them, which includes widening decisions and uniform and scalar
7445       // values.
7446       CM.invalidateCostModelingDecisions();
7447   }
7448 
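       // A user-provided VF is only used if it does not exceed the maximum VF
       // computed for the matching kind (fixed-width or scalable).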
7449   ElementCount MaxUserVF =
7450       UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
7451   bool UserVFIsLegal = ElementCount::isKnownLE(UserVF, MaxUserVF);
7452   if (!UserVF.isZero() && UserVFIsLegal) {
7453     assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
7454            "VF needs to be a power of two");
7455     // Collect the instructions (and their associated costs) that will be more
7456     // profitable to scalarize.
7457     if (CM.selectUserVectorizationFactor(UserVF)) {
7458       LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
7459       CM.collectInLoopReductions();
7460       buildVPlansWithVPRecipes(UserVF, UserVF);
7461       LLVM_DEBUG(printPlans(dbgs()));
7462       return {{UserVF, 0, 0}};
7463     } else
7464       reportVectorizationInfo("UserVF ignored because of invalid costs.",
7465                               "InvalidCost", ORE, OrigLoop);
7466   }
7467 
7468   // Populate the set of Vectorization Factor Candidates.
7469   ElementCountSet VFCandidates;
7470   for (auto VF = ElementCount::getFixed(1);
7471        ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2)
7472     VFCandidates.insert(VF);
7473   for (auto VF = ElementCount::getScalable(1);
7474        ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2)
7475     VFCandidates.insert(VF);
7476 
7477   for (const auto &VF : VFCandidates) {
7478     // Collect Uniform and Scalar instructions after vectorization with VF.
7479     CM.collectUniformsAndScalars(VF);
7480 
7481     // Collect the instructions (and their associated costs) that will be more
7482     // profitable to scalarize.
7483     if (VF.isVector())
7484       CM.collectInstsToScalarize(VF);
7485   }
7486 
7487   CM.collectInLoopReductions();
7488   buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
7489   buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
7490 
7491   LLVM_DEBUG(printPlans(dbgs()));
7492   if (!MaxFactors.hasVector())
7493     return VectorizationFactor::Disabled();
7494 
7495   // Select the optimal vectorization factor.
7496   VectorizationFactor VF = CM.selectVectorizationFactor(VFCandidates);
7497   assert((VF.Width.isScalar() || VF.ScalarCost > 0) &&
              "when vectorizing, the scalar cost must be non-zero.");
7498   return VF;
7499 }
7500 
7501 VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const {
7502   assert(count_if(VPlans,
7503                   [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) ==
7504              1 &&
7505          "Best VF does not have a single VPlan.");
7506 
7507   for (const VPlanPtr &Plan : VPlans) {
7508     if (Plan->hasVF(VF))
7509       return *Plan.get();
7510   }
7511   llvm_unreachable("No plan found!");
7512 }
7513 
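     // Add metadata to the given loop that disables runtime unrolling, unless
     // unroll-disabling metadata is already present.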
7514 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
7515   SmallVector<Metadata *, 4> MDs;
7516   // Reserve first location for self reference to the LoopID metadata node.
7517   MDs.push_back(nullptr);
7518   bool IsUnrollMetadata = false;
7519   MDNode *LoopID = L->getLoopID();
7520   if (LoopID) {
7521     // First find existing loop unrolling disable metadata.
7522     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
7523       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
7524       if (MD) {
7525         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
7526         IsUnrollMetadata =
7527             S && S->getString().startswith("llvm.loop.unroll.disable");
7528       }
7529       MDs.push_back(LoopID->getOperand(i));
7530     }
7531   }
7532 
7533   if (!IsUnrollMetadata) {
7534     // Add runtime unroll disable metadata.
7535     LLVMContext &Context = L->getHeader()->getContext();
7536     SmallVector<Metadata *, 1> DisableOperands;
7537     DisableOperands.push_back(
7538         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
7539     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
7540     MDs.push_back(DisableNode);
7541     MDNode *NewLoopID = MDNode::get(Context, MDs);
7542     // Set operand 0 to refer to the loop id itself.
7543     NewLoopID->replaceOperandWith(0, NewLoopID);
7544     L->setLoopID(NewLoopID);
7545   }
7546 }
7547 
7548 void LoopVectorizationPlanner::executePlan(ElementCount BestVF, unsigned BestUF,
7549                                            VPlan &BestVPlan,
7550                                            InnerLoopVectorizer &ILV,
7551                                            DominatorTree *DT,
7552                                            bool IsEpilogueVectorization) {
7553   LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF
7554                     << ", UF=" << BestUF << '\n');
7555 
7556   // Perform the actual loop transformation.
7557 
7558   // 1. Set up the skeleton for vectorization, including vector pre-header and
7559   // middle block. The vector loop is created during VPlan execution.
7560   VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan};
7561   Value *CanonicalIVStartValue;
7562   std::tie(State.CFG.PrevBB, CanonicalIVStartValue) =
7563       ILV.createVectorizedLoopSkeleton();
7564 
7565   // Only use noalias metadata when using memory checks guaranteeing no overlap
7566   // across all iterations.
7567   const LoopAccessInfo *LAI = ILV.Legal->getLAI();
7568   if (LAI && !LAI->getRuntimePointerChecking()->getChecks().empty() &&
7569       !LAI->getRuntimePointerChecking()->getDiffChecks()) {
7570 
7571     //  We currently don't use LoopVersioning for the actual loop cloning but we
7572     //  still use it to add the noalias metadata.
7573     //  TODO: Find a better way to re-use LoopVersioning functionality to add
7574     //        metadata.
7575     State.LVer = std::make_unique<LoopVersioning>(
7576         *LAI, LAI->getRuntimePointerChecking()->getChecks(), OrigLoop, LI, DT,
7577         PSE.getSE());
7578     State.LVer->prepareNoAliasMetadata();
7579   }
7580 
7581   ILV.collectPoisonGeneratingRecipes(State);
7582 
7583   ILV.printDebugTracesAtStart();
7584 
7585   //===------------------------------------------------===//
7586   //
7587   // Notice: any optimization or new instruction that goes
7588   // into the code below should also be implemented in
7589   // the cost-model.
7590   //
7591   //===------------------------------------------------===//
7592 
7593   // 2. Copy and widen instructions from the old loop into the new loop.
7594   BestVPlan.prepareToExecute(ILV.getOrCreateTripCount(nullptr),
7595                              ILV.getOrCreateVectorTripCount(nullptr),
7596                              CanonicalIVStartValue, State,
7597                              IsEpilogueVectorization);
7598 
7599   BestVPlan.execute(&State);
7600 
7601   // Keep all loop hints from the original loop on the vector loop (we'll
7602   // replace the vectorizer-specific hints below).
7603   MDNode *OrigLoopID = OrigLoop->getLoopID();
7604 
7605   Optional<MDNode *> VectorizedLoopID =
7606       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
7607                                       LLVMLoopVectorizeFollowupVectorized});
7608 
7609   VPBasicBlock *HeaderVPBB =
7610       BestVPlan.getVectorLoopRegion()->getEntryBasicBlock();
7611   Loop *L = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]);
7612   if (VectorizedLoopID)
7613     L->setLoopID(VectorizedLoopID.value());
7614   else {
7615     // No followup metadata was created. Keep the original loop's hints on
7616     // the vector loop and mark it as already vectorized below.
7617     if (MDNode *LID = OrigLoop->getLoopID())
7618       L->setLoopID(LID);
7619 
7620     LoopVectorizeHints Hints(L, true, *ORE);
7621     Hints.setAlreadyVectorized();
7622   }
7623   // Disable runtime unrolling when vectorizing the epilogue loop.
7624   if (CanonicalIVStartValue)
7625     AddRuntimeUnrollDisableMetaData(L);
7626 
7627   // 3. Fix the vectorized code: take care of header phi's, live-outs,
7628   //    predication, updating analyses.
7629   ILV.fixVectorizedLoop(State, BestVPlan);
7630 
7631   ILV.printDebugTracesAtEnd();
7632 }
7633 
7634 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
7635 void LoopVectorizationPlanner::printPlans(raw_ostream &O) {
7636   for (const auto &Plan : VPlans)
7637     if (PrintVPlansInDotFormat)
7638       Plan->printDOT(O);
7639     else
7640       Plan->print(O);
7641 }
7642 #endif
7643 
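     // The inner loop unroller operates with VF = 1, so scalar values need no
     // broadcast and can be returned unchanged.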
7644 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
7645 
7646 //===--------------------------------------------------------------------===//
7647 // EpilogueVectorizerMainLoop
7648 //===--------------------------------------------------------------------===//
7649 
7650 /// This function is partially responsible for generating the control flow
7651 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7652 std::pair<BasicBlock *, Value *>
7653 EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
7654   MDNode *OrigLoopID = OrigLoop->getLoopID();
7655 
7656   // Workaround!  Compute the trip count of the original loop and cache it
7657   // before we start modifying the CFG.  This code has a systemic problem
7658   // wherein it tries to run analysis over partially constructed IR; this is
7659   // wrong, and not simply for SCEV.  The trip count of the original loop
7660   // simply happens to be prone to hitting this in practice.  In theory, we
7661   // can hit the same issue for any SCEV, or ValueTracking query done during
7662   // mutation.  See PR49900.
7663   getOrCreateTripCount(OrigLoop->getLoopPreheader());
7664   createVectorLoopSkeleton("");
7665 
7666   // Generate the code to check the minimum iteration count of the vector
7667   // epilogue (see below).
7668   EPI.EpilogueIterationCountCheck =
7669       emitIterationCountCheck(LoopScalarPreHeader, true);
7670   EPI.EpilogueIterationCountCheck->setName("iter.check");
7671 
7672   // Generate the code to check any assumptions that we've made for SCEV
7673   // expressions.
7674   EPI.SCEVSafetyCheck = emitSCEVChecks(LoopScalarPreHeader);
7675 
7676   // Generate the code that checks at runtime if arrays overlap. We put the
7677   // checks into a separate block to make the more common case of few elements
7678   // faster.
7679   EPI.MemSafetyCheck = emitMemRuntimeChecks(LoopScalarPreHeader);
7680 
7681   // Generate the iteration count check for the main loop, *after* the check
7682   // for the epilogue loop, so that the path-length is shorter for the case
7683   // that goes directly through the vector epilogue. The longer-path length for
7684   // the main loop is compensated for by the gain from vectorizing the larger
7685   // trip count. Note: the branch will get updated later on when we vectorize
7686   // the epilogue.
7687   EPI.MainLoopIterationCountCheck =
7688       emitIterationCountCheck(LoopScalarPreHeader, false);
7689 
7690   // Generate the induction variable.
7691   EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
7692 
7693   // Skip creating induction resume values here; they will be created in the
7694   // second pass. If we created them here, they wouldn't be used anyway,
7695   // because the VPlan in the second pass still contains the inductions from
7696   // the original loop.
7697 
7698   return {completeLoopSkeleton(OrigLoopID), nullptr};
7699 }
7700 
7701 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
7702   LLVM_DEBUG({
7703     dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
7704            << "Main Loop VF:" << EPI.MainLoopVF
7705            << ", Main Loop UF:" << EPI.MainLoopUF
7706            << ", Epilogue Loop VF:" << EPI.EpilogueVF
7707            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7708   });
7709 }
7710 
7711 void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
7712   DEBUG_WITH_TYPE(VerboseDebug, {
7713     dbgs() << "intermediate fn:\n"
7714            << *OrigLoop->getHeader()->getParent() << "\n";
7715   });
7716 }
7717 
7718 BasicBlock *
7719 EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
7720                                                     bool ForEpilogue) {
7721   assert(Bypass && "Expected valid bypass basic block.");
7722   ElementCount VFactor = ForEpilogue ? EPI.EpilogueVF : VF;
7723   unsigned UFactor = ForEpilogue ? EPI.EpilogueUF : UF;
7724   Value *Count = getOrCreateTripCount(LoopVectorPreHeader);
7725   // Reuse existing vector loop preheader for TC checks.
7726   // Note that new preheader block is generated for vector loop.
7727   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
7728   IRBuilder<> Builder(TCCheckBlock->getTerminator());
7729 
7730   // Generate code to check if the loop's trip count is less than VF * UF of the
7731   // main vector loop.
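       // When a scalar epilogue is required, at least one iteration must remain
       // for it, so the check uses ULE (bypass the vector loop even when the
       // trip count equals the step) rather than ULT.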
7732   auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF : VF) ?
7733       ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
7734 
7735   Value *CheckMinIters = Builder.CreateICmp(
7736       P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor),
7737       "min.iters.check");
7738 
7739   if (!ForEpilogue)
7740     TCCheckBlock->setName("vector.main.loop.iter.check");
7741 
7742   // Create new preheader for vector loop.
7743   LoopVectorPreHeader = SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(),
7744                                    DT, LI, nullptr, "vector.ph");
7745 
7746   if (ForEpilogue) {
7747     assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
7748                                  DT->getNode(Bypass)->getIDom()) &&
7749            "TC check is expected to dominate Bypass");
7750 
7751     // Update dominator for Bypass & LoopExit.
7752     DT->changeImmediateDominator(Bypass, TCCheckBlock);
7753     if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF))
7754       // For loops with multiple exits, there's no edge from the middle block
7755       // to exit blocks (as the epilogue must run) and thus no need to update
7756       // the immediate dominator of the exit blocks.
7757       DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
7758 
7759     LoopBypassBlocks.push_back(TCCheckBlock);
7760 
7761     // Save the trip count so we don't have to regenerate it in the
7762     // vec.epilog.iter.check. This is safe to do because the trip count
7763     // generated here dominates the vector epilog iter check.
7764     EPI.TripCount = Count;
7765   }
7766 
7767   ReplaceInstWithInst(
7768       TCCheckBlock->getTerminator(),
7769       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
7770 
7771   return TCCheckBlock;
7772 }
7773 
7774 //===--------------------------------------------------------------------===//
7775 // EpilogueVectorizerEpilogueLoop
7776 //===--------------------------------------------------------------------===//
7777 
7778 /// This function is partially responsible for generating the control flow
7779 /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7780 std::pair<BasicBlock *, Value *>
7781 EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
7782   MDNode *OrigLoopID = OrigLoop->getLoopID();
7783   createVectorLoopSkeleton("vec.epilog.");
7784 
7785   // Now, compare the remaining count; if there aren't enough iterations to
7786   // execute the vectorized epilogue, skip to the scalar part.
7787   BasicBlock *VecEpilogueIterationCountCheck = LoopVectorPreHeader;
7788   VecEpilogueIterationCountCheck->setName("vec.epilog.iter.check");
7789   LoopVectorPreHeader =
7790       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
7791                  LI, nullptr, "vec.epilog.ph");
7792   emitMinimumVectorEpilogueIterCountCheck(LoopScalarPreHeader,
7793                                           VecEpilogueIterationCountCheck);
7794 
7795   // Adjust the control flow taking the state info from the main loop
7796   // vectorization into account.
7797   assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
7798          "expected this to be saved from the previous pass.");
7799   EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
7800       VecEpilogueIterationCountCheck, LoopVectorPreHeader);
7801 
7802   DT->changeImmediateDominator(LoopVectorPreHeader,
7803                                EPI.MainLoopIterationCountCheck);
7804 
7805   EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
7806       VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7807 
7808   if (EPI.SCEVSafetyCheck)
7809     EPI.SCEVSafetyCheck->getTerminator()->replaceUsesOfWith(
7810         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7811   if (EPI.MemSafetyCheck)
7812     EPI.MemSafetyCheck->getTerminator()->replaceUsesOfWith(
7813         VecEpilogueIterationCountCheck, LoopScalarPreHeader);
7814 
7815   DT->changeImmediateDominator(
7816       VecEpilogueIterationCountCheck,
7817       VecEpilogueIterationCountCheck->getSinglePredecessor());
7818 
7819   DT->changeImmediateDominator(LoopScalarPreHeader,
7820                                EPI.EpilogueIterationCountCheck);
7821   if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF))
7822     // If there is an epilogue which must run, there's no edge from the
7823     // middle block to exit blocks and thus no need to update the immediate
7824     // dominator of the exit blocks.
7825     DT->changeImmediateDominator(LoopExitBlock,
7826                                  EPI.EpilogueIterationCountCheck);
7827 
7828   // Keep track of bypass blocks, as they feed start values to the induction
7829   // phis in the scalar loop preheader.
7830   if (EPI.SCEVSafetyCheck)
7831     LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
7832   if (EPI.MemSafetyCheck)
7833     LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
7834   LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
7835 
7836   // The vec.epilog.iter.check block may contain Phi nodes from reductions which
7837   // merge control-flow from the latch block and the middle block. Update the
7838   // incoming values here and move the Phi into the preheader.
7839   SmallVector<PHINode *, 4> PhisInBlock;
7840   for (PHINode &Phi : VecEpilogueIterationCountCheck->phis())
7841     PhisInBlock.push_back(&Phi);
7842 
7843   for (PHINode *Phi : PhisInBlock) {
7844     Phi->replaceIncomingBlockWith(
7845         VecEpilogueIterationCountCheck->getSinglePredecessor(),
7846         VecEpilogueIterationCountCheck);
7847     Phi->removeIncomingValue(EPI.EpilogueIterationCountCheck);
7848     if (EPI.SCEVSafetyCheck)
7849       Phi->removeIncomingValue(EPI.SCEVSafetyCheck);
7850     if (EPI.MemSafetyCheck)
7851       Phi->removeIncomingValue(EPI.MemSafetyCheck);
7852     Phi->moveBefore(LoopVectorPreHeader->getFirstNonPHI());
7853   }
7854 
7855   // Generate a resume induction for the vector epilogue and put it in the
7856   // vector epilogue preheader.
7857   Type *IdxTy = Legal->getWidestInductionType();
7858   PHINode *EPResumeVal = PHINode::Create(IdxTy, 2, "vec.epilog.resume.val",
7859                                          LoopVectorPreHeader->getFirstNonPHI());
7860   EPResumeVal->addIncoming(EPI.VectorTripCount, VecEpilogueIterationCountCheck);
7861   EPResumeVal->addIncoming(ConstantInt::get(IdxTy, 0),
7862                            EPI.MainLoopIterationCountCheck);
7863 
7864   // Generate induction resume values. These variables save the new starting
7865   // indexes for the scalar loop. They are used to test if there are any tail
7866   // iterations left once the vector loop has completed.
7867   // Note that when the vectorized epilogue is skipped due to iteration count
7868   // check, then the resume value for the induction variable comes from
7869   // the trip count of the main vector loop, hence passing the AdditionalBypass
7870   // argument.
7871   createInductionResumeValues({VecEpilogueIterationCountCheck,
7872                                EPI.VectorTripCount} /* AdditionalBypass */);
7873 
7874   return {completeLoopSkeleton(OrigLoopID), EPResumeVal};
7875 }
7876 
7877 BasicBlock *
7878 EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
7879     BasicBlock *Bypass, BasicBlock *Insert) {
7880 
7881   assert(EPI.TripCount &&
7882          "Expected trip count to have been saved in the first pass.");
7883   assert(
7884       (!isa<Instruction>(EPI.TripCount) ||
7885        DT->dominates(cast<Instruction>(EPI.TripCount)->getParent(), Insert)) &&
7886       "saved trip count does not dominate insertion point.");
7887   Value *TC = EPI.TripCount;
7888   IRBuilder<> Builder(Insert->getTerminator());
7889   Value *Count = Builder.CreateSub(TC, EPI.VectorTripCount, "n.vec.remaining");
7890 
7891   // Generate code to check if the loop's trip count is less than VF * UF of the
7892   // vector epilogue loop.
7893   auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ?
7894       ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT;
7895 
7896   Value *CheckMinIters =
7897       Builder.CreateICmp(P, Count,
7898                          createStepForVF(Builder, Count->getType(),
7899                                          EPI.EpilogueVF, EPI.EpilogueUF),
7900                          "min.epilog.iters.check");
7901 
7902   ReplaceInstWithInst(
7903       Insert->getTerminator(),
7904       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
7905 
7906   LoopBypassBlocks.push_back(Insert);
7907   return Insert;
7908 }
7909 
7910 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
7911   LLVM_DEBUG({
7912     dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
7913            << "Epilogue Loop VF:" << EPI.EpilogueVF
7914            << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7915   });
7916 }
7917 
7918 void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
7919   DEBUG_WITH_TYPE(VerboseDebug, {
7920     dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
7921   });
7922 }
7923 
7924 bool LoopVectorizationPlanner::getDecisionAndClampRange(
7925     const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
7926   assert(!Range.isEmpty() && "Trying to test an empty VF range.");
7927   bool PredicateAtRangeStart = Predicate(Range.Start);
7928 
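       // Clamp the range at the first VF whose decision differs from the
       // decision at Range.Start; all VFs remaining in the range then share the
       // same decision.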
7929   for (ElementCount TmpVF = Range.Start * 2;
7930        ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2)
7931     if (Predicate(TmpVF) != PredicateAtRangeStart) {
7932       Range.End = TmpVF;
7933       break;
7934     }
7935 
7936   return PredicateAtRangeStart;
7937 }
7938 
7939 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
7940 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
7941 /// of VF's starting at a given VF and extending it as much as possible. Each
7942 /// vectorization decision can potentially shorten this sub-range during
7943 /// buildVPlan().
7944 void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
7945                                            ElementCount MaxVF) {
7946   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
7947   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
7948     VFRange SubRange = {VF, MaxVFPlusOne};
7949     VPlans.push_back(buildVPlan(SubRange));
7950     VF = SubRange.End;
7951   }
7952 }
7953 
7954 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
7955                                          VPlanPtr &Plan) {
7956   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
7957 
7958   // Look for cached value.
7959   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
7960   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
7961   if (ECEntryIt != EdgeMaskCache.end())
7962     return ECEntryIt->second;
7963 
7964   VPValue *SrcMask = createBlockInMask(Src, Plan);
7965 
7966   // The terminator has to be a branch inst!
7967   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
7968   assert(BI && "Unexpected terminator found");
7969 
7970   if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
7971     return EdgeMaskCache[Edge] = SrcMask;
7972 
7973   // If source is an exiting block, we know the exit edge is dynamically dead
7974   // in the vector loop, and thus we don't need to restrict the mask.  Avoid
7975   // adding uses of an otherwise potentially dead instruction.
7976   if (OrigLoop->isLoopExiting(Src))
7977     return EdgeMaskCache[Edge] = SrcMask;
7978 
7979   VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition());
7980   assert(EdgeMask && "No Edge Mask found for condition");
7981 
7982   if (BI->getSuccessor(0) != Dst)
7983     EdgeMask = Builder.createNot(EdgeMask, BI->getDebugLoc());
7984 
7985   if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
7986     // The condition is 'SrcMask && EdgeMask', which is equivalent to
7987     // 'select i1 SrcMask, i1 EdgeMask, i1 false'.
7988     // The select version does not introduce new UB if SrcMask is false and
7989     // EdgeMask is poison. Using 'and' here introduces undefined behavior.
7990     VPValue *False = Plan->getOrAddVPValue(
7991         ConstantInt::getFalse(BI->getCondition()->getType()));
7992     EdgeMask =
7993         Builder.createSelect(SrcMask, EdgeMask, False, BI->getDebugLoc());
7994   }
7995 
7996   return EdgeMaskCache[Edge] = EdgeMask;
7997 }
7998 
7999 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
8000   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
8001 
8002   // Look for cached value.
8003   BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
8004   if (BCEntryIt != BlockMaskCache.end())
8005     return BCEntryIt->second;
8006 
8007   // All-one mask is modelled as no-mask following the convention for masked
8008   // load/store/gather/scatter. Initialize BlockMask to no-mask.
8009   VPValue *BlockMask = nullptr;
8010 
8011   if (OrigLoop->getHeader() == BB) {
8012     if (!CM.blockNeedsPredicationForAnyReason(BB))
8013       return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
8014 
8015     assert(CM.foldTailByMasking() && "must fold the tail");
8016 
8017     // If we're using the active lane mask for control flow, then we get the
8018     // mask from the active lane mask PHI that is cached in the VPlan.
8019     PredicationStyle EmitGetActiveLaneMask = CM.TTI.emitGetActiveLaneMask();
8020     if (EmitGetActiveLaneMask == PredicationStyle::DataAndControlFlow)
8021       return BlockMaskCache[BB] = Plan->getActiveLaneMaskPhi();
8022 
8023     // Introduce the early-exit compare IV <= BTC to form header block mask.
8024     // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by
8025     // constructing the desired canonical IV in the header block as its first
8026     // non-phi instructions.
8027 
8028     VPBasicBlock *HeaderVPBB =
8029         Plan->getVectorLoopRegion()->getEntryBasicBlock();
8030     auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
8031     auto *IV = new VPWidenCanonicalIVRecipe(Plan->getCanonicalIV());
8032     HeaderVPBB->insert(IV, HeaderVPBB->getFirstNonPhi());
8033 
8034     VPBuilder::InsertPointGuard Guard(Builder);
8035     Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
8036     if (EmitGetActiveLaneMask != PredicationStyle::None) {
8037       VPValue *TC = Plan->getOrCreateTripCount();
8038       BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, TC},
8039                                        nullptr, "active.lane.mask");
8040     } else {
8041       VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
8042       BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
8043     }
8044     return BlockMaskCache[BB] = BlockMask;
8045   }
8046 
8047   // This is the block mask. We OR all incoming edges.
8048   for (auto *Predecessor : predecessors(BB)) {
8049     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
8050     if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
8051       return BlockMaskCache[BB] = EdgeMask;
8052 
8053     if (!BlockMask) { // BlockMask has its initialized nullptr value.
8054       BlockMask = EdgeMask;
8055       continue;
8056     }
8057 
8058     BlockMask = Builder.createOr(BlockMask, EdgeMask, {});
8059   }
8060 
8061   return BlockMaskCache[BB] = BlockMask;
8062 }
8063 
8064 VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
8065                                                 ArrayRef<VPValue *> Operands,
8066                                                 VFRange &Range,
8067                                                 VPlanPtr &Plan) {
8068   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
8069          "Must be called with either a load or store");
8070 
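       // The access is widened (possibly as an interleave-group member) unless
       // the cost model decided to scalarize it for the given VF.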
8071   auto willWiden = [&](ElementCount VF) -> bool {
8072     LoopVectorizationCostModel::InstWidening Decision =
8073         CM.getWideningDecision(I, VF);
8074     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
8075            "CM decision should be taken at this point.");
8076     if (Decision == LoopVectorizationCostModel::CM_Interleave)
8077       return true;
8078     if (CM.isScalarAfterVectorization(I, VF) ||
8079         CM.isProfitableToScalarize(I, VF))
8080       return false;
8081     return Decision != LoopVectorizationCostModel::CM_Scalarize;
8082   };
8083 
8084   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8085     return nullptr;
8086 
8087   VPValue *Mask = nullptr;
8088   if (Legal->isMaskRequired(I))
8089     Mask = createBlockInMask(I->getParent(), Plan);
8090 
8091   // Determine if the pointer operand of the access is either consecutive or
8092   // reverse consecutive.
8093   LoopVectorizationCostModel::InstWidening Decision =
8094       CM.getWideningDecision(I, Range.Start);
8095   bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
8096   bool Consecutive =
8097       Reverse || Decision == LoopVectorizationCostModel::CM_Widen;
8098 
8099   if (LoadInst *Load = dyn_cast<LoadInst>(I))
8100     return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask,
8101                                               Consecutive, Reverse);
8102 
8103   StoreInst *Store = cast<StoreInst>(I);
8104   return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0],
8105                                             Mask, Consecutive, Reverse);
8106 }
8107 
8108 /// Creates a VPWidenIntOrFpInductionRecipe for \p Phi. If needed, it will also
8109 /// insert a recipe to expand the step for the induction recipe.
8110 static VPWidenIntOrFpInductionRecipe *createWidenInductionRecipes(
8111     PHINode *Phi, Instruction *PhiOrTrunc, VPValue *Start,
8112     const InductionDescriptor &IndDesc, LoopVectorizationCostModel &CM,
8113     VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop, VFRange &Range) {
8114   // Returns true if an instruction \p I should be scalarized instead of
8115   // vectorized for the chosen vectorization factor.
8116   auto ShouldScalarizeInstruction = [&CM](Instruction *I, ElementCount VF) {
8117     return CM.isScalarAfterVectorization(I, VF) ||
8118            CM.isProfitableToScalarize(I, VF);
8119   };
8120 
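       // Only a scalar IV is needed if the phi (or its truncating user) is
       // scalar after vectorization for all VFs in the range.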
8121   bool NeedsScalarIVOnly = LoopVectorizationPlanner::getDecisionAndClampRange(
8122       [&](ElementCount VF) {
8123         return ShouldScalarizeInstruction(PhiOrTrunc, VF);
8124       },
8125       Range);
8126   assert(IndDesc.getStartValue() ==
8127          Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()));
8128   assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) &&
8129          "step must be loop invariant");
8130 
8131   VPValue *Step =
8132       vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep(), SE);
8133   if (auto *TruncI = dyn_cast<TruncInst>(PhiOrTrunc)) {
8134     return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc, TruncI,
8135                                              !NeedsScalarIVOnly);
8136   }
8137   assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here");
8138   return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, IndDesc,
8139                                            !NeedsScalarIVOnly);
8140 }
8141 
8142 VPRecipeBase *VPRecipeBuilder::tryToOptimizeInductionPHI(
8143     PHINode *Phi, ArrayRef<VPValue *> Operands, VPlan &Plan, VFRange &Range) {
8144 
8145   // Check if this is an integer or fp induction. If so, build the recipe that
8146   // produces its scalar and vector values.
8147   if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi))
8148     return createWidenInductionRecipes(Phi, Phi, Operands[0], *II, CM, Plan,
8149                                        *PSE.getSE(), *OrigLoop, Range);
8150 
8151   // Check if this is pointer induction. If so, build the recipe for it.
8152   if (auto *II = Legal->getPointerInductionDescriptor(Phi))
8153     return new VPWidenPointerInductionRecipe(Phi, Operands[0], *II,
8154                                              *PSE.getSE());
8155   return nullptr;
8156 }
8157 
8158 VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
8159     TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, VPlan &Plan) {
8160   // Optimize the special case where the source is a constant integer
8161   // induction variable. Notice that we can only optimize the 'trunc' case
8162   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
8163   // (c) other casts depend on pointer size.
8164 
8165   // Determine whether \p K is a truncation based on an induction variable that
8166   // can be optimized.
8167   auto isOptimizableIVTruncate =
8168       [&](Instruction *K) -> std::function<bool(ElementCount)> {
8169     return [=](ElementCount VF) -> bool {
8170       return CM.isOptimizableIVTruncate(K, VF);
8171     };
8172   };
8173 
8174   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8175           isOptimizableIVTruncate(I), Range)) {
8176 
8177     auto *Phi = cast<PHINode>(I->getOperand(0));
8178     const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi);
8179     VPValue *Start = Plan.getOrAddVPValue(II.getStartValue());
8180     return createWidenInductionRecipes(Phi, I, Start, II, CM, Plan,
8181                                        *PSE.getSE(), *OrigLoop, Range);
8182   }
8183   return nullptr;
8184 }
8185 
8186 VPRecipeOrVPValueTy VPRecipeBuilder::tryToBlend(PHINode *Phi,
8187                                                 ArrayRef<VPValue *> Operands,
8188                                                 VPlanPtr &Plan) {
8189   // If all incoming values are equal, the incoming VPValue can be used directly
8190   // instead of creating a new VPBlendRecipe.
8191   VPValue *FirstIncoming = Operands[0];
8192   if (all_of(Operands, [FirstIncoming](const VPValue *Inc) {
8193         return FirstIncoming == Inc;
8194       })) {
8195     return Operands[0];
8196   }
8197 
8198   unsigned NumIncoming = Phi->getNumIncomingValues();
8199   // For in-loop reductions, we do not need to create an additional select.
8200   VPValue *InLoopVal = nullptr;
8201   for (unsigned In = 0; In < NumIncoming; In++) {
8202     PHINode *PhiOp =
8203         dyn_cast_or_null<PHINode>(Operands[In]->getUnderlyingValue());
8204     if (PhiOp && CM.isInLoopReduction(PhiOp)) {
8205       assert(!InLoopVal && "Found more than one in-loop reduction!");
8206       InLoopVal = Operands[In];
8207     }
8208   }
8209 
8210   assert((!InLoopVal || NumIncoming == 2) &&
8211          "Found an in-loop reduction for PHI with unexpected number of "
8212          "incoming values");
8213   if (InLoopVal)
8214     return Operands[Operands[0] == InLoopVal ? 1 : 0];
8215 
8216   // We know that all PHIs in non-header blocks are converted into selects, so
8217   // we don't have to worry about the insertion order and we can just use the
8218   // builder. At this point we generate the predication tree. There may be
8219   // duplications since this is a simple recursive scan, but future
8220   // optimizations will clean it up.
8221   SmallVector<VPValue *, 2> OperandsWithMask;
8222 
8223   for (unsigned In = 0; In < NumIncoming; In++) {
8224     VPValue *EdgeMask =
8225       createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
8226     assert((EdgeMask || NumIncoming == 1) &&
8227            "Multiple predecessors with one having a full mask");
8228     OperandsWithMask.push_back(Operands[In]);
8229     if (EdgeMask)
8230       OperandsWithMask.push_back(EdgeMask);
8231   }
8232   return toVPRecipeResult(new VPBlendRecipe(Phi, OperandsWithMask));
8233 }
8234 
8235 VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
8236                                                    ArrayRef<VPValue *> Operands,
8237                                                    VFRange &Range) const {
8238 
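       // Calls that the cost model decided to scalarize and predicate are not
       // widened here; they are handled by the replication path instead.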
8239   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8240       [this, CI](ElementCount VF) {
8241         return CM.isScalarWithPredication(CI, VF);
8242       },
8243       Range);
8244 
8245   if (IsPredicated)
8246     return nullptr;
8247 
8248   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8249   if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
8250              ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
8251              ID == Intrinsic::pseudoprobe ||
8252              ID == Intrinsic::experimental_noalias_scope_decl))
8253     return nullptr;
8254 
8255   auto willWiden = [&](ElementCount VF) -> bool {
8256     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
8257     // The following case may be scalarized depending on the VF.
8258     // NeedToScalarize indicates whether the vectorized library call would
8259     // have to be scalarized; the cost comparison below decides whether a
8260     // vector intrinsic is preferable to that call.
8261     bool NeedToScalarize = false;
8262     InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
8263     InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0;
8264     bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
8265     return UseVectorIntrinsic || !NeedToScalarize;
8266   };
8267 
8268   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
8269     return nullptr;
8270 
8271   ArrayRef<VPValue *> Ops = Operands.take_front(CI->arg_size());
8272   return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()));
8273 }
8274 
8275 bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
8276   assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
8277          !isa<StoreInst>(I) && "Instruction should have been handled earlier");
8278   // Instruction should be widened, unless it is scalar after vectorization,
8279   // scalarization is profitable or it is predicated.
8280   auto WillScalarize = [this, I](ElementCount VF) -> bool {
8281     return CM.isScalarAfterVectorization(I, VF) ||
8282            CM.isProfitableToScalarize(I, VF) ||
8283            CM.isScalarWithPredication(I, VF);
8284   };
8285   return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
8286                                                              Range);
8287 }
8288 
8289 VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
8290                                            ArrayRef<VPValue *> Operands) const {
8291   auto IsVectorizableOpcode = [](unsigned Opcode) {
8292     switch (Opcode) {
8293     case Instruction::Add:
8294     case Instruction::And:
8295     case Instruction::AShr:
8296     case Instruction::BitCast:
8297     case Instruction::FAdd:
8298     case Instruction::FCmp:
8299     case Instruction::FDiv:
8300     case Instruction::FMul:
8301     case Instruction::FNeg:
8302     case Instruction::FPExt:
8303     case Instruction::FPToSI:
8304     case Instruction::FPToUI:
8305     case Instruction::FPTrunc:
8306     case Instruction::FRem:
8307     case Instruction::FSub:
8308     case Instruction::ICmp:
8309     case Instruction::IntToPtr:
8310     case Instruction::LShr:
8311     case Instruction::Mul:
8312     case Instruction::Or:
8313     case Instruction::PtrToInt:
8314     case Instruction::SDiv:
8315     case Instruction::Select:
8316     case Instruction::SExt:
8317     case Instruction::Shl:
8318     case Instruction::SIToFP:
8319     case Instruction::SRem:
8320     case Instruction::Sub:
8321     case Instruction::Trunc:
8322     case Instruction::UDiv:
8323     case Instruction::UIToFP:
8324     case Instruction::URem:
8325     case Instruction::Xor:
8326     case Instruction::ZExt:
8327     case Instruction::Freeze:
8328       return true;
8329     }
8330     return false;
8331   };
8332 
8333   if (!IsVectorizableOpcode(I->getOpcode()))
8334     return nullptr;
8335 
8336   // Success: widen this instruction.
8337   return new VPWidenRecipe(*I, make_range(Operands.begin(), Operands.end()));
8338 }
8339 
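     // Add the recipe for the incoming value from the loop latch as an operand
     // of each header phi recipe collected in PhisToFix. This is deferred until
     // all recipes have been created, so the latch value's recipe is available.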
8340 void VPRecipeBuilder::fixHeaderPhis() {
8341   BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
8342   for (VPHeaderPHIRecipe *R : PhisToFix) {
8343     auto *PN = cast<PHINode>(R->getUnderlyingValue());
8344     VPRecipeBase *IncR =
8345         getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
8346     R->addOperand(IncR->getVPSingleValue());
8347   }
8348 }
8349 
8350 VPBasicBlock *VPRecipeBuilder::handleReplication(
8351     Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
8352     VPlanPtr &Plan) {
8353   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
8354       [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
8355       Range);
8356 
8357   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
8358       [&](ElementCount VF) { return CM.isPredicatedInst(I, VF); },
8359       Range);
8360 
8361   // Even if the instruction is not marked as uniform, there are certain
8362   // intrinsic calls that can be effectively treated as such, so we check for
8363   // them here. Conservatively, we only do this for scalable vectors, since
8364   // for fixed-width VFs we can always fall back on full scalarization.
8365   if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(I)) {
8366     switch (cast<IntrinsicInst>(I)->getIntrinsicID()) {
8367     case Intrinsic::assume:
8368     case Intrinsic::lifetime_start:
8369     case Intrinsic::lifetime_end:
8370       // For scalable vectors if one of the operands is variant then we still
8371       // want to mark as uniform, which will generate one instruction for just
8372       // the first lane of the vector. We can't scalarize the call in the same
8373       // way as for fixed-width vectors because we don't know how many lanes
8374       // there are.
8375       //
8376       // The reasons for doing it this way for scalable vectors are:
8377       //   1. For the assume intrinsic generating the instruction for the first
8378       //      lane is still better than not generating any at all. For
8379       //      example, the input may be a splat across all lanes.
8380       //   2. For the lifetime start/end intrinsics the pointer operand only
8381       //      does anything useful when the input comes from a stack object,
8382       //      which suggests it should always be uniform. For non-stack objects
8383       //      the effect is to poison the object, which still allows us to
8384       //      remove the call.
8385       IsUniform = true;
8386       break;
8387     default:
8388       break;
8389     }
8390   }
8391 
8392   auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
8393                                        IsUniform, IsPredicated);
8394 
8395   // Find if I uses a predicated instruction. If so, it will use its scalar
8396   // value. Avoid hoisting the insert-element which packs the scalar value into
8397   // a vector value, as that happens iff all users use the vector value.
8398   for (VPValue *Op : Recipe->operands()) {
8399     auto *PredR = dyn_cast_or_null<VPPredInstPHIRecipe>(Op->getDef());
8400     if (!PredR)
8401       continue;
8402     auto *RepR =
8403         cast_or_null<VPReplicateRecipe>(PredR->getOperand(0)->getDef());
8404     assert(RepR->isPredicated() &&
8405            "expected Replicate recipe to be predicated");
8406     RepR->setAlsoPack(false);
8407   }
8408 
8409   // Finalize the recipe for Instr, first if it is not predicated.
8410   if (!IsPredicated) {
8411     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
8412     setRecipe(I, Recipe);
8413     Plan->addVPValue(I, Recipe);
8414     VPBB->appendRecipe(Recipe);
8415     return VPBB;
8416   }
8417   LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
8418 
8419   VPBlockBase *SingleSucc = VPBB->getSingleSuccessor();
8420   assert(SingleSucc && "VPBB must have a single successor when handling "
8421                        "predicated replication.");
8422   VPBlockUtils::disconnectBlocks(VPBB, SingleSucc);
8423   // Record predicated instructions for above packing optimizations.
8424   VPBlockBase *Region = createReplicateRegion(Recipe, Plan);
8425   VPBlockUtils::insertBlockAfter(Region, VPBB);
8426   auto *RegSucc = new VPBasicBlock();
8427   VPBlockUtils::insertBlockAfter(RegSucc, Region);
8428   VPBlockUtils::connectBlocks(RegSucc, SingleSucc);
8429   return RegSucc;
8430 }
8431 
8432 VPRegionBlock *
8433 VPRecipeBuilder::createReplicateRegion(VPReplicateRecipe *PredRecipe,
8434                                        VPlanPtr &Plan) {
8435   Instruction *Instr = PredRecipe->getUnderlyingInstr();
8436   // Instructions marked for predication are replicated and placed under an
8437   // if-then construct to prevent side-effects.
8438   // Generate recipes to compute the block mask for this region.
8439   VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
8440 
8441   // Build the triangular if-then region.
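  // The resulting replicate region is a triangle of VPBasicBlocks:
  //   <RegionName>.entry    : BRANCH-ON-MASK on the block-in mask, with the
  //                           .if and .continue blocks as successors,
  //   <RegionName>.if       : the predicated replicate recipe itself,
  //   <RegionName>.continue : a VPPredInstPHI merging the scalar result, if
  //                           the instruction produces a value.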
8442   std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
8443   assert(Instr->getParent() && "Predicated instruction not in any basic block");
8444   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
8445   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
8446   auto *PHIRecipe = Instr->getType()->isVoidTy()
8447                         ? nullptr
8448                         : new VPPredInstPHIRecipe(PredRecipe);
8449   if (PHIRecipe) {
8450     setRecipe(Instr, PHIRecipe);
8451     Plan->addVPValue(Instr, PHIRecipe);
8452   } else {
8453     setRecipe(Instr, PredRecipe);
8454     Plan->addVPValue(Instr, PredRecipe);
8455   }
8456 
8457   auto *Exiting = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
8458   auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
8459   VPRegionBlock *Region = new VPRegionBlock(Entry, Exiting, RegionName, true);
8460 
8461   // Note: first set Entry as region entry and then connect successors starting
8462   // from it in order, to propagate the "parent" of each VPBasicBlock.
8463   VPBlockUtils::insertTwoBlocksAfter(Pred, Exiting, Entry);
8464   VPBlockUtils::connectBlocks(Pred, Exiting);
8465 
8466   return Region;
8467 }
8468 
8469 VPRecipeOrVPValueTy
8470 VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8471                                         ArrayRef<VPValue *> Operands,
8472                                         VFRange &Range, VPlanPtr &Plan) {
8473   // First, check for specific widening recipes that deal with inductions, Phi
8474   // nodes, calls and memory operations.
8475   VPRecipeBase *Recipe;
8476   if (auto Phi = dyn_cast<PHINode>(Instr)) {
8477     if (Phi->getParent() != OrigLoop->getHeader())
8478       return tryToBlend(Phi, Operands, Plan);
8479     if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, *Plan, Range)))
8480       return toVPRecipeResult(Recipe);
8481 
8482     VPHeaderPHIRecipe *PhiRecipe = nullptr;
8483     assert((Legal->isReductionVariable(Phi) ||
8484             Legal->isFirstOrderRecurrence(Phi)) &&
8485            "can only widen reductions and first-order recurrences here");
8486     VPValue *StartV = Operands[0];
8487     if (Legal->isReductionVariable(Phi)) {
8488       const RecurrenceDescriptor &RdxDesc =
8489           Legal->getReductionVars().find(Phi)->second;
8490       assert(RdxDesc.getRecurrenceStartValue() ==
8491              Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8492       PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
8493                                            CM.isInLoopReduction(Phi),
8494                                            CM.useOrderedReductions(RdxDesc));
8495     } else {
8496       PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
8497     }
8498 
    // Record the recipe of the backedge incoming value, so we can add it as an
    // operand of the phi recipe once all recipes have been created.
8501     recordRecipeOf(cast<Instruction>(
8502         Phi->getIncomingValueForBlock(OrigLoop->getLoopLatch())));
8503     PhisToFix.push_back(PhiRecipe);
8504     return toVPRecipeResult(PhiRecipe);
8505   }
8506 
8507   if (isa<TruncInst>(Instr) &&
8508       (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands,
8509                                                Range, *Plan)))
8510     return toVPRecipeResult(Recipe);
8511 
8512   // All widen recipes below deal only with VF > 1.
8513   if (LoopVectorizationPlanner::getDecisionAndClampRange(
8514           [&](ElementCount VF) { return VF.isScalar(); }, Range))
8515     return nullptr;
8516 
8517   if (auto *CI = dyn_cast<CallInst>(Instr))
8518     return toVPRecipeResult(tryToWidenCall(CI, Operands, Range));
8519 
8520   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
8521     return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan));
8522 
8523   if (!shouldWiden(Instr, Range))
8524     return nullptr;
8525 
8526   if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
8527     return toVPRecipeResult(new VPWidenGEPRecipe(
8528         GEP, make_range(Operands.begin(), Operands.end()), OrigLoop));
8529 
8530   if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8531     bool InvariantCond =
8532         PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
8533     return toVPRecipeResult(new VPWidenSelectRecipe(
8534         *SI, make_range(Operands.begin(), Operands.end()), InvariantCond));
8535   }
8536 
8537   return toVPRecipeResult(tryToWiden(Instr, Operands));
8538 }
8539 
8540 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8541                                                         ElementCount MaxVF) {
8542   assert(OrigLoop->isInnermost() && "Inner loop expected.");
8543 
8544   // Add assume instructions we need to drop to DeadInstructions, to prevent
8545   // them from being added to the VPlan.
  // TODO: We only need to drop assumes in blocks that get flattened. If the
8547   // control flow is preserved, we should keep them.
8548   SmallPtrSet<Instruction *, 4> DeadInstructions;
8549   auto &ConditionalAssumes = Legal->getConditionalAssumes();
8550   DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
8551 
8552   MapVector<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
8553   // Dead instructions do not need sinking. Remove them from SinkAfter.
8554   for (Instruction *I : DeadInstructions)
8555     SinkAfter.erase(I);
8556 
8557   // Cannot sink instructions after dead instructions (there won't be any
8558   // recipes for them). Instead, find the first non-dead previous instruction.
8559   for (auto &P : Legal->getSinkAfter()) {
8560     Instruction *SinkTarget = P.second;
8561     Instruction *FirstInst = &*SinkTarget->getParent()->begin();
8562     (void)FirstInst;
8563     while (DeadInstructions.contains(SinkTarget)) {
8564       assert(
8565           SinkTarget != FirstInst &&
8566           "Must find a live instruction (at least the one feeding the "
8567           "first-order recurrence PHI) before reaching beginning of the block");
8568       SinkTarget = SinkTarget->getPrevNode();
8569       assert(SinkTarget != P.first &&
8570              "sink source equals target, no sinking required");
8571     }
8572     P.second = SinkTarget;
8573   }
8574 
8575   auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
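  // Build a VPlan for each sub-range of VFs in [MinVF, MaxVF]. Each call may
  // clamp SubRange.End to the first VF that requires different decisions, so a
  // single VPlan can cover several consecutive (power-of-two) VFs.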
8576   for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
8577     VFRange SubRange = {VF, MaxVFPlusOne};
8578     VPlans.push_back(
8579         buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
8580     VF = SubRange.End;
8581   }
8582 }
8583 
// Add the canonical IV and branch recipes required to control the loop.
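// In VPlan notation the result is roughly:
//   header:  %index = canonical-induction-phi [ 0 ], [ %index.next ]
//   latch:   %index.next = %index + VF * UF
//            branch-on-count %index.next, vector-trip-count
// When UseLaneMaskForLoopControlFlow is set, the latch instead computes the
// active lane mask for the next iteration and branches on its negation.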
8586 static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, DebugLoc DL,
8587                                   bool HasNUW,
8588                                   bool UseLaneMaskForLoopControlFlow) {
8589   Value *StartIdx = ConstantInt::get(IdxTy, 0);
8590   auto *StartV = Plan.getOrAddVPValue(StartIdx);
8591 
8592   // Add a VPCanonicalIVPHIRecipe starting at 0 to the header.
8593   auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL);
8594   VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
8595   VPBasicBlock *Header = TopRegion->getEntryBasicBlock();
8596   Header->insert(CanonicalIVPHI, Header->begin());
8597 
8598   // Add a CanonicalIVIncrement{NUW} VPInstruction to increment the scalar
8599   // IV by VF * UF.
8600   auto *CanonicalIVIncrement =
8601       new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementNUW
8602                                : VPInstruction::CanonicalIVIncrement,
8603                         {CanonicalIVPHI}, DL, "index.next");
8604   CanonicalIVPHI->addOperand(CanonicalIVIncrement);
8605 
8606   VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
8607   EB->appendRecipe(CanonicalIVIncrement);
8608 
8609   if (UseLaneMaskForLoopControlFlow) {
8610     // Create the active lane mask instruction in the vplan preheader.
8611     VPBasicBlock *Preheader = Plan.getEntry()->getEntryBasicBlock();
8612 
8613     // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
8614     // we have to take unrolling into account. Each part needs to start at
8615     //   Part * VF
8616     auto *CanonicalIVIncrementParts =
8617         new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementForPartNUW
8618                                  : VPInstruction::CanonicalIVIncrementForPart,
8619                           {StartV}, DL, "index.part.next");
8620     Preheader->appendRecipe(CanonicalIVIncrementParts);
8621 
8622     // Create the ActiveLaneMask instruction using the correct start values.
8623     VPValue *TC = Plan.getOrCreateTripCount();
8624     auto *EntryALM = new VPInstruction(VPInstruction::ActiveLaneMask,
8625                                        {CanonicalIVIncrementParts, TC}, DL,
8626                                        "active.lane.mask.entry");
8627     Preheader->appendRecipe(EntryALM);
8628 
8629     // Now create the ActiveLaneMaskPhi recipe in the main loop using the
8630     // preheader ActiveLaneMask instruction.
8631     auto *LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc());
8632     Header->insert(LaneMaskPhi, Header->getFirstNonPhi());
8633 
8634     // Create the active lane mask for the next iteration of the loop.
8635     CanonicalIVIncrementParts =
8636         new VPInstruction(HasNUW ? VPInstruction::CanonicalIVIncrementForPartNUW
8637                                  : VPInstruction::CanonicalIVIncrementForPart,
8638                           {CanonicalIVIncrement}, DL);
8639     EB->appendRecipe(CanonicalIVIncrementParts);
8640 
8641     auto *ALM = new VPInstruction(VPInstruction::ActiveLaneMask,
8642                                   {CanonicalIVIncrementParts, TC}, DL,
8643                                   "active.lane.mask.next");
8644     EB->appendRecipe(ALM);
8645     LaneMaskPhi->addOperand(ALM);
8646 
8647     // We have to invert the mask here because a true condition means jumping
8648     // to the exit block.
8649     auto *NotMask = new VPInstruction(VPInstruction::Not, ALM, DL);
8650     EB->appendRecipe(NotMask);
8651 
8652     VPInstruction *BranchBack =
8653         new VPInstruction(VPInstruction::BranchOnCond, {NotMask}, DL);
8654     EB->appendRecipe(BranchBack);
8655   } else {
8656     // Add the BranchOnCount VPInstruction to the latch.
8657     VPInstruction *BranchBack = new VPInstruction(
8658         VPInstruction::BranchOnCount,
8659         {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
8660     EB->appendRecipe(BranchBack);
8661   }
8662 }
8663 
8664 // Add exit values to \p Plan. VPLiveOuts are added for each LCSSA phi in the
8665 // original exit block.
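// For example, an LCSSA phi in the exit block such as
//   %lcssa = phi i32 [ %v, %exiting ]
// is added as a live-out fed by the VPValue modeling %v.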
8666 static void addUsersInExitBlock(VPBasicBlock *HeaderVPBB,
8667                                 VPBasicBlock *MiddleVPBB, Loop *OrigLoop,
8668                                 VPlan &Plan) {
8669   BasicBlock *ExitBB = OrigLoop->getUniqueExitBlock();
8670   BasicBlock *ExitingBB = OrigLoop->getExitingBlock();
8671   // Only handle single-exit loops with unique exit blocks for now.
8672   if (!ExitBB || !ExitBB->getSinglePredecessor() || !ExitingBB)
8673     return;
8674 
8675   // Introduce VPUsers modeling the exit values.
8676   for (PHINode &ExitPhi : ExitBB->phis()) {
8677     Value *IncomingValue =
8678         ExitPhi.getIncomingValueForBlock(ExitingBB);
8679     VPValue *V = Plan.getOrAddVPValue(IncomingValue, true);
8680     Plan.addLiveOut(&ExitPhi, V);
8681   }
8682 }
8683 
8684 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
8685     VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
8686     const MapVector<Instruction *, Instruction *> &SinkAfter) {
8687 
8688   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
8689 
8690   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
8691 
8692   // ---------------------------------------------------------------------------
8693   // Pre-construction: record ingredients whose recipes we'll need to further
8694   // process after constructing the initial VPlan.
8695   // ---------------------------------------------------------------------------
8696 
8697   // Mark instructions we'll need to sink later and their targets as
8698   // ingredients whose recipe we'll need to record.
8699   for (auto &Entry : SinkAfter) {
8700     RecipeBuilder.recordRecipeOf(Entry.first);
8701     RecipeBuilder.recordRecipeOf(Entry.second);
8702   }
8703   for (auto &Reduction : CM.getInLoopReductionChains()) {
8704     PHINode *Phi = Reduction.first;
8705     RecurKind Kind =
8706         Legal->getReductionVars().find(Phi)->second.getRecurrenceKind();
8707     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
8708 
8709     RecipeBuilder.recordRecipeOf(Phi);
8710     for (auto &R : ReductionOperations) {
8711       RecipeBuilder.recordRecipeOf(R);
8712       // For min/max reductions, where we have a pair of icmp/select, we also
8713       // need to record the ICmp recipe, so it can be removed later.
8714       assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
8715              "Only min/max recurrences allowed for inloop reductions");
8716       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind))
8717         RecipeBuilder.recordRecipeOf(cast<Instruction>(R->getOperand(0)));
8718     }
8719   }
8720 
8721   // For each interleave group which is relevant for this (possibly trimmed)
8722   // Range, add it to the set of groups to be later applied to the VPlan and add
8723   // placeholders for its members' Recipes which we'll be replacing with a
8724   // single VPInterleaveRecipe.
8725   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
8726     auto applyIG = [IG, this](ElementCount VF) -> bool {
8727       return (VF.isVector() && // Query is illegal for VF == 1
8728               CM.getWideningDecision(IG->getInsertPos(), VF) ==
8729                   LoopVectorizationCostModel::CM_Interleave);
8730     };
8731     if (!getDecisionAndClampRange(applyIG, Range))
8732       continue;
8733     InterleaveGroups.insert(IG);
8734     for (unsigned i = 0; i < IG->getFactor(); i++)
8735       if (Instruction *Member = IG->getMember(i))
8736         RecipeBuilder.recordRecipeOf(Member);
8737   };
8738 
8739   // ---------------------------------------------------------------------------
8740   // Build initial VPlan: Scan the body of the loop in a topological order to
8741   // visit each basic block after having visited its predecessor basic blocks.
8742   // ---------------------------------------------------------------------------
8743 
8744   // Create initial VPlan skeleton, starting with a block for the pre-header,
8745   // followed by a region for the vector loop, followed by the middle block. The
8746   // skeleton vector loop region contains a header and latch block.
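  // That is, the initial shape of the plan is:
  //   vector.ph -> {vector loop: vector.body -> vector.latch} -> middle.block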
8747   VPBasicBlock *Preheader = new VPBasicBlock("vector.ph");
8748   auto Plan = std::make_unique<VPlan>(Preheader);
8749 
8750   VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body");
8751   VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch");
8752   VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB);
8753   auto *TopRegion = new VPRegionBlock(HeaderVPBB, LatchVPBB, "vector loop");
8754   VPBlockUtils::insertBlockAfter(TopRegion, Preheader);
8755   VPBasicBlock *MiddleVPBB = new VPBasicBlock("middle.block");
8756   VPBlockUtils::insertBlockAfter(MiddleVPBB, TopRegion);
8757 
8758   Instruction *DLInst =
8759       getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
8760   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(),
8761                         DLInst ? DLInst->getDebugLoc() : DebugLoc(),
8762                         !CM.foldTailByMasking(),
8763                         CM.useActiveLaneMaskForControlFlow());
8764 
8765   // Scan the body of the loop in a topological order to visit each basic block
8766   // after having visited its predecessor basic blocks.
8767   LoopBlocksDFS DFS(OrigLoop);
8768   DFS.perform(LI);
8769 
8770   VPBasicBlock *VPBB = HeaderVPBB;
8771   SmallVector<VPWidenIntOrFpInductionRecipe *> InductionsToMove;
8772   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
8773     // Relevant instructions from basic block BB will be grouped into VPRecipe
8774     // ingredients and fill a new VPBasicBlock.
8775     unsigned VPBBsForBB = 0;
8776     if (VPBB != HeaderVPBB)
8777       VPBB->setName(BB->getName());
8778     Builder.setInsertPoint(VPBB);
8779 
8780     // Introduce each ingredient into VPlan.
8781     // TODO: Model and preserve debug intrinsics in VPlan.
8782     for (Instruction &I : BB->instructionsWithoutDebug()) {
8783       Instruction *Instr = &I;
8784 
8785       // First filter out irrelevant instructions, to ensure no recipes are
8786       // built for them.
8787       if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
8788         continue;
8789 
8790       SmallVector<VPValue *, 4> Operands;
8791       auto *Phi = dyn_cast<PHINode>(Instr);
8792       if (Phi && Phi->getParent() == OrigLoop->getHeader()) {
8793         Operands.push_back(Plan->getOrAddVPValue(
8794             Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader())));
8795       } else {
8796         auto OpRange = Plan->mapToVPValues(Instr->operands());
8797         Operands = {OpRange.begin(), OpRange.end()};
8798       }
8799 
      // Invariant stores inside the loop will be deleted, and a single store
      // with the final reduction value will be added to the exit block.
8802       StoreInst *SI;
8803       if ((SI = dyn_cast<StoreInst>(&I)) &&
8804           Legal->isInvariantAddressOfReduction(SI->getPointerOperand()))
8805         continue;
8806 
8807       if (auto RecipeOrValue = RecipeBuilder.tryToCreateWidenRecipe(
8808               Instr, Operands, Range, Plan)) {
8809         // If Instr can be simplified to an existing VPValue, use it.
8810         if (RecipeOrValue.is<VPValue *>()) {
8811           auto *VPV = RecipeOrValue.get<VPValue *>();
8812           Plan->addVPValue(Instr, VPV);
8813           // If the re-used value is a recipe, register the recipe for the
8814           // instruction, in case the recipe for Instr needs to be recorded.
8815           if (auto *R = dyn_cast_or_null<VPRecipeBase>(VPV->getDef()))
8816             RecipeBuilder.setRecipe(Instr, R);
8817           continue;
8818         }
8819         // Otherwise, add the new recipe.
8820         VPRecipeBase *Recipe = RecipeOrValue.get<VPRecipeBase *>();
8821         for (auto *Def : Recipe->definedValues()) {
8822           auto *UV = Def->getUnderlyingValue();
8823           Plan->addVPValue(UV, Def);
8824         }
8825 
8826         if (isa<VPWidenIntOrFpInductionRecipe>(Recipe) &&
8827             HeaderVPBB->getFirstNonPhi() != VPBB->end()) {
8828           // Keep track of VPWidenIntOrFpInductionRecipes not in the phi section
8829           // of the header block. That can happen for truncates of induction
8830           // variables. Those recipes are moved to the phi section of the header
8831           // block after applying SinkAfter, which relies on the original
8832           // position of the trunc.
8833           assert(isa<TruncInst>(Instr));
8834           InductionsToMove.push_back(
8835               cast<VPWidenIntOrFpInductionRecipe>(Recipe));
8836         }
8837         RecipeBuilder.setRecipe(Instr, Recipe);
8838         VPBB->appendRecipe(Recipe);
8839         continue;
8840       }
8841 
      // Otherwise, if all widening options failed, the instruction is to be
      // replicated. This may create a successor for VPBB.
8844       VPBasicBlock *NextVPBB =
8845           RecipeBuilder.handleReplication(Instr, Range, VPBB, Plan);
8846       if (NextVPBB != VPBB) {
8847         VPBB = NextVPBB;
8848         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
8849                                     : "");
8850       }
8851     }
8852 
8853     VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB);
8854     VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
8855   }
8856 
8857   HeaderVPBB->setName("vector.body");
8858 
8859   // Fold the last, empty block into its predecessor.
8860   VPBB = VPBlockUtils::tryToMergeBlockIntoPredecessor(VPBB);
8861   assert(VPBB && "expected to fold last (empty) block");
8862   // After here, VPBB should not be used.
8863   VPBB = nullptr;
8864 
8865   addUsersInExitBlock(HeaderVPBB, MiddleVPBB, OrigLoop, *Plan);
8866 
8867   assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
8868          !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
8869          "entry block must be set to a VPRegionBlock having a non-empty entry "
8870          "VPBasicBlock");
8871   RecipeBuilder.fixHeaderPhis();
8872 
8873   // ---------------------------------------------------------------------------
8874   // Transform initial VPlan: Apply previously taken decisions, in order, to
8875   // bring the VPlan to its final state.
8876   // ---------------------------------------------------------------------------
8877 
8878   // Apply Sink-After legal constraints.
8879   auto GetReplicateRegion = [](VPRecipeBase *R) -> VPRegionBlock * {
8880     auto *Region = dyn_cast_or_null<VPRegionBlock>(R->getParent()->getParent());
8881     if (Region && Region->isReplicator()) {
8882       assert(Region->getNumSuccessors() == 1 &&
8883              Region->getNumPredecessors() == 1 && "Expected SESE region!");
8884       assert(R->getParent()->size() == 1 &&
8885              "A recipe in an original replicator region must be the only "
8886              "recipe in its block");
8887       return Region;
8888     }
8889     return nullptr;
8890   };
8891   for (auto &Entry : SinkAfter) {
8892     VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
8893     VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
8894 
8895     auto *TargetRegion = GetReplicateRegion(Target);
8896     auto *SinkRegion = GetReplicateRegion(Sink);
8897     if (!SinkRegion) {
8898       // If the sink source is not a replicate region, sink the recipe directly.
8899       if (TargetRegion) {
8900         // The target is in a replication region, make sure to move Sink to
8901         // the block after it, not into the replication region itself.
8902         VPBasicBlock *NextBlock =
8903             cast<VPBasicBlock>(TargetRegion->getSuccessors().front());
8904         Sink->moveBefore(*NextBlock, NextBlock->getFirstNonPhi());
8905       } else
8906         Sink->moveAfter(Target);
8907       continue;
8908     }
8909 
8910     // The sink source is in a replicate region. Unhook the region from the CFG.
8911     auto *SinkPred = SinkRegion->getSinglePredecessor();
8912     auto *SinkSucc = SinkRegion->getSingleSuccessor();
8913     VPBlockUtils::disconnectBlocks(SinkPred, SinkRegion);
8914     VPBlockUtils::disconnectBlocks(SinkRegion, SinkSucc);
8915     VPBlockUtils::connectBlocks(SinkPred, SinkSucc);
8916 
8917     if (TargetRegion) {
8918       // The target recipe is also in a replicate region, move the sink region
8919       // after the target region.
8920       auto *TargetSucc = TargetRegion->getSingleSuccessor();
8921       VPBlockUtils::disconnectBlocks(TargetRegion, TargetSucc);
8922       VPBlockUtils::connectBlocks(TargetRegion, SinkRegion);
8923       VPBlockUtils::connectBlocks(SinkRegion, TargetSucc);
8924     } else {
      // The sink source is in a replicate region, so we need to move the whole
      // replicate region, which should only contain a single recipe in the
      // main block.
8928       auto *SplitBlock =
8929           Target->getParent()->splitAt(std::next(Target->getIterator()));
8930 
8931       auto *SplitPred = SplitBlock->getSinglePredecessor();
8932 
8933       VPBlockUtils::disconnectBlocks(SplitPred, SplitBlock);
8934       VPBlockUtils::connectBlocks(SplitPred, SinkRegion);
8935       VPBlockUtils::connectBlocks(SinkRegion, SplitBlock);
8936     }
8937   }
8938 
8939   VPlanTransforms::removeRedundantCanonicalIVs(*Plan);
8940   VPlanTransforms::removeRedundantInductionCasts(*Plan);
8941 
8942   // Now that sink-after is done, move induction recipes for optimized truncates
8943   // to the phi section of the header block.
8944   for (VPWidenIntOrFpInductionRecipe *Ind : InductionsToMove)
8945     Ind->moveBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
8946 
8947   // Adjust the recipes for any inloop reductions.
8948   adjustRecipesForReductions(cast<VPBasicBlock>(TopRegion->getExiting()), Plan,
8949                              RecipeBuilder, Range.Start);
8950 
8951   // Introduce a recipe to combine the incoming and previous values of a
8952   // first-order recurrence.
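  // E.g. a first-order recurrence phi
  //   %for = phi [ %start, preheader ], [ %prev, latch ]
  // gets a companion
  //   %splice = first-order-recurrence-splice %for, %prev
  // placed after the recipe computing %prev (or just past its replicate
  // region or the header phis), and all other users of %for are rewired to
  // use %splice.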
8953   for (VPRecipeBase &R :
8954        Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
8955     auto *RecurPhi = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R);
8956     if (!RecurPhi)
8957       continue;
8958 
8959     VPRecipeBase *PrevRecipe = RecurPhi->getBackedgeRecipe();
8960     VPBasicBlock *InsertBlock = PrevRecipe->getParent();
8961     auto *Region = GetReplicateRegion(PrevRecipe);
8962     if (Region)
8963       InsertBlock = dyn_cast<VPBasicBlock>(Region->getSingleSuccessor());
8964     if (!InsertBlock) {
8965       InsertBlock = new VPBasicBlock(Region->getName() + ".succ");
8966       VPBlockUtils::insertBlockAfter(InsertBlock, Region);
8967     }
8968     if (Region || PrevRecipe->isPhi())
8969       Builder.setInsertPoint(InsertBlock, InsertBlock->getFirstNonPhi());
8970     else
8971       Builder.setInsertPoint(InsertBlock, std::next(PrevRecipe->getIterator()));
8972 
8973     auto *RecurSplice = cast<VPInstruction>(
8974         Builder.createNaryOp(VPInstruction::FirstOrderRecurrenceSplice,
8975                              {RecurPhi, RecurPhi->getBackedgeValue()}));
8976 
8977     RecurPhi->replaceAllUsesWith(RecurSplice);
8978     // Set the first operand of RecurSplice to RecurPhi again, after replacing
8979     // all users.
8980     RecurSplice->setOperand(0, RecurPhi);
8981   }
8982 
8983   // Interleave memory: for each Interleave Group we marked earlier as relevant
8984   // for this VPlan, replace the Recipes widening its memory instructions with a
8985   // single VPInterleaveRecipe at its insertion point.
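  // E.g. for a group of factor 2 with a load member %a and a store member
  // writing %b, both widened memory recipes are replaced by one interleave
  // recipe using the insert position's address, the stored value of %b and
  // the optional mask; its defined value takes over all uses of %a.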
8986   for (auto IG : InterleaveGroups) {
8987     auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
8988         RecipeBuilder.getRecipe(IG->getInsertPos()));
8989     SmallVector<VPValue *, 4> StoredValues;
8990     for (unsigned i = 0; i < IG->getFactor(); ++i)
8991       if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i))) {
8992         auto *StoreR =
8993             cast<VPWidenMemoryInstructionRecipe>(RecipeBuilder.getRecipe(SI));
8994         StoredValues.push_back(StoreR->getStoredValue());
8995       }
8996 
8997     auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
8998                                         Recipe->getMask());
8999     VPIG->insertBefore(Recipe);
9000     unsigned J = 0;
9001     for (unsigned i = 0; i < IG->getFactor(); ++i)
9002       if (Instruction *Member = IG->getMember(i)) {
9003         if (!Member->getType()->isVoidTy()) {
9004           VPValue *OriginalV = Plan->getVPValue(Member);
9005           Plan->removeVPValueFor(Member);
9006           Plan->addVPValue(Member, VPIG->getVPValue(J));
9007           OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
9008           J++;
9009         }
9010         RecipeBuilder.getRecipe(Member)->eraseFromParent();
9011       }
9012   }
9013 
9014   std::string PlanName;
9015   raw_string_ostream RSO(PlanName);
9016   ElementCount VF = Range.Start;
9017   Plan->addVF(VF);
9018   RSO << "Initial VPlan for VF={" << VF;
9019   for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) {
9020     Plan->addVF(VF);
9021     RSO << "," << VF;
9022   }
9023   RSO << "},UF>=1";
9024   RSO.flush();
9025   Plan->setName(PlanName);
9026 
9027   // From this point onwards, VPlan-to-VPlan transformations may change the plan
  // in ways that make accessing values using original IR values incorrect.
9029   Plan->disableValue2VPValue();
9030 
9031   VPlanTransforms::optimizeInductions(*Plan, *PSE.getSE());
9032   VPlanTransforms::sinkScalarOperands(*Plan);
9033   VPlanTransforms::removeDeadRecipes(*Plan);
9034   VPlanTransforms::mergeReplicateRegions(*Plan);
9035   VPlanTransforms::removeRedundantExpandSCEVRecipes(*Plan);
9036 
9037   // Fold Exit block into its predecessor if possible.
9038   // TODO: Fold block earlier once all VPlan transforms properly maintain a
9039   // VPBasicBlock as exit.
9040   VPBlockUtils::tryToMergeBlockIntoPredecessor(TopRegion->getExiting());
9041 
9042   assert(VPlanVerifier::verifyPlanIsValid(*Plan) && "VPlan is invalid");
9043   return Plan;
9044 }
9045 
9046 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
  // Outer loop handling: outer loops may require CFG and instruction level
9048   // transformations before even evaluating whether vectorization is profitable.
9049   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
9050   // the vectorization pipeline.
  assert(!OrigLoop->isInnermost() && "Expected an outer loop");
9052   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
9053 
9054   // Create new empty VPlan
9055   auto Plan = std::make_unique<VPlan>();
9056 
9057   // Build hierarchical CFG
9058   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
9059   HCFGBuilder.buildHierarchicalCFG();
9060 
9061   for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
9062        VF *= 2)
9063     Plan->addVF(VF);
9064 
9065   SmallPtrSet<Instruction *, 1> DeadInstructions;
9066   VPlanTransforms::VPInstructionsToVPRecipes(
9067       OrigLoop, Plan,
9068       [this](PHINode *P) { return Legal->getIntOrFpInductionDescriptor(P); },
9069       DeadInstructions, *PSE.getSE());
9070 
9071   // Remove the existing terminator of the exiting block of the top-most region.
9072   // A BranchOnCount will be added instead when adding the canonical IV recipes.
9073   auto *Term =
9074       Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator();
9075   Term->eraseFromParent();
9076 
9077   addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(),
9078                         true, CM.useActiveLaneMaskForControlFlow());
9079   return Plan;
9080 }
9081 
// Adjust the recipes for reductions. For in-loop reductions the chain of
// instructions leading from the loop exit instr to the phi needs to be
// converted to reductions, with one operand being vector and the other being
// the scalar reduction chain. For other reductions, a select is introduced
// between the phi and live-out recipes when folding the tail.
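// E.g. an in-loop integer add reduction
//   %red = phi [ %start, ph ], [ %add, latch ]
//   %add = add %red, %x
// has the widened recipe for %add replaced by a VPReductionRecipe computing
//   %red + reduce.add(select(block-mask, %x, identity))
// where the select is only materialized when the block needs predication.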
9087 void LoopVectorizationPlanner::adjustRecipesForReductions(
9088     VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder,
9089     ElementCount MinVF) {
9090   for (auto &Reduction : CM.getInLoopReductionChains()) {
9091     PHINode *Phi = Reduction.first;
9092     const RecurrenceDescriptor &RdxDesc =
9093         Legal->getReductionVars().find(Phi)->second;
9094     const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
9095 
9096     if (MinVF.isScalar() && !CM.useOrderedReductions(RdxDesc))
9097       continue;
9098 
    // ReductionOperations are ordered top-down from the phi's use to the
    // LoopExitValue. We keep track of the previous item (the Chain) to tell
    // which of the two operands will remain scalar and which will be reduced.
9102     // For minmax the chain will be the select instructions.
9103     Instruction *Chain = Phi;
9104     for (Instruction *R : ReductionOperations) {
9105       VPRecipeBase *WidenRecipe = RecipeBuilder.getRecipe(R);
9106       RecurKind Kind = RdxDesc.getRecurrenceKind();
9107 
9108       VPValue *ChainOp = Plan->getVPValue(Chain);
9109       unsigned FirstOpId;
9110       assert(!RecurrenceDescriptor::isSelectCmpRecurrenceKind(Kind) &&
9111              "Only min/max recurrences allowed for inloop reductions");
9112       // Recognize a call to the llvm.fmuladd intrinsic.
9113       bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
9114       assert((!IsFMulAdd || RecurrenceDescriptor::isFMulAddIntrinsic(R)) &&
9115              "Expected instruction to be a call to the llvm.fmuladd intrinsic");
9116       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9117         assert(isa<VPWidenSelectRecipe>(WidenRecipe) &&
9118                "Expected to replace a VPWidenSelectSC");
9119         FirstOpId = 1;
9120       } else {
9121         assert((MinVF.isScalar() || isa<VPWidenRecipe>(WidenRecipe) ||
9122                 (IsFMulAdd && isa<VPWidenCallRecipe>(WidenRecipe))) &&
9123                "Expected to replace a VPWidenSC");
9124         FirstOpId = 0;
9125       }
9126       unsigned VecOpId =
9127           R->getOperand(FirstOpId) == Chain ? FirstOpId + 1 : FirstOpId;
9128       VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId));
9129 
9130       auto *CondOp = CM.blockNeedsPredicationForAnyReason(R->getParent())
9131                          ? RecipeBuilder.createBlockInMask(R->getParent(), Plan)
9132                          : nullptr;
9133 
9134       if (IsFMulAdd) {
9135         // If the instruction is a call to the llvm.fmuladd intrinsic then we
9136         // need to create an fmul recipe to use as the vector operand for the
9137         // fadd reduction.
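        // I.e. the in-loop chain step
        //   %acc.next = call @llvm.fmuladd(%a, %b, %acc)
        // becomes an fmul of %a and %b feeding an fadd reduction with %acc.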
9138         VPInstruction *FMulRecipe = new VPInstruction(
9139             Instruction::FMul, {VecOp, Plan->getVPValue(R->getOperand(1))});
9140         FMulRecipe->setFastMathFlags(R->getFastMathFlags());
9141         WidenRecipe->getParent()->insert(FMulRecipe,
9142                                          WidenRecipe->getIterator());
9143         VecOp = FMulRecipe;
9144       }
9145       VPReductionRecipe *RedRecipe =
9146           new VPReductionRecipe(&RdxDesc, R, ChainOp, VecOp, CondOp, TTI);
9147       WidenRecipe->getVPSingleValue()->replaceAllUsesWith(RedRecipe);
9148       Plan->removeVPValueFor(R);
9149       Plan->addVPValue(R, RedRecipe);
      // Append the recipe to the end of the VPBasicBlock because we need to
      // ensure that it comes after all of its inputs, including CondOp.
      WidenRecipe->getParent()->appendRecipe(RedRecipe);
      WidenRecipe->eraseFromParent();
9155 
9156       if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9157         VPRecipeBase *CompareRecipe =
9158             RecipeBuilder.getRecipe(cast<Instruction>(R->getOperand(0)));
9159         assert(isa<VPWidenRecipe>(CompareRecipe) &&
9160                "Expected to replace a VPWidenSC");
9161         assert(cast<VPWidenRecipe>(CompareRecipe)->getNumUsers() == 0 &&
9162                "Expected no remaining users");
9163         CompareRecipe->eraseFromParent();
9164       }
9165       Chain = R;
9166     }
9167   }
9168 
  // If the tail is folded by masking, introduce selects between the phi
9170   // and the live-out instruction of each reduction, at the beginning of the
9171   // dedicated latch block.
9172   if (CM.foldTailByMasking()) {
9173     Builder.setInsertPoint(LatchVPBB, LatchVPBB->begin());
9174     for (VPRecipeBase &R :
9175          Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
9176       VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
9177       if (!PhiR || PhiR->isInLoop())
9178         continue;
9179       VPValue *Cond =
9180           RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
9181       VPValue *Red = PhiR->getBackedgeValue();
9182       assert(cast<VPRecipeBase>(Red->getDef())->getParent() != LatchVPBB &&
9183              "reduction recipe must be defined before latch");
9184       Builder.createNaryOp(Instruction::Select, {Cond, Red, PhiR});
9185     }
9186   }
9187 }
9188 
9189 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
9190 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
9191                                VPSlotTracker &SlotTracker) const {
9192   O << Indent << "INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
9193   IG->getInsertPos()->printAsOperand(O, false);
9194   O << ", ";
9195   getAddr()->printAsOperand(O, SlotTracker);
9196   VPValue *Mask = getMask();
9197   if (Mask) {
9198     O << ", ";
9199     Mask->printAsOperand(O, SlotTracker);
9200   }
9201 
9202   unsigned OpIdx = 0;
9203   for (unsigned i = 0; i < IG->getFactor(); ++i) {
9204     if (!IG->getMember(i))
9205       continue;
9206     if (getNumStoreOperands() > 0) {
9207       O << "\n" << Indent << "  store ";
9208       getOperand(1 + OpIdx)->printAsOperand(O, SlotTracker);
9209       O << " to index " << i;
9210     } else {
9211       O << "\n" << Indent << "  ";
9212       getVPValue(OpIdx)->printAsOperand(O, SlotTracker);
9213       O << " = load from index " << i;
9214     }
9215     ++OpIdx;
9216   }
9217 }
9218 #endif
9219 
9220 void VPWidenCallRecipe::execute(VPTransformState &State) {
9221   State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this,
9222                                   *this, State);
9223 }
9224 
9225 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
9226   assert(!State.Instance && "Int or FP induction being replicated.");
9227 
9228   Value *Start = getStartValue()->getLiveInIRValue();
9229   const InductionDescriptor &ID = getInductionDescriptor();
9230   TruncInst *Trunc = getTruncInst();
9231   IRBuilderBase &Builder = State.Builder;
9232   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
9233   assert(State.VF.isVector() && "must have vector VF");
9234 
9235   // The value from the original loop to which we are mapping the new induction
9236   // variable.
9237   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
9238 
9239   // Fast-math-flags propagate from the original induction instruction.
9240   IRBuilder<>::FastMathFlagGuard FMFG(Builder);
9241   if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
9242     Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());
9243 
9244   // Now do the actual transformations, and start with fetching the step value.
9245   Value *Step = State.get(getStepValue(), VPIteration(0, 0));
9246 
9247   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
9248          "Expected either an induction phi-node or a truncate of it!");
9249 
9250   // Construct the initial value of the vector IV in the vector loop preheader
9251   auto CurrIP = Builder.saveIP();
9252   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
9253   Builder.SetInsertPoint(VectorPH->getTerminator());
9254   if (isa<TruncInst>(EntryVal)) {
9255     assert(Start->getType()->isIntegerTy() &&
9256            "Truncation requires an integer type");
9257     auto *TruncType = cast<IntegerType>(EntryVal->getType());
9258     Step = Builder.CreateTrunc(Step, TruncType);
9259     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
9260   }
9261 
9262   Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0);
9263   Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
9264   Value *SteppedStart = getStepVector(
9265       SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder);
9266 
9267   // We create vector phi nodes for both integer and floating-point induction
9268   // variables. Here, we determine the kind of arithmetic we will perform.
9269   Instruction::BinaryOps AddOp;
9270   Instruction::BinaryOps MulOp;
9271   if (Step->getType()->isIntegerTy()) {
9272     AddOp = Instruction::Add;
9273     MulOp = Instruction::Mul;
9274   } else {
9275     AddOp = ID.getInductionOpcode();
9276     MulOp = Instruction::FMul;
9277   }
9278 
9279   // Multiply the vectorization factor by the step using integer or
9280   // floating-point arithmetic as appropriate.
9281   Type *StepType = Step->getType();
9282   Value *RuntimeVF;
9283   if (Step->getType()->isFloatingPointTy())
9284     RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF);
9285   else
9286     RuntimeVF = getRuntimeVF(Builder, StepType, State.VF);
9287   Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);
9288 
9289   // Create a vector splat to use in the induction update.
9290   //
9291   // FIXME: If the step is non-constant, we create the vector splat with
9292   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
9293   //        handle a constant vector splat.
9294   Value *SplatVF = isa<Constant>(Mul)
9295                        ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul))
9296                        : Builder.CreateVectorSplat(State.VF, Mul);
9297   Builder.restoreIP(CurrIP);
9298 
9299   // We may need to add the step a number of times, depending on the unroll
9300   // factor. The last of those goes into the PHI.
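  // E.g. with VF = 4, UF = 2 and integer step S this creates:
  //   vec.ind      = phi [ Start + <0,S,2S,3S>, vector.ph ], [ vec.ind.next ]
  //   step.add     = vec.ind + splat(4 * S)    ; value used for part 1
  //   vec.ind.next = step.add + splat(4 * S)   ; fed back into the phi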
9301   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
9302                                     &*State.CFG.PrevBB->getFirstInsertionPt());
9303   VecInd->setDebugLoc(EntryVal->getDebugLoc());
9304   Instruction *LastInduction = VecInd;
9305   for (unsigned Part = 0; Part < State.UF; ++Part) {
9306     State.set(this, LastInduction, Part);
9307 
9308     if (isa<TruncInst>(EntryVal))
9309       State.addMetadata(LastInduction, EntryVal);
9310 
9311     LastInduction = cast<Instruction>(
9312         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
9313     LastInduction->setDebugLoc(EntryVal->getDebugLoc());
9314   }
9315 
9316   LastInduction->setName("vec.ind.next");
9317   VecInd->addIncoming(SteppedStart, VectorPH);
9318   // Add induction update using an incorrect block temporarily. The phi node
9319   // will be fixed after VPlan execution. Note that at this point the latch
9320   // block cannot be used, as it does not exist yet.
9321   // TODO: Model increment value in VPlan, by turning the recipe into a
9322   // multi-def and a subclass of VPHeaderPHIRecipe.
9323   VecInd->addIncoming(LastInduction, VectorPH);
9324 }
9325 
9326 void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
9327   assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction &&
9328          "Not a pointer induction according to InductionDescriptor!");
9329   assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() &&
9330          "Unexpected type.");
9331 
9332   auto *IVR = getParent()->getPlan()->getCanonicalIV();
9333   PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0));
9334 
9335   if (onlyScalarsGenerated(State.VF)) {
    // This is the normalized index that starts counting at zero.
9337     Value *PtrInd = State.Builder.CreateSExtOrTrunc(
9338         CanonicalIV, IndDesc.getStep()->getType());
9339     // Determine the number of scalars we need to generate for each unroll
9340     // iteration. If the instruction is uniform, we only need to generate the
9341     // first lane. Otherwise, we generate all VF values.
9342     bool IsUniform = vputils::onlyFirstLaneUsed(this);
9343     assert((IsUniform || !State.VF.isScalable()) &&
9344            "Cannot scalarize a scalable VF");
9345     unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue();
9346 
9347     for (unsigned Part = 0; Part < State.UF; ++Part) {
9348       Value *PartStart =
9349           createStepForVF(State.Builder, PtrInd->getType(), State.VF, Part);
9350 
9351       for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
9352         Value *Idx = State.Builder.CreateAdd(
9353             PartStart, ConstantInt::get(PtrInd->getType(), Lane));
9354         Value *GlobalIdx = State.Builder.CreateAdd(PtrInd, Idx);
9355 
9356         Value *Step = CreateStepValue(IndDesc.getStep(), SE,
9357                                       State.CFG.PrevBB->getTerminator());
9358         Value *SclrGep = emitTransformedIndex(
9359             State.Builder, GlobalIdx, IndDesc.getStartValue(), Step, IndDesc);
9360         SclrGep->setName("next.gep");
9361         State.set(this, SclrGep, VPIteration(Part, Lane));
9362       }
9363     }
9364     return;
9365   }
9366 
9367   assert(isa<SCEVConstant>(IndDesc.getStep()) &&
9368          "Induction step not a SCEV constant!");
9369   Type *PhiType = IndDesc.getStep()->getType();
9370 
9371   // Build a pointer phi
9372   Value *ScalarStartValue = getStartValue()->getLiveInIRValue();
9373   Type *ScStValueType = ScalarStartValue->getType();
9374   PHINode *NewPointerPhi =
9375       PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV);
9376 
9377   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
9378   NewPointerPhi->addIncoming(ScalarStartValue, VectorPH);
9379 
9380   // A pointer induction, performed by using a gep
9381   const DataLayout &DL = NewPointerPhi->getModule()->getDataLayout();
9382   Instruction *InductionLoc = &*State.Builder.GetInsertPoint();
9383 
9384   const SCEV *ScalarStep = IndDesc.getStep();
9385   SCEVExpander Exp(SE, DL, "induction");
9386   Value *ScalarStepValue = Exp.expandCodeFor(ScalarStep, PhiType, InductionLoc);
9387   Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF);
9388   Value *NumUnrolledElems =
9389       State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, State.UF));
9390   Value *InductionGEP = GetElementPtrInst::Create(
9391       IndDesc.getElementType(), NewPointerPhi,
9392       State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
9393       InductionLoc);
9394   // Add induction update using an incorrect block temporarily. The phi node
9395   // will be fixed after VPlan execution. Note that at this point the latch
9396   // block cannot be used, as it does not exist yet.
9397   // TODO: Model increment value in VPlan, by turning the recipe into a
9398   // multi-def and a subclass of VPHeaderPHIRecipe.
9399   NewPointerPhi->addIncoming(InductionGEP, VectorPH);
9400 
9401   // Create UF many actual address geps that use the pointer
9402   // phi as base and a vectorized version of the step value
9403   // (<step*0, ..., step*N>) as offset.
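  // E.g. with VF = 4, UF = 2 and element step S, part 0 uses element offsets
  // S * <0,1,2,3> and part 1 uses S * <4,5,6,7>, both relative to pointer.phi.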
9404   for (unsigned Part = 0; Part < State.UF; ++Part) {
9405     Type *VecPhiType = VectorType::get(PhiType, State.VF);
9406     Value *StartOffsetScalar =
9407         State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, Part));
9408     Value *StartOffset =
9409         State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
9410     // Create a vector of consecutive numbers from zero to VF.
9411     StartOffset = State.Builder.CreateAdd(
9412         StartOffset, State.Builder.CreateStepVector(VecPhiType));
9413 
9414     Value *GEP = State.Builder.CreateGEP(
9415         IndDesc.getElementType(), NewPointerPhi,
9416         State.Builder.CreateMul(
9417             StartOffset,
9418             State.Builder.CreateVectorSplat(State.VF, ScalarStepValue),
9419             "vector.gep"));
9420     State.set(this, GEP, Part);
9421   }
9422 }
9423 
9424 void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
9425   assert(!State.Instance && "VPScalarIVStepsRecipe being replicated.");
9426 
9427   // Fast-math-flags propagate from the original induction instruction.
9428   IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
9429   if (IndDesc.getInductionBinOp() &&
9430       isa<FPMathOperator>(IndDesc.getInductionBinOp()))
9431     State.Builder.setFastMathFlags(
9432         IndDesc.getInductionBinOp()->getFastMathFlags());
9433 
9434   Value *Step = State.get(getStepValue(), VPIteration(0, 0));
9435   auto CreateScalarIV = [&](Value *&Step) -> Value * {
9436     Value *ScalarIV = State.get(getCanonicalIV(), VPIteration(0, 0));
9437     auto *CanonicalIV = State.get(getParent()->getPlan()->getCanonicalIV(), 0);
9438     if (!isCanonical() || CanonicalIV->getType() != Ty) {
9439       ScalarIV =
9440           Ty->isIntegerTy()
9441               ? State.Builder.CreateSExtOrTrunc(ScalarIV, Ty)
9442               : State.Builder.CreateCast(Instruction::SIToFP, ScalarIV, Ty);
9443       ScalarIV = emitTransformedIndex(State.Builder, ScalarIV,
9444                                       getStartValue()->getLiveInIRValue(), Step,
9445                                       IndDesc);
9446       ScalarIV->setName("offset.idx");
9447     }
9448     if (TruncToTy) {
9449       assert(Step->getType()->isIntegerTy() &&
9450              "Truncation requires an integer step");
9451       ScalarIV = State.Builder.CreateTrunc(ScalarIV, TruncToTy);
9452       Step = State.Builder.CreateTrunc(Step, TruncToTy);
9453     }
9454     return ScalarIV;
9455   };
9456 
9457   Value *ScalarIV = CreateScalarIV(Step);
9458   if (State.VF.isVector()) {
9459     buildScalarSteps(ScalarIV, Step, IndDesc, this, State);
9460     return;
9461   }
9462 
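  // Scalar VF: emit one value per unrolled part, i.e. ScalarIV + Part * Step,
  // using integer or floating-point operations as appropriate.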
9463   for (unsigned Part = 0; Part < State.UF; ++Part) {
9464     assert(!State.VF.isScalable() && "scalable vectors not yet supported.");
9465     Value *EntryPart;
9466     if (Step->getType()->isFloatingPointTy()) {
9467       Value *StartIdx =
9468           getRuntimeVFAsFloat(State.Builder, Step->getType(), State.VF * Part);
9469       // Floating-point operations inherit FMF via the builder's flags.
9470       Value *MulOp = State.Builder.CreateFMul(StartIdx, Step);
9471       EntryPart = State.Builder.CreateBinOp(IndDesc.getInductionOpcode(),
9472                                             ScalarIV, MulOp);
9473     } else {
9474       Value *StartIdx =
9475           getRuntimeVF(State.Builder, Step->getType(), State.VF * Part);
9476       EntryPart = State.Builder.CreateAdd(
9477           ScalarIV, State.Builder.CreateMul(StartIdx, Step), "induction");
9478     }
9479     State.set(this, EntryPart, Part);
9480   }
9481 }
9482 
9483 void VPInterleaveRecipe::execute(VPTransformState &State) {
9484   assert(!State.Instance && "Interleave group being replicated.");
9485   State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(),
9486                                       getStoredValues(), getMask());
9487 }
9488 
9489 void VPReductionRecipe::execute(VPTransformState &State) {
9490   assert(!State.Instance && "Reduction being replicated.");
9491   Value *PrevInChain = State.get(getChainOp(), 0);
9492   RecurKind Kind = RdxDesc->getRecurrenceKind();
9493   bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc);
9494   // Propagate the fast-math flags carried by the underlying instruction.
9495   IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
9496   State.Builder.setFastMathFlags(RdxDesc->getFastMathFlags());
9497   for (unsigned Part = 0; Part < State.UF; ++Part) {
9498     Value *NewVecOp = State.get(getVecOp(), Part);
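    // If a condition (block-in mask) is present, blend masked-out lanes with
    // the reduction identity so they do not affect the reduced value.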
9499     if (VPValue *Cond = getCondOp()) {
9500       Value *NewCond = State.get(Cond, Part);
9501       VectorType *VecTy = cast<VectorType>(NewVecOp->getType());
9502       Value *Iden = RdxDesc->getRecurrenceIdentity(
9503           Kind, VecTy->getElementType(), RdxDesc->getFastMathFlags());
9504       Value *IdenVec =
9505           State.Builder.CreateVectorSplat(VecTy->getElementCount(), Iden);
9506       Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec);
9507       NewVecOp = Select;
9508     }
9509     Value *NewRed;
9510     Value *NextInChain;
9511     if (IsOrdered) {
9512       if (State.VF.isVector())
9513         NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp,
9514                                         PrevInChain);
9515       else
9516         NewRed = State.Builder.CreateBinOp(
9517             (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), PrevInChain,
9518             NewVecOp);
9519       PrevInChain = NewRed;
9520     } else {
9521       PrevInChain = State.get(getChainOp(), Part);
9522       NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp);
9523     }
9524     if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
9525       NextInChain =
9526           createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(),
9527                          NewRed, PrevInChain);
9528     } else if (IsOrdered)
9529       NextInChain = NewRed;
9530     else
9531       NextInChain = State.Builder.CreateBinOp(
9532           (Instruction::BinaryOps)RdxDesc->getOpcode(Kind), NewRed,
9533           PrevInChain);
9534     State.set(this, NextInChain, Part);
9535   }
9536 }
9537 
9538 void VPReplicateRecipe::execute(VPTransformState &State) {
9539   if (State.Instance) { // Generate a single instance.
9540     assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9541     State.ILV->scalarizeInstruction(getUnderlyingInstr(), this, *State.Instance,
9542                                     IsPredicated, State);
    // Insert the scalar instance, packing it into a vector value.
9544     if (AlsoPack && State.VF.isVector()) {
9545       // If we're constructing lane 0, initialize to start from poison.
9546       if (State.Instance->Lane.isFirstLane()) {
9547         assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
9548         Value *Poison = PoisonValue::get(
9549             VectorType::get(getUnderlyingValue()->getType(), State.VF));
9550         State.set(this, Poison, State.Instance->Part);
9551       }
9552       State.ILV->packScalarIntoVectorValue(this, *State.Instance, State);
9553     }
9554     return;
9555   }
9556 
9557   if (IsUniform) {
9558     // Uniform within VL means we need to generate lane 0 only for each
9559     // unrolled copy.
9560     for (unsigned Part = 0; Part < State.UF; ++Part)
9561       State.ILV->scalarizeInstruction(getUnderlyingInstr(), this,
9562                                       VPIteration(Part, 0), IsPredicated,
9563                                       State);
9564     return;
9565   }
9566 
9567   // Generate scalar instances for all VF lanes of all UF parts.
9568   assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
9569   const unsigned EndLane = State.VF.getKnownMinValue();
9570   for (unsigned Part = 0; Part < State.UF; ++Part)
9571     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
9572       State.ILV->scalarizeInstruction(getUnderlyingInstr(), this,
9573                                       VPIteration(Part, Lane), IsPredicated,
9574                                       State);
9575 }
9576 
9577 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
9578   VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;
9579 
9580   // Attempt to issue a wide load.
9581   LoadInst *LI = dyn_cast<LoadInst>(&Ingredient);
9582   StoreInst *SI = dyn_cast<StoreInst>(&Ingredient);
9583 
9584   assert((LI || SI) && "Invalid Load/Store instruction");
9585   assert((!SI || StoredValue) && "No stored value provided for widened store");
9586   assert((!LI || !StoredValue) && "Stored value provided for widened load");
9587 
9588   Type *ScalarDataTy = getLoadStoreType(&Ingredient);
9589 
9590   auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
9591   const Align Alignment = getLoadStoreAlignment(&Ingredient);
9592   bool CreateGatherScatter = !Consecutive;
9593 
9594   auto &Builder = State.Builder;
9595   InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF);
9596   bool isMaskRequired = getMask();
9597   if (isMaskRequired)
9598     for (unsigned Part = 0; Part < State.UF; ++Part)
9599       BlockInMaskParts[Part] = State.get(getMask(), Part);
9600 
9601   const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
9602     // Calculate the pointer for the specific unroll-part.
9603     GetElementPtrInst *PartPtr = nullptr;
9604 
9605     bool InBounds = false;
9606     if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
9607       InBounds = gep->isInBounds();
9608     if (Reverse) {
      // If the address is consecutive but reversed, then the wide load/store
      // needs to start at the last vector element.
      // RunTimeVF = VScale * VF.getKnownMinValue()
      // For fixed-width vectors VScale is 1, so
      // RunTimeVF = VF.getKnownMinValue().
9613       Value *RunTimeVF = getRuntimeVF(Builder, Builder.getInt32Ty(), State.VF);
9614       // NumElt = -Part * RunTimeVF
9615       Value *NumElt = Builder.CreateMul(Builder.getInt32(-Part), RunTimeVF);
9616       // LastLane = 1 - RunTimeVF
9617       Value *LastLane = Builder.CreateSub(Builder.getInt32(1), RunTimeVF);
9618       PartPtr =
9619           cast<GetElementPtrInst>(Builder.CreateGEP(ScalarDataTy, Ptr, NumElt));
9620       PartPtr->setIsInBounds(InBounds);
9621       PartPtr = cast<GetElementPtrInst>(
9622           Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane));
9623       PartPtr->setIsInBounds(InBounds);
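      // E.g. (illustrative, fixed-width case): with VF = 4 and Part = 1,
      // NumElt = -4 and LastLane = -3, so PartPtr ends up pointing at
      // Ptr - 7, the lowest address touched by the reversed second part.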
9624       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
9625         BlockInMaskParts[Part] =
9626             Builder.CreateVectorReverse(BlockInMaskParts[Part], "reverse");
9627     } else {
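      // Increment = Part * RunTimeVF, so each successive part starts one
      // full vector's worth of elements further along.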
9628       Value *Increment =
9629           createStepForVF(Builder, Builder.getInt32Ty(), State.VF, Part);
9630       PartPtr = cast<GetElementPtrInst>(
9631           Builder.CreateGEP(ScalarDataTy, Ptr, Increment));
9632       PartPtr->setIsInBounds(InBounds);
9633     }
9634 
9635     unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
9636     return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
9637   };
9638 
9639   // Handle Stores:
9640   if (SI) {
9641     State.setDebugLocFromInst(SI);
9642 
9643     for (unsigned Part = 0; Part < State.UF; ++Part) {
9644       Instruction *NewSI = nullptr;
9645       Value *StoredVal = State.get(StoredValue, Part);
9646       if (CreateGatherScatter) {
9647         Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
9648         Value *VectorGep = State.get(getAddr(), Part);
9649         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
9650                                             MaskPart);
9651       } else {
9652         if (Reverse) {
9653           // If we store to reverse consecutive memory locations, then we need
9654           // to reverse the order of elements in the stored value.
9655           StoredVal = Builder.CreateVectorReverse(StoredVal, "reverse");
9656           // We don't want to update the value in the map as it might be used in
9657           // another expression. So don't call resetVectorValue(StoredVal).
9658         }
9659         auto *VecPtr =
9660             CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
9661         if (isMaskRequired)
9662           NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
9663                                             BlockInMaskParts[Part]);
9664         else
9665           NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
9666       }
9667       State.addMetadata(NewSI, SI);
9668     }
9669     return;
9670   }
9671 
9672   // Handle loads.
9673   assert(LI && "Must have a load instruction");
9674   State.setDebugLocFromInst(LI);
9675   for (unsigned Part = 0; Part < State.UF; ++Part) {
9676     Value *NewLI;
9677     if (CreateGatherScatter) {
9678       Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
9679       Value *VectorGep = State.get(getAddr(), Part);
9680       NewLI = Builder.CreateMaskedGather(DataTy, VectorGep, Alignment, MaskPart,
9681                                          nullptr, "wide.masked.gather");
9682       State.addMetadata(NewLI, LI);
9683     } else {
9684       auto *VecPtr =
9685           CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
9686       if (isMaskRequired)
9687         NewLI = Builder.CreateMaskedLoad(
9688             DataTy, VecPtr, Alignment, BlockInMaskParts[Part],
9689             PoisonValue::get(DataTy), "wide.masked.load");
9690       else
9691         NewLI =
9692             Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
9693 
      // Add metadata to the load itself; the vector value recorded in State
      // below is the reversed shuffle when Reverse is set.
9695       State.addMetadata(NewLI, LI);
9696       if (Reverse)
9697         NewLI = Builder.CreateVectorReverse(NewLI, "reverse");
9698     }
9699 
9700     State.set(getVPSingleValue(), NewLI, Part);
9701   }
9702 }
9703 
9704 // Determine how to lower the scalar epilogue, which depends on 1) optimising
9705 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
9706 // predication, and 4) a TTI hook that analyses whether the loop is suitable
9707 // for predication.
9708 static ScalarEpilogueLowering getScalarEpilogueLowering(
9709     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
9710     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
9711     AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
9712     LoopVectorizationLegality &LVL) {
9713   // 1) OptSize takes precedence over all other options, i.e. if this is set,
9714   // don't look at hints or options, and don't request a scalar epilogue.
9715   // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
9716   // LoopAccessInfo (due to code dependency and not being able to reliably get
9717   // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
9718   // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
9719   // versioning when the vectorization is forced, unlike hasOptSize. So revert
9720   // back to the old way and vectorize with versioning when forced. See D81345.)
9721   if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
9722                                                       PGSOQueryType::IRPass) &&
9723                           Hints.getForce() != LoopVectorizeHints::FK_Enabled))
9724     return CM_ScalarEpilogueNotAllowedOptSize;
9725 
9726   // 2) If set, obey the directives
9727   if (PreferPredicateOverEpilogue.getNumOccurrences()) {
9728     switch (PreferPredicateOverEpilogue) {
9729     case PreferPredicateTy::ScalarEpilogue:
9730       return CM_ScalarEpilogueAllowed;
9731     case PreferPredicateTy::PredicateElseScalarEpilogue:
9732       return CM_ScalarEpilogueNotNeededUsePredicate;
9733     case PreferPredicateTy::PredicateOrDontVectorize:
9734       return CM_ScalarEpilogueNotAllowedUsePredicate;
9735     };
9736   }
9737 
9738   // 3) If set, obey the hints
9739   switch (Hints.getPredicate()) {
9740   case LoopVectorizeHints::FK_Enabled:
9741     return CM_ScalarEpilogueNotNeededUsePredicate;
9742   case LoopVectorizeHints::FK_Disabled:
9743     return CM_ScalarEpilogueAllowed;
9744   };
9745 
9746   // 4) if the TTI hook indicates this is profitable, request predication.
9747   if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, &LVL))
9748     return CM_ScalarEpilogueNotNeededUsePredicate;
9749 
9750   return CM_ScalarEpilogueAllowed;
9751 }
9752 
9753 Value *VPTransformState::get(VPValue *Def, unsigned Part) {
  // If values have been set for this Def, return the one relevant for \p Part.
9755   if (hasVectorValue(Def, Part))
9756     return Data.PerPartOutput[Def][Part];
9757 
9758   if (!hasScalarValue(Def, {Part, 0})) {
9759     Value *IRV = Def->getLiveInIRValue();
9760     Value *B = ILV->getBroadcastInstrs(IRV);
9761     set(Def, B, Part);
9762     return B;
9763   }
9764 
9765   Value *ScalarValue = get(Def, {Part, 0});
9766   // If we aren't vectorizing, we can just copy the scalar map values over
9767   // to the vector map.
9768   if (VF.isScalar()) {
9769     set(Def, ScalarValue, Part);
9770     return ScalarValue;
9771   }
9772 
9773   auto *RepR = dyn_cast<VPReplicateRecipe>(Def);
9774   bool IsUniform = RepR && RepR->isUniform();
9775 
9776   unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1;
9777   // Check if there is a scalar value for the selected lane.
9778   if (!hasScalarValue(Def, {Part, LastLane})) {
9779     // At the moment, VPWidenIntOrFpInductionRecipes can also be uniform.
9780     assert((isa<VPWidenIntOrFpInductionRecipe>(Def->getDef()) ||
9781             isa<VPScalarIVStepsRecipe>(Def->getDef())) &&
9782            "unexpected recipe found to be invariant");
9783     IsUniform = true;
9784     LastLane = 0;
9785   }
9786 
9787   auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane}));
9788   // Set the insert point after the last scalarized instruction or after the
9789   // last PHI, if LastInst is a PHI. This ensures the insertelement sequence
9790   // will directly follow the scalar definitions.
9791   auto OldIP = Builder.saveIP();
9792   auto NewIP =
9793       isa<PHINode>(LastInst)
9794           ? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI())
9795           : std::next(BasicBlock::iterator(LastInst));
9796   Builder.SetInsertPoint(&*NewIP);
9797 
9798   // However, if we are vectorizing, we need to construct the vector values.
9799   // If the value is known to be uniform after vectorization, we can just
9800   // broadcast the scalar value corresponding to lane zero for each unroll
9801   // iteration. Otherwise, we construct the vector values using
9802   // insertelement instructions. Since the resulting vectors are stored in
9803   // State, we will only generate the insertelements once.
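  // For example, with fixed VF = 4 and a non-uniform Def, lanes (Part, 0)
  // through (Part, 3) are inserted into a poison vector one at a time, and
  // the completed vector is what is returned for this Part.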
9804   Value *VectorValue = nullptr;
9805   if (IsUniform) {
9806     VectorValue = ILV->getBroadcastInstrs(ScalarValue);
9807     set(Def, VectorValue, Part);
9808   } else {
    // Initialize packing with insertelements to start from poison.
    assert(!VF.isScalable() && "VF is assumed to be non scalable.");
    Value *Poison = PoisonValue::get(VectorType::get(LastInst->getType(), VF));
    set(Def, Poison, Part);
9813     for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
9814       ILV->packScalarIntoVectorValue(Def, {Part, Lane}, *this);
9815     VectorValue = get(Def, Part);
9816   }
9817   Builder.restoreIP(OldIP);
9818   return VectorValue;
9819 }
9820 
// Process the loop in the VPlan-native vectorization path. This path builds
// VPlan upfront in the vectorization pipeline, which allows applying
// VPlan-to-VPlan transformations from the very beginning without modifying the
// input LLVM IR.
9825 static bool processLoopInVPlanNativePath(
9826     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
9827     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
9828     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
9829     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
9830     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
9831     LoopVectorizationRequirements &Requirements) {
9832 
9833   if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
9834     LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
9835     return false;
9836   }
9837   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
9838   Function *F = L->getHeader()->getParent();
9839   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
9840 
9841   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
9842       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
9843 
9844   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
9845                                 &Hints, IAI);
9846   // Use the planner for outer loop vectorization.
9847   // TODO: CM is not used at this point inside the planner. Turn CM into an
9848   // optional argument if we don't need it in the future.
9849   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints, ORE);
9850 
9851   // Get user vectorization factor.
9852   ElementCount UserVF = Hints.getWidth();
9853 
9854   CM.collectElementTypesForWidening();
9855 
9856   // Plan how to best vectorize, return the best VF and its cost.
9857   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
9858 
9859   // If we are stress testing VPlan builds, do not attempt to generate vector
9860   // code. Masked vector code generation support will follow soon.
9861   // Also, do not attempt to vectorize if no vector code will be produced.
9862   if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF)
9863     return false;
9864 
9865   VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
9866 
9867   {
9868     GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
9869                              F->getParent()->getDataLayout());
9870     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
9871                            VF.Width, 1, LVL, &CM, BFI, PSI, Checks);
9872     LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
9873                       << L->getHeader()->getParent()->getName() << "\"\n");
9874     LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false);
9875   }
9876 
9877   // Mark the loop as already vectorized to avoid vectorizing again.
9878   Hints.setAlreadyVectorized();
9879   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
9880   return true;
9881 }
9882 
// Emit a remark if there are stores to floats that required a floating point
// extension. If the vectorized loop was generated with mixed floating point
// precision there will be a performance penalty from the conversion overhead
// and the change in the vector width.
9887 static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
9888   SmallVector<Instruction *, 4> Worklist;
9889   for (BasicBlock *BB : L->getBlocks()) {
9890     for (Instruction &Inst : *BB) {
9891       if (auto *S = dyn_cast<StoreInst>(&Inst)) {
9892         if (S->getValueOperand()->getType()->isFloatTy())
9893           Worklist.push_back(S);
9894       }
9895     }
9896   }
9897 
  // Traverse the floating point stores upwards, searching for floating point
  // conversions.
9900   SmallPtrSet<const Instruction *, 4> Visited;
9901   SmallPtrSet<const Instruction *, 4> EmittedRemark;
9902   while (!Worklist.empty()) {
9903     auto *I = Worklist.pop_back_val();
9904     if (!L->contains(I))
9905       continue;
9906     if (!Visited.insert(I).second)
9907       continue;
9908 
9909     // Emit a remark if the floating point store required a floating
9910     // point conversion.
9911     // TODO: More work could be done to identify the root cause such as a
9912     // constant or a function return type and point the user to it.
9913     if (isa<FPExtInst>(I) && EmittedRemark.insert(I).second)
9914       ORE->emit([&]() {
9915         return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
9916                                           I->getDebugLoc(), L->getHeader())
9917                << "floating point conversion changes vector width. "
9918                << "Mixed floating point precision requires an up/down "
9919                << "cast that will negatively impact performance.";
9920       });
9921 
9922     for (Use &Op : I->operands())
9923       if (auto *OpI = dyn_cast<Instruction>(Op))
9924         Worklist.push_back(OpI);
9925   }
9926 }
9927 
9928 static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
9929                                        VectorizationFactor &VF,
9930                                        Optional<unsigned> VScale, Loop *L,
9931                                        ScalarEvolution &SE) {
9932   InstructionCost CheckCost = Checks.getCost();
9933   if (!CheckCost.isValid())
9934     return false;
9935 
  // When only interleaving, the scalar and vector costs will be equal, which
  // in turn would lead to a divide by 0. Fall back to a hard threshold.
9938   if (VF.Width.isScalar()) {
9939     if (CheckCost > VectorizeMemoryCheckThreshold) {
9940       LLVM_DEBUG(
9941           dbgs()
9942           << "LV: Interleaving only is not profitable due to runtime checks\n");
9943       return false;
9944     }
9945     return true;
9946   }
9947 
  // The scalar cost should only be 0 when vectorizing with a user specified
  // VF/IC. In those cases, runtime checks should always be generated.
9949   double ScalarC = *VF.ScalarCost.getValue();
9950   if (ScalarC == 0)
9951     return true;
9952 
9953   // First, compute the minimum iteration count required so that the vector
9954   // loop outperforms the scalar loop.
9955   //  The total cost of the scalar loop is
9956   //   ScalarC * TC
9957   //  where
9958   //  * TC is the actual trip count of the loop.
9959   //  * ScalarC is the cost of a single scalar iteration.
9960   //
9961   //  The total cost of the vector loop is
9962   //    RtC + VecC * (TC / VF) + EpiC
9963   //  where
9964   //  * RtC is the cost of the generated runtime checks
9965   //  * VecC is the cost of a single vector iteration.
9966   //  * TC is the actual trip count of the loop
9967   //  * VF is the vectorization factor
  //  * EpiC is the cost of the generated epilogue, including the cost
  //    of the remaining scalar operations.
9970   //
9971   // Vectorization is profitable once the total vector cost is less than the
9972   // total scalar cost:
9973   //   RtC + VecC * (TC / VF) + EpiC <  ScalarC * TC
9974   //
9975   // Now we can compute the minimum required trip count TC as
9976   //   (RtC + EpiC) / (ScalarC - (VecC / VF)) < TC
9977   //
9978   // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
9979   // the computations are performed on doubles, not integers and the result
9980   // is rounded up, hence we get an upper estimate of the TC.
9981   unsigned IntVF = VF.Width.getKnownMinValue();
9982   if (VF.Width.isScalable()) {
9983     unsigned AssumedMinimumVscale = 1;
9984     if (VScale)
9985       AssumedMinimumVscale = *VScale;
9986     IntVF *= AssumedMinimumVscale;
9987   }
9988   double VecCOverVF = double(*VF.Cost.getValue()) / IntVF;
9989   double RtC = *CheckCost.getValue();
9990   double MinTC1 = RtC / (ScalarC - VecCOverVF);
9991 
9992   // Second, compute a minimum iteration count so that the cost of the
9993   // runtime checks is only a fraction of the total scalar loop cost. This
9994   // adds a loop-dependent bound on the overhead incurred if the runtime
9995   // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC
9996   // * TC. To bound the runtime check to be a fraction 1/X of the scalar
9997   // cost, compute
9998   //   RtC < ScalarC * TC * (1 / X)  ==>  RtC * X / ScalarC < TC
9999   double MinTC2 = RtC * 10 / ScalarC;
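  // Illustrative example (hypothetical costs, not from any target): with
  // ScalarC = 4, VecC = 6, IntVF = 4 (so VecCOverVF = 1.5) and RtC = 20,
  // MinTC1 = 20 / (4 - 1.5) = 8 and MinTC2 = 20 * 10 / 4 = 50, so at least
  // 50 iterations (rounded up to a multiple of the VF below) are needed for
  // the runtime checks to pay off.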
10000 
10001   // Now pick the larger minimum. If it is not a multiple of VF, choose the
10002   // next closest multiple of VF. This should partly compensate for ignoring
10003   // the epilogue cost.
10004   uint64_t MinTC = std::ceil(std::max(MinTC1, MinTC2));
10005   VF.MinProfitableTripCount = ElementCount::getFixed(alignTo(MinTC, IntVF));
10006 
10007   LLVM_DEBUG(
10008       dbgs() << "LV: Minimum required TC for runtime checks to be profitable:"
10009              << VF.MinProfitableTripCount << "\n");
10010 
10011   // Skip vectorization if the expected trip count is less than the minimum
10012   // required trip count.
10013   if (auto ExpectedTC = getSmallBestKnownTC(SE, L)) {
10014     if (ElementCount::isKnownLT(ElementCount::getFixed(*ExpectedTC),
10015                                 VF.MinProfitableTripCount)) {
10016       LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
10017                            "trip count < minimum profitable VF ("
10018                         << *ExpectedTC << " < " << VF.MinProfitableTripCount
10019                         << ")\n");
10020 
10021       return false;
10022     }
10023   }
10024   return true;
10025 }
10026 
10027 LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
10028     : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
10029                                !EnableLoopInterleaving),
10030       VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
10031                               !EnableLoopVectorization) {}
10032 
10033 bool LoopVectorizePass::processLoop(Loop *L) {
10034   assert((EnableVPlanNativePath || L->isInnermost()) &&
10035          "VPlan-native path is not enabled. Only process inner loops.");
10036 
10037 #ifndef NDEBUG
10038   const std::string DebugLocStr = getDebugLocString(L);
10039 #endif /* NDEBUG */
10040 
10041   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '"
10042                     << L->getHeader()->getParent()->getName() << "' from "
10043                     << DebugLocStr << "\n");
10044 
10045   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);
10046 
10047   LLVM_DEBUG(
10048       dbgs() << "LV: Loop hints:"
10049              << " force="
10050              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
10051                      ? "disabled"
10052                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
10053                             ? "enabled"
10054                             : "?"))
10055              << " width=" << Hints.getWidth()
10056              << " interleave=" << Hints.getInterleave() << "\n");
10057 
10058   // Function containing loop
10059   Function *F = L->getHeader()->getParent();
10060 
10061   // Looking at the diagnostic output is the only way to determine if a loop
10062   // was vectorized (other than looking at the IR or machine code), so it
10063   // is important to generate an optimization remark for each loop. Most of
  // these messages are generated as OptimizationRemarkAnalysis. Remarks
  // generated as OptimizationRemark and OptimizationRemarkMissed are less
  // verbose; they report vectorized loops and unvectorized loops that may
  // benefit from vectorization, respectively.
10068 
10069   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
10070     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
10071     return false;
10072   }
10073 
10074   PredicatedScalarEvolution PSE(*SE, *L);
10075 
10076   // Check if it is legal to vectorize the loop.
10077   LoopVectorizationRequirements Requirements;
10078   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
10079                                 &Requirements, &Hints, DB, AC, BFI, PSI);
10080   if (!LVL.canVectorize(EnableVPlanNativePath)) {
10081     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
10082     Hints.emitRemarkWithHints();
10083     return false;
10084   }
10085 
10086   // Check the function attributes and profiles to find out if this function
10087   // should be optimized for size.
10088   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
10089       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);
10090 
10091   // Entrance to the VPlan-native vectorization path. Outer loops are processed
10092   // here. They may require CFG and instruction level transformations before
10093   // even evaluating whether vectorization is profitable. Since we cannot modify
10094   // the incoming IR, we need to build VPlan upfront in the vectorization
10095   // pipeline.
10096   if (!L->isInnermost())
10097     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
10098                                         ORE, BFI, PSI, Hints, Requirements);
10099 
10100   assert(L->isInnermost() && "Inner loop expected.");
10101 
10102   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
10103   // count by optimizing for size, to minimize overheads.
10104   auto ExpectedTC = getSmallBestKnownTC(*SE, L);
10105   if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
10106     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
10107                       << "This loop is worth vectorizing only if no scalar "
10108                       << "iteration overheads are incurred.");
10109     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
10110       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
10111     else {
10112       LLVM_DEBUG(dbgs() << "\n");
10113       SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
10114     }
10115   }
10116 
10117   // Check the function attributes to see if implicit floats are allowed.
10118   // FIXME: This check doesn't seem possibly correct -- what if the loop is
10119   // an integer loop and the vector instructions selected are purely integer
10120   // vector instructions?
10121   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
10122     reportVectorizationFailure(
10123         "Can't vectorize when the NoImplicitFloat attribute is used",
10124         "loop not vectorized due to NoImplicitFloat attribute",
10125         "NoImplicitFloat", ORE, L);
10126     Hints.emitRemarkWithHints();
10127     return false;
10128   }
10129 
10130   // Check if the target supports potentially unsafe FP vectorization.
10131   // FIXME: Add a check for the type of safety issue (denormal, signaling)
10132   // for the target we're vectorizing for, to make sure none of the
10133   // additional fp-math flags can help.
10134   if (Hints.isPotentiallyUnsafe() &&
10135       TTI->isFPVectorizationPotentiallyUnsafe()) {
10136     reportVectorizationFailure(
10137         "Potentially unsafe FP op prevents vectorization",
10138         "loop not vectorized due to unsafe FP support.",
10139         "UnsafeFP", ORE, L);
10140     Hints.emitRemarkWithHints();
10141     return false;
10142   }
10143 
10144   bool AllowOrderedReductions;
10145   // If the flag is set, use that instead and override the TTI behaviour.
10146   if (ForceOrderedReductions.getNumOccurrences() > 0)
10147     AllowOrderedReductions = ForceOrderedReductions;
10148   else
10149     AllowOrderedReductions = TTI->enableOrderedReductions();
10150   if (!LVL.canVectorizeFPMath(AllowOrderedReductions)) {
10151     ORE->emit([&]() {
10152       auto *ExactFPMathInst = Requirements.getExactFPInst();
10153       return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
10154                                                  ExactFPMathInst->getDebugLoc(),
10155                                                  ExactFPMathInst->getParent())
10156              << "loop not vectorized: cannot prove it is safe to reorder "
10157                 "floating-point operations";
10158     });
10159     LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
10160                          "reorder floating-point operations\n");
10161     Hints.emitRemarkWithHints();
10162     return false;
10163   }
10164 
10165   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
10166   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
10167 
10168   // If an override option has been passed in for interleaved accesses, use it.
10169   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
10170     UseInterleaved = EnableInterleavedMemAccesses;
10171 
10172   // Analyze interleaved memory accesses.
10173   if (UseInterleaved) {
10174     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
10175   }
10176 
10177   // Use the cost model.
10178   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
10179                                 F, &Hints, IAI);
10180   CM.collectValuesToIgnore();
10181   CM.collectElementTypesForWidening();
10182 
10183   // Use the planner for vectorization.
10184   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints, ORE);
10185 
10186   // Get user vectorization factor and interleave count.
10187   ElementCount UserVF = Hints.getWidth();
10188   unsigned UserIC = Hints.getInterleave();
10189 
10190   // Plan how to best vectorize, return the best VF and its cost.
10191   Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
10192 
10193   VectorizationFactor VF = VectorizationFactor::Disabled();
10194   unsigned IC = 1;
10195 
10196   GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
10197                            F->getParent()->getDataLayout());
10198   if (MaybeVF) {
10199     VF = *MaybeVF;
10200     // Select the interleave count.
10201     IC = CM.selectInterleaveCount(VF.Width, *VF.Cost.getValue());
10202 
10203     unsigned SelectedIC = std::max(IC, UserIC);
    // Optimistically generate runtime checks if they are needed. Drop them if
    // they turn out not to be profitable.
10206     if (VF.Width.isVector() || SelectedIC > 1)
10207       Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
10208 
10209     // Check if it is profitable to vectorize with runtime checks.
10210     bool ForceVectorization =
10211         Hints.getForce() == LoopVectorizeHints::FK_Enabled;
10212     if (!ForceVectorization &&
10213         !areRuntimeChecksProfitable(Checks, VF, CM.getVScaleForTuning(), L,
10214                                     *PSE.getSE())) {
10215       ORE->emit([&]() {
10216         return OptimizationRemarkAnalysisAliasing(
10217                    DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
10218                    L->getHeader())
10219                << "loop not vectorized: cannot prove it is safe to reorder "
10220                   "memory operations";
10221       });
10222       LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
10223       Hints.emitRemarkWithHints();
10224       return false;
10225     }
10226   }
10227 
10228   // Identify the diagnostic messages that should be produced.
10229   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
10230   bool VectorizeLoop = true, InterleaveLoop = true;
10231   if (VF.Width.isScalar()) {
10232     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
10233     VecDiagMsg = std::make_pair(
10234         "VectorizationNotBeneficial",
10235         "the cost-model indicates that vectorization is not beneficial");
10236     VectorizeLoop = false;
10237   }
10238 
10239   if (!MaybeVF && UserIC > 1) {
10240     // Tell the user interleaving was avoided up-front, despite being explicitly
10241     // requested.
10242     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
10243                          "interleaving should be avoided up front\n");
10244     IntDiagMsg = std::make_pair(
10245         "InterleavingAvoided",
10246         "Ignoring UserIC, because interleaving was avoided up front");
10247     InterleaveLoop = false;
10248   } else if (IC == 1 && UserIC <= 1) {
10249     // Tell the user interleaving is not beneficial.
10250     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
10251     IntDiagMsg = std::make_pair(
10252         "InterleavingNotBeneficial",
10253         "the cost-model indicates that interleaving is not beneficial");
10254     InterleaveLoop = false;
10255     if (UserIC == 1) {
10256       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
10257       IntDiagMsg.second +=
10258           " and is explicitly disabled or interleave count is set to 1";
10259     }
10260   } else if (IC > 1 && UserIC == 1) {
    // Tell the user that interleaving is beneficial, but it is explicitly
    // disabled.
10262     LLVM_DEBUG(
10263         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
10264     IntDiagMsg = std::make_pair(
10265         "InterleavingBeneficialButDisabled",
10266         "the cost-model indicates that interleaving is beneficial "
10267         "but is explicitly disabled or interleave count is set to 1");
10268     InterleaveLoop = false;
10269   }
10270 
10271   // Override IC if user provided an interleave count.
10272   IC = UserIC > 0 ? UserIC : IC;
10273 
10274   // Emit diagnostic messages, if any.
10275   const char *VAPassName = Hints.vectorizeAnalysisPassName();
10276   if (!VectorizeLoop && !InterleaveLoop) {
    // Do not vectorize or interleave the loop.
10278     ORE->emit([&]() {
10279       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
10280                                       L->getStartLoc(), L->getHeader())
10281              << VecDiagMsg.second;
10282     });
10283     ORE->emit([&]() {
10284       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
10285                                       L->getStartLoc(), L->getHeader())
10286              << IntDiagMsg.second;
10287     });
10288     return false;
10289   } else if (!VectorizeLoop && InterleaveLoop) {
10290     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10291     ORE->emit([&]() {
10292       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
10293                                         L->getStartLoc(), L->getHeader())
10294              << VecDiagMsg.second;
10295     });
10296   } else if (VectorizeLoop && !InterleaveLoop) {
10297     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10298                       << ") in " << DebugLocStr << '\n');
10299     ORE->emit([&]() {
10300       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
10301                                         L->getStartLoc(), L->getHeader())
10302              << IntDiagMsg.second;
10303     });
10304   } else if (VectorizeLoop && InterleaveLoop) {
10305     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10306                       << ") in " << DebugLocStr << '\n');
10307     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10308   }
10309 
10310   bool DisableRuntimeUnroll = false;
10311   MDNode *OrigLoopID = L->getLoopID();
10312   {
10313     using namespace ore;
10314     if (!VectorizeLoop) {
10315       assert(IC > 1 && "interleave count should not be 1 or 0");
10316       // If we decided that it is not legal to vectorize the loop, then
10317       // interleave it.
10318       InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
10319                                  &CM, BFI, PSI, Checks);
10320 
10321       VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10322       LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);
10323 
10324       ORE->emit([&]() {
10325         return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
10326                                   L->getHeader())
10327                << "interleaved loop (interleaved count: "
10328                << NV("InterleaveCount", IC) << ")";
10329       });
10330     } else {
10331       // If we decided that it is *legal* to vectorize the loop, then do it.
10332 
10333       // Consider vectorizing the epilogue too if it's profitable.
10334       VectorizationFactor EpilogueVF =
10335           CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
10336       if (EpilogueVF.Width.isVector()) {
10337 
10338         // The first pass vectorizes the main loop and creates a scalar epilogue
10339         // to be vectorized by executing the plan (potentially with a different
10340         // factor) again shortly afterwards.
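        // EPI bundles the main loop VF/UF with the epilogue VF; the epilogue
        // loop itself is vectorized with an interleave count of 1.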
10341         EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1);
10342         EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
10343                                            EPI, &LVL, &CM, BFI, PSI, Checks);
10344 
10345         VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF);
10346         LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV,
10347                         DT, true);
10348         ++LoopsVectorized;
10349 
10350         // Second pass vectorizes the epilogue and adjusts the control flow
10351         // edges from the first pass.
10352         EPI.MainLoopVF = EPI.EpilogueVF;
10353         EPI.MainLoopUF = EPI.EpilogueUF;
10354         EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
10355                                                  ORE, EPI, &LVL, &CM, BFI, PSI,
10356                                                  Checks);
10357 
10358         VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF);
10359         VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion();
10360         VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
10361         Header->setName("vec.epilog.vector.body");
10362 
10363         // Ensure that the start values for any VPReductionPHIRecipes are
10364         // updated before vectorising the epilogue loop.
10365         for (VPRecipeBase &R : Header->phis()) {
10366           if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(&R)) {
10367             if (auto *Resume = MainILV.getReductionResumeValue(
10368                     ReductionPhi->getRecurrenceDescriptor())) {
10369               VPValue *StartVal = BestEpiPlan.getOrAddExternalDef(Resume);
10370               ReductionPhi->setOperand(0, StartVal);
10371             }
10372           }
10373         }
10374 
10375         LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
10376                         DT, true);
10377         ++LoopsEpilogueVectorized;
10378 
10379         if (!MainILV.areSafetyChecksAdded())
10380           DisableRuntimeUnroll = true;
10381       } else {
10382         InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
10383                                VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
10384                                PSI, Checks);
10385 
10386         VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10387         LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
10388         ++LoopsVectorized;
10389 
        // Add metadata to disable runtime unrolling of the scalar loop when
        // there are no runtime checks for strides and memory. A scalar loop
        // that is rarely used is not worth unrolling.
10393         if (!LB.areSafetyChecksAdded())
10394           DisableRuntimeUnroll = true;
10395       }
10396       // Report the vectorization decision.
10397       ORE->emit([&]() {
10398         return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
10399                                   L->getHeader())
10400                << "vectorized loop (vectorization width: "
10401                << NV("VectorizationFactor", VF.Width)
10402                << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
10403       });
10404     }
10405 
10406     if (ORE->allowExtraAnalysis(LV_NAME))
10407       checkMixedPrecision(L, ORE);
10408   }
10409 
10410   Optional<MDNode *> RemainderLoopID =
10411       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
10412                                       LLVMLoopVectorizeFollowupEpilogue});
10413   if (RemainderLoopID) {
10414     L->setLoopID(RemainderLoopID.value());
10415   } else {
10416     if (DisableRuntimeUnroll)
10417       AddRuntimeUnrollDisableMetaData(L);
10418 
10419     // Mark the loop as already vectorized to avoid vectorizing again.
10420     Hints.setAlreadyVectorized();
10421   }
10422 
10423   assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10424   return true;
10425 }
10426 
10427 LoopVectorizeResult LoopVectorizePass::runImpl(
10428     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
10429     DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
10430     DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
10431     std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
10432     OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
10433   SE = &SE_;
10434   LI = &LI_;
10435   TTI = &TTI_;
10436   DT = &DT_;
10437   BFI = &BFI_;
10438   TLI = TLI_;
10439   AA = &AA_;
10440   AC = &AC_;
10441   GetLAA = &GetLAA_;
10442   DB = &DB_;
10443   ORE = &ORE_;
10444   PSI = PSI_;
10445 
10446   // Don't attempt if
10447   // 1. the target claims to have no vector registers, and
10448   // 2. interleaving won't help ILP.
10449   //
10450   // The second condition is necessary because, even if the target has no
10451   // vector registers, loop vectorization may still enable scalar
10452   // interleaving.
10453   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
10454       TTI->getMaxInterleaveFactor(1) < 2)
10455     return LoopVectorizeResult(false, false);
10456 
10457   bool Changed = false, CFGChanged = false;
10458 
10459   // The vectorizer requires loops to be in simplified form.
10460   // Since simplification may add new inner loops, it has to run before the
10461   // legality and profitability checks. This means running the loop vectorizer
  // will simplify all loops, regardless of whether anything ends up being
10463   // vectorized.
10464   for (auto &L : *LI)
10465     Changed |= CFGChanged |=
10466         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
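  // Note: simplifyLoop may modify the CFG, so any change it reports is
  // conservatively recorded as a CFG change as well.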
10467 
10468   // Build up a worklist of inner-loops to vectorize. This is necessary as
10469   // the act of vectorizing or partially unrolling a loop creates new loops
10470   // and can invalidate iterators across the loops.
10471   SmallVector<Loop *, 8> Worklist;
10472 
10473   for (Loop *L : *LI)
10474     collectSupportedLoops(*L, LI, ORE, Worklist);
10475 
10476   LoopsAnalyzed += Worklist.size();
10477 
10478   // Now walk the identified inner loops.
10479   while (!Worklist.empty()) {
10480     Loop *L = Worklist.pop_back_val();
10481 
10482     // For the inner loops we actually process, form LCSSA to simplify the
10483     // transform.
10484     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
10485 
10486     Changed |= CFGChanged |= processLoop(L);
10487   }
10488 
10489   // Process each loop nest in the function.
10490   return LoopVectorizeResult(Changed, CFGChanged);
10491 }
10492 
10493 PreservedAnalyses LoopVectorizePass::run(Function &F,
10494                                          FunctionAnalysisManager &AM) {
  auto &LI = AM.getResult<LoopAnalysis>(F);
  // There are no loops in the function. Return before computing other
  // expensive analyses.
  if (LI.empty())
    return PreservedAnalyses::all();
  auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  auto &TTI = AM.getResult<TargetIRAnalysis>(F);
  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
  auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
  auto &AA = AM.getResult<AAManager>(F);
  auto &AC = AM.getResult<AssumptionAnalysis>(F);
  auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);

  auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
  std::function<const LoopAccessInfo &(Loop &)> GetLAA =
      [&](Loop &L) -> const LoopAccessInfo & {
    LoopStandardAnalysisResults AR = {AA,  AC,  DT,      LI,      SE,
                                      TLI, TTI, nullptr, nullptr, nullptr};
    return LAM.getResult<LoopAccessAnalysis>(L, AR);
  };
  auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
  ProfileSummaryInfo *PSI =
      MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
  LoopVectorizeResult Result =
      runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
  if (!Result.MadeAnyChange)
    return PreservedAnalyses::all();
  PreservedAnalyses PA;

  // We currently do not preserve loopinfo/dominator analyses with outer loop
  // vectorization. Until this is addressed, mark these analyses as preserved
  // only for non-VPlan-native path.
  // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
  if (!EnableVPlanNativePath) {
    PA.preserve<LoopAnalysis>();
    PA.preserve<DominatorTreeAnalysis>();
  }

  if (Result.MadeCFGChange) {
    // Making CFG changes likely means a loop got vectorized. Indicate that
    // extra simplification passes should be run.
    // TODO: MadeCFGChange is not a perfect proxy. Extra passes should only
    // be run if runtime checks have been added.
    AM.getResult<ShouldRunExtraVectorPasses>(F);
    PA.preserve<ShouldRunExtraVectorPasses>();
  } else {
    PA.preserveSet<CFGAnalyses>();
  }
  return PA;
10546 }
10547 
10548 void LoopVectorizePass::printPipeline(
10549     raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
10550   static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
10551       OS, MapClassName2PassName);
10552 
10553   OS << "<";
10554   OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
10555   OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
10556   OS << ">";
10557 }
10558