1 //===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10 // and generates target-independent LLVM-IR.
11 // The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12 // of instructions in order to estimate the profitability of vectorization.
13 //
14 // The loop vectorizer combines consecutive loop iterations into a single
15 // 'wide' iteration. After this transformation the index is incremented
16 // by the SIMD vector width, and not by one.
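//
// For illustration only (a conceptual sketch, not the exact IR this pass
// emits): with a vectorization factor of 4, a scalar loop such as
//   for (i = 0; i < n; ++i)
//     A[i] = B[i] + 1;
// is rewritten so that each wide iteration computes
//   A[i..i+3] = B[i..i+3] + <1, 1, 1, 1>
// and the induction variable advances by 4 instead of 1, with a scalar
// epilogue handling any remaining iterations.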
17 //
18 // This pass has four parts:
19 // 1. The main loop pass that drives the different parts.
20 // 2. LoopVectorizationLegality - A unit that checks for the legality
21 //    of the vectorization.
22 // 3. InnerLoopVectorizer - A unit that performs the actual
23 //    widening of instructions.
24 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
25 //    of vectorization. It decides on the optimal vector width, which
26 //    can be one, if vectorization is not profitable.
27 //
28 // There is a development effort going on to migrate the loop vectorizer to the
29 // VPlan infrastructure and to introduce outer loop vectorization support (see
30 // docs/Proposal/VectorizationPlan.rst and
31 // http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32 // purpose, we temporarily introduced the VPlan-native vectorization path: an
33 // alternative vectorization path that is natively implemented on top of the
34 // VPlan infrastructure. See EnableVPlanNativePath for enabling.
35 //
36 //===----------------------------------------------------------------------===//
37 //
38 // The reduction-variable vectorization is based on the paper:
39 //  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40 //
41 // Variable uniformity checks are inspired by:
42 //  Karrenberg, R. and Hack, S. Whole Function Vectorization.
43 //
44 // The interleaved access vectorization is based on the paper:
45 //  Dorit Nuzman, Ira Rosen and Ayal Zaks.  Auto-Vectorization of Interleaved
46 //  Data for SIMD
47 //
48 // Other ideas/concepts are from:
49 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50 //
51 //  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
52 //  Vectorizing Compilers.
53 //
54 //===----------------------------------------------------------------------===//
55 
56 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
57 #include "LoopVectorizationPlanner.h"
58 #include "VPRecipeBuilder.h"
59 #include "VPlan.h"
60 #include "VPlanHCFGBuilder.h"
61 #include "VPlanPredicator.h"
62 #include "VPlanTransforms.h"
63 #include "llvm/ADT/APInt.h"
64 #include "llvm/ADT/ArrayRef.h"
65 #include "llvm/ADT/DenseMap.h"
66 #include "llvm/ADT/DenseMapInfo.h"
67 #include "llvm/ADT/Hashing.h"
68 #include "llvm/ADT/MapVector.h"
69 #include "llvm/ADT/None.h"
70 #include "llvm/ADT/Optional.h"
71 #include "llvm/ADT/STLExtras.h"
72 #include "llvm/ADT/SetVector.h"
73 #include "llvm/ADT/SmallPtrSet.h"
74 #include "llvm/ADT/SmallVector.h"
75 #include "llvm/ADT/Statistic.h"
76 #include "llvm/ADT/StringRef.h"
77 #include "llvm/ADT/Twine.h"
78 #include "llvm/ADT/iterator_range.h"
79 #include "llvm/Analysis/AssumptionCache.h"
80 #include "llvm/Analysis/BasicAliasAnalysis.h"
81 #include "llvm/Analysis/BlockFrequencyInfo.h"
82 #include "llvm/Analysis/CFG.h"
83 #include "llvm/Analysis/CodeMetrics.h"
84 #include "llvm/Analysis/DemandedBits.h"
85 #include "llvm/Analysis/GlobalsModRef.h"
86 #include "llvm/Analysis/LoopAccessAnalysis.h"
87 #include "llvm/Analysis/LoopAnalysisManager.h"
88 #include "llvm/Analysis/LoopInfo.h"
89 #include "llvm/Analysis/LoopIterator.h"
90 #include "llvm/Analysis/MemorySSA.h"
91 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
92 #include "llvm/Analysis/ProfileSummaryInfo.h"
93 #include "llvm/Analysis/ScalarEvolution.h"
94 #include "llvm/Analysis/ScalarEvolutionExpander.h"
95 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
96 #include "llvm/Analysis/TargetLibraryInfo.h"
97 #include "llvm/Analysis/TargetTransformInfo.h"
98 #include "llvm/Analysis/VectorUtils.h"
99 #include "llvm/IR/Attributes.h"
100 #include "llvm/IR/BasicBlock.h"
101 #include "llvm/IR/CFG.h"
102 #include "llvm/IR/Constant.h"
103 #include "llvm/IR/Constants.h"
104 #include "llvm/IR/DataLayout.h"
105 #include "llvm/IR/DebugInfoMetadata.h"
106 #include "llvm/IR/DebugLoc.h"
107 #include "llvm/IR/DerivedTypes.h"
108 #include "llvm/IR/DiagnosticInfo.h"
109 #include "llvm/IR/Dominators.h"
110 #include "llvm/IR/Function.h"
111 #include "llvm/IR/IRBuilder.h"
112 #include "llvm/IR/InstrTypes.h"
113 #include "llvm/IR/Instruction.h"
114 #include "llvm/IR/Instructions.h"
115 #include "llvm/IR/IntrinsicInst.h"
116 #include "llvm/IR/Intrinsics.h"
117 #include "llvm/IR/LLVMContext.h"
118 #include "llvm/IR/Metadata.h"
119 #include "llvm/IR/Module.h"
120 #include "llvm/IR/Operator.h"
121 #include "llvm/IR/Type.h"
122 #include "llvm/IR/Use.h"
123 #include "llvm/IR/User.h"
124 #include "llvm/IR/Value.h"
125 #include "llvm/IR/ValueHandle.h"
126 #include "llvm/IR/Verifier.h"
127 #include "llvm/InitializePasses.h"
128 #include "llvm/Pass.h"
129 #include "llvm/Support/Casting.h"
130 #include "llvm/Support/CommandLine.h"
131 #include "llvm/Support/Compiler.h"
132 #include "llvm/Support/Debug.h"
133 #include "llvm/Support/ErrorHandling.h"
134 #include "llvm/Support/MathExtras.h"
135 #include "llvm/Support/raw_ostream.h"
136 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
137 #include "llvm/Transforms/Utils/LoopSimplify.h"
138 #include "llvm/Transforms/Utils/LoopUtils.h"
139 #include "llvm/Transforms/Utils/LoopVersioning.h"
140 #include "llvm/Transforms/Utils/SizeOpts.h"
141 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
142 #include <algorithm>
143 #include <cassert>
144 #include <cstdint>
145 #include <cstdlib>
146 #include <functional>
147 #include <iterator>
148 #include <limits>
149 #include <memory>
150 #include <string>
151 #include <tuple>
152 #include <utility>
153 
154 using namespace llvm;
155 
156 #define LV_NAME "loop-vectorize"
157 #define DEBUG_TYPE LV_NAME
158 
159 /// @{
160 /// Metadata attribute names
161 static const char *const LLVMLoopVectorizeFollowupAll =
162     "llvm.loop.vectorize.followup_all";
163 static const char *const LLVMLoopVectorizeFollowupVectorized =
164     "llvm.loop.vectorize.followup_vectorized";
165 static const char *const LLVMLoopVectorizeFollowupEpilogue =
166     "llvm.loop.vectorize.followup_epilogue";
167 /// @}
168 
169 STATISTIC(LoopsVectorized, "Number of loops vectorized");
170 STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
171 
172 /// Loops with a known constant trip count below this number are vectorized only
173 /// if no scalar iteration overheads are incurred.
174 static cl::opt<unsigned> TinyTripCountVectorThreshold(
175     "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
176     cl::desc("Loops with a constant trip count that is smaller than this "
177              "value are vectorized only if no scalar iteration overheads "
178              "are incurred."));
179 
180 // Indicates that an epilogue is undesired; predication is preferred.
181 // This means that the vectorizer will try to fold the loop-tail (epilogue)
182 // into the loop and predicate the loop body accordingly.
183 static cl::opt<bool> PreferPredicateOverEpilog(
184     "prefer-predicate-over-epilog", cl::init(false), cl::Hidden,
185     cl::desc("Indicate that an epilogue is undesired, predication should be "
186              "used instead."));
187 
188 static cl::opt<bool> MaximizeBandwidth(
189     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
190     cl::desc("Maximize bandwidth when selecting vectorization factor which "
191              "will be determined by the smallest type in the loop."));
192 
193 static cl::opt<bool> EnableInterleavedMemAccesses(
194     "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
195     cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
196 
197 /// An interleave-group may need masking if it resides in a block that needs
198 /// predication, or in order to mask away gaps.
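/// For example (illustrative), a group that accesses A[3*i] and A[3*i+2] but
/// never A[3*i+1] has a gap, so loading the group as one wide vector is only
/// safe if the gap elements are masked away.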
199 static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
200     "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
201     cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
202 
203 static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
204     "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
205     cl::desc("We don't interleave loops with an estimated constant trip count "
206              "below this number"));
207 
208 static cl::opt<unsigned> ForceTargetNumScalarRegs(
209     "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
210     cl::desc("A flag that overrides the target's number of scalar registers."));
211 
212 static cl::opt<unsigned> ForceTargetNumVectorRegs(
213     "force-target-num-vector-regs", cl::init(0), cl::Hidden,
214     cl::desc("A flag that overrides the target's number of vector registers."));
215 
216 static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
217     "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
218     cl::desc("A flag that overrides the target's max interleave factor for "
219              "scalar loops."));
220 
221 static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
222     "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
223     cl::desc("A flag that overrides the target's max interleave factor for "
224              "vectorized loops."));
225 
226 static cl::opt<unsigned> ForceTargetInstructionCost(
227     "force-target-instruction-cost", cl::init(0), cl::Hidden,
228     cl::desc("A flag that overrides the target's expected cost for "
229              "an instruction to a single constant value. Mostly "
230              "useful for getting consistent testing."));
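// For example (hypothetical test invocation, assuming the new pass manager
// pass name "loop-vectorize"):
//   opt -passes=loop-vectorize -force-target-instruction-cost=1 -S in.ll
// pins the per-instruction cost so the cost-model output does not depend on
// the host target.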
231 
232 static cl::opt<unsigned> SmallLoopCost(
233     "small-loop-cost", cl::init(20), cl::Hidden,
234     cl::desc(
235         "The cost of a loop that is considered 'small' by the interleaver."));
236 
237 static cl::opt<bool> LoopVectorizeWithBlockFrequency(
238     "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
239     cl::desc("Enable the use of the block frequency analysis to access PGO "
240              "heuristics minimizing code growth in cold regions and being more "
241              "aggressive in hot regions."));
242 
243 // Runtime interleave loops for load/store throughput.
244 static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
245     "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
246     cl::desc(
247         "Enable runtime interleaving until load/store ports are saturated"));
248 
249 /// The number of stores in a loop that are allowed to need predication.
250 static cl::opt<unsigned> NumberOfStoresToPredicate(
251     "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
252     cl::desc("Max number of stores to be predicated behind an if."));
253 
254 static cl::opt<bool> EnableIndVarRegisterHeur(
255     "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
256     cl::desc("Count the induction variable only once when interleaving"));
257 
258 static cl::opt<bool> EnableCondStoresVectorization(
259     "enable-cond-stores-vec", cl::init(true), cl::Hidden,
260     cl::desc("Enable if predication of stores during vectorization."));
261 
262 static cl::opt<unsigned> MaxNestedScalarReductionIC(
263     "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
264     cl::desc("The maximum interleave count to use when interleaving a scalar "
265              "reduction in a nested loop."));
266 
267 cl::opt<bool> EnableVPlanNativePath(
268     "enable-vplan-native-path", cl::init(false), cl::Hidden,
269     cl::desc("Enable VPlan-native vectorization path with "
270              "support for outer loop vectorization."));
271 
272 // FIXME: Remove this switch once we have divergence analysis. Currently we
273 // assume divergent non-backedge branches when this switch is true.
274 cl::opt<bool> EnableVPlanPredication(
275     "enable-vplan-predication", cl::init(false), cl::Hidden,
276     cl::desc("Enable VPlan-native vectorization path predicator with "
277              "support for outer loop vectorization."));
278 
279 // This flag enables the stress testing of the VPlan H-CFG construction in the
280 // VPlan-native vectorization path. It must be used in conjunction with
281 // -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
282 // verification of the H-CFGs built.
283 static cl::opt<bool> VPlanBuildStressTest(
284     "vplan-build-stress-test", cl::init(false), cl::Hidden,
285     cl::desc(
286         "Build VPlan for every supported loop nest in the function and bail "
287         "out right after the build (stress test the VPlan H-CFG construction "
288         "in the VPlan-native vectorization path)."));
289 
290 cl::opt<bool> llvm::EnableLoopInterleaving(
291     "interleave-loops", cl::init(true), cl::Hidden,
292     cl::desc("Enable loop interleaving in Loop vectorization passes"));
293 cl::opt<bool> llvm::EnableLoopVectorization(
294     "vectorize-loops", cl::init(true), cl::Hidden,
295     cl::desc("Run the Loop vectorization passes"));
296 
297 /// A helper function for converting Scalar types to vector types.
298 /// If the incoming type is void, we return void. If the VF is 1, we return
299 /// the scalar type.
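/// For example, ToVectorTy(i32, 4) yields <4 x i32>, while ToVectorTy(i32, 1)
/// and ToVectorTy(void, VF) return the incoming type unchanged.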
300 static Type *ToVectorTy(Type *Scalar, unsigned VF) {
301   if (Scalar->isVoidTy() || VF == 1)
302     return Scalar;
303   return VectorType::get(Scalar, VF);
304 }
305 
306 /// A helper function that returns the type of loaded or stored value.
307 static Type *getMemInstValueType(Value *I) {
308   assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
309          "Expected Load or Store instruction");
310   if (auto *LI = dyn_cast<LoadInst>(I))
311     return LI->getType();
312   return cast<StoreInst>(I)->getValueOperand()->getType();
313 }
314 
315 /// A helper function that returns true if the given type is irregular. The
316 /// type is irregular if its allocated size doesn't equal the store size of an
317 /// element of the corresponding vector type at the given vectorization factor.
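/// For example (assuming a common x86 data layout), x86_fp80 has a 10-byte
/// store size padded to a larger alloc size, so an array of x86_fp80 is not
/// bitcast-compatible with a tightly packed <VF x x86_fp80> vector, whereas
/// i32 is regular.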
318 static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
319   // Determine if an array of VF elements of type Ty is "bitcast compatible"
320   // with a <VF x Ty> vector.
321   if (VF > 1) {
322     auto *VectorTy = VectorType::get(Ty, VF);
323     return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
324   }
325 
326   // If the vectorization factor is one, we just check if an array of type Ty
327   // requires padding between elements.
328   return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
329 }
330 
331 /// A helper function that returns the reciprocal of the block probability of
332 /// predicated blocks. If we return X, we are assuming the predicated block
333 /// will execute once for every X iterations of the loop header.
334 ///
335 /// TODO: We should use actual block probability here, if available. Currently,
336 ///       we always assume predicated blocks have a 50% chance of executing.
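/// For example, returning 2 means callers divide the cost of predicated code
/// by 2, i.e. they model it as executing on roughly half of the iterations.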
337 static unsigned getReciprocalPredBlockProb() { return 2; }
338 
339 /// A helper function that adds a 'fast' flag to floating-point operations.
340 static Value *addFastMathFlag(Value *V) {
341   if (isa<FPMathOperator>(V))
342     cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
343   return V;
344 }
345 
346 static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
347   if (isa<FPMathOperator>(V))
348     cast<Instruction>(V)->setFastMathFlags(FMF);
349   return V;
350 }
351 
352 /// A helper function that returns an integer or floating-point constant with
353 /// value C.
354 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
355   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
356                            : ConstantFP::get(Ty, C);
357 }
358 
359 /// Returns "best known" trip count for the specified loop \p L as defined by
360 /// the following procedure:
361 ///   1) Returns exact trip count if it is known.
362 ///   2) Returns expected trip count according to profile data if any.
363 ///   3) Returns upper bound estimate if it is known.
364 ///   4) Returns None if all of the above failed.
365 static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
366   // Check if exact trip count is known.
367   if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
368     return ExpectedTC;
369 
370   // Check if there is an expected trip count available from profile data.
371   if (LoopVectorizeWithBlockFrequency)
372     if (auto EstimatedTC = getLoopEstimatedTripCount(L))
373       return EstimatedTC;
374 
375   // Check if upper bound estimate is known.
376   if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
377     return ExpectedTC;
378 
379   return None;
380 }
381 
382 namespace llvm {
383 
384 /// InnerLoopVectorizer vectorizes loops which contain only one basic
385 /// block to a specified vectorization factor (VF).
386 /// This class performs the widening of scalars into vectors, or multiple
387 /// scalars. This class also implements the following features:
388 /// * It inserts an epilogue loop for handling loops that don't have iteration
389 ///   counts that are known to be a multiple of the vectorization factor.
390 /// * It handles the code generation for reduction variables.
391 /// * Scalarization (implementation using scalars) of un-vectorizable
392 ///   instructions.
393 /// InnerLoopVectorizer does not perform any vectorization-legality
394 /// checks, and relies on the caller to check for the different legality
395 /// aspects. The InnerLoopVectorizer relies on the
396 /// LoopVectorizationLegality class to provide information about the induction
397 /// and reduction variables that were found to a given vectorization factor.
398 class InnerLoopVectorizer {
399 public:
400   InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
401                       LoopInfo *LI, DominatorTree *DT,
402                       const TargetLibraryInfo *TLI,
403                       const TargetTransformInfo *TTI, AssumptionCache *AC,
404                       OptimizationRemarkEmitter *ORE, unsigned VecWidth,
405                       unsigned UnrollFactor, LoopVectorizationLegality *LVL,
406                       LoopVectorizationCostModel *CM)
407       : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
408         AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
409         Builder(PSE.getSE()->getContext()),
410         VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM) {}
411   virtual ~InnerLoopVectorizer() = default;
412 
413   /// Create a new empty loop. Unlink the old loop and connect the new one.
414   /// Return the pre-header block of the new loop.
415   BasicBlock *createVectorizedLoopSkeleton();
416 
417   /// Widen a single instruction within the innermost loop.
418   void widenInstruction(Instruction &I);
419 
420   /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
421   void fixVectorizedLoop();
422 
423   // Return true if any runtime check is added.
424   bool areSafetyChecksAdded() { return AddedSafetyChecks; }
425 
426   /// A type for vectorized values in the new loop. Each value from the
427   /// original loop, when vectorized, is represented by UF vector values in the
428   /// new unrolled loop, where UF is the unroll factor.
429   using VectorParts = SmallVector<Value *, 2>;
430 
431   /// Vectorize a single GetElementPtrInst based on information gathered and
432   /// decisions taken during planning.
433   void widenGEP(GetElementPtrInst *GEP, unsigned UF, unsigned VF,
434                 bool IsPtrLoopInvariant, SmallBitVector &IsIndexLoopInvariant);
435 
436   /// Vectorize a single PHINode in a block. This method handles the induction
437   /// variable canonicalization. It supports both VF = 1 for unrolled loops and
438   /// arbitrary length vectors.
439   void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);
440 
441   /// A helper function to scalarize a single Instruction in the innermost loop.
442   /// Generates a sequence of scalar instances for each lane between \p MinLane
443   /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
444   /// inclusive.
445   void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance,
446                             bool IfPredicateInstr);
447 
448   /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
449   /// is provided, the integer induction variable will first be truncated to
450   /// the corresponding type.
451   void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
452 
453   /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
454   /// vector or scalar value on-demand if one is not yet available. When
455   /// vectorizing a loop, we visit the definition of an instruction before its
456   /// uses. When visiting the definition, we either vectorize or scalarize the
457   /// instruction, creating an entry for it in the corresponding map. (In some
458   /// cases, such as induction variables, we will create both vector and scalar
459   /// entries.) Then, as we encounter uses of the definition, we derive values
460   /// for each scalar or vector use unless such a value is already available.
461   /// For example, if we scalarize a definition and one of its uses is vector,
462   /// we build the required vector on-demand with an insertelement sequence
463   /// when visiting the use. Otherwise, if the use is scalar, we can use the
464   /// existing scalar definition.
465   ///
466   /// Return a value in the new loop corresponding to \p V from the original
467   /// loop at unroll index \p Part. If the value has already been vectorized,
468   /// the corresponding vector entry in VectorLoopValueMap is returned. If,
469   /// however, the value has a scalar entry in VectorLoopValueMap, we construct
470   /// a new vector value on-demand by inserting the scalar values into a vector
471   /// with an insertelement sequence. If the value has been neither vectorized
472   /// nor scalarized, it must be loop invariant, so we simply broadcast the
473   /// value into a vector.
474   Value *getOrCreateVectorValue(Value *V, unsigned Part);
475 
476   /// Return a value in the new loop corresponding to \p V from the original
477   /// loop at unroll and vector indices \p Instance. If the value has been
478   /// vectorized but not scalarized, the necessary extractelement instruction
479   /// will be generated.
480   Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);
481 
482   /// Construct the vector value of a scalarized value \p V one lane at a time.
483   void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
484 
485   /// Try to vectorize the interleaved access group that \p Instr belongs to
486   /// with the base address given in \p Addr, optionally masking the vector
487   /// operations if \p BlockInMask is non-null. Use \p State to translate given
488   /// VPValues to IR values in the vectorized loop.
489   void vectorizeInterleaveGroup(Instruction *Instr, VPTransformState &State,
490                                 VPValue *Addr, VPValue *BlockInMask = nullptr);
491 
492   /// Vectorize Load and Store instructions with the base address given in \p
493   /// Addr, optionally masking the vector operations if \p BlockInMask is
494   /// non-null. Use \p State to translate given VPValues to IR values in the
495   /// vectorized loop.
496   void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
497                                   VPValue *Addr,
498                                   VPValue *BlockInMask = nullptr);
499 
500   /// Set the debug location in the builder using the debug location in
501   /// the instruction.
502   void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
503 
504   /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
505   void fixNonInductionPHIs(void);
506 
507 protected:
508   friend class LoopVectorizationPlanner;
509 
510   /// A small list of PHINodes.
511   using PhiVector = SmallVector<PHINode *, 4>;
512 
513   /// A type for scalarized values in the new loop. Each value from the
514   /// original loop, when scalarized, is represented by UF x VF scalar values
515   /// in the new unrolled loop, where UF is the unroll factor and VF is the
516   /// vectorization factor.
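  /// For example, with UF = 2 and VF = 4, a scalarized value is represented
  /// by 8 scalars, organized as 2 unroll parts of 4 lanes each.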
517   using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
518 
519   /// Set up the values of the IVs correctly when exiting the vector loop.
520   void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
521                     Value *CountRoundDown, Value *EndValue,
522                     BasicBlock *MiddleBlock);
523 
524   /// Create a new induction variable inside L.
525   PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
526                                    Value *Step, Instruction *DL);
527 
528   /// Handle all cross-iteration phis in the header.
529   void fixCrossIterationPHIs();
530 
531   /// Fix a first-order recurrence. This is the second phase of vectorizing
532   /// this phi node.
533   void fixFirstOrderRecurrence(PHINode *Phi);
534 
535   /// Fix a reduction cross-iteration phi. This is the second phase of
536   /// vectorizing this phi node.
537   void fixReduction(PHINode *Phi);
538 
539   /// Clear NSW/NUW flags from reduction instructions if necessary.
540   void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);
541 
542   /// The Loop exit block may have single value PHI nodes with some
543   /// incoming value. While vectorizing we only handled real values
544   /// that were defined inside the loop and we should have one value for
545   /// each predecessor of its parent basic block. See PR14725.
546   void fixLCSSAPHIs();
547 
548   /// Iteratively sink the scalarized operands of a predicated instruction into
549   /// the block that was created for it.
550   void sinkScalarOperands(Instruction *PredInst);
551 
552   /// Shrinks vector element sizes to the smallest bitwidth they can be legally
553   /// represented as.
554   void truncateToMinimalBitwidths();
555 
556   /// Create a broadcast instruction. This method generates a broadcast
557   /// instruction (shuffle) for loop invariant values and for the induction
558   /// value. If this is the induction variable then we extend it to N, N+1, ...
559   /// this is needed because each iteration in the loop corresponds to a SIMD
560   /// element.
561   virtual Value *getBroadcastInstrs(Value *V);
562 
563   /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
564   /// to each vector element of Val. The sequence starts at StartIdx.
565   /// \p Opcode is relevant for FP induction variable.
566   virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
567                                Instruction::BinaryOps Opcode =
568                                Instruction::BinaryOpsEnd);
569 
570   /// Compute scalar induction steps. \p ScalarIV is the scalar induction
571   /// variable on which to base the steps, \p Step is the size of the step, and
572   /// \p EntryVal is the value from the original loop that maps to the steps.
573   /// Note that \p EntryVal doesn't have to be an induction variable - it
574   /// can also be a truncate instruction.
575   void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
576                         const InductionDescriptor &ID);
577 
578   /// Create a vector induction phi node based on an existing scalar one. \p
579   /// EntryVal is the value from the original loop that maps to the vector phi
580   /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
581   /// truncate instruction, instead of widening the original IV, we widen a
582   /// version of the IV truncated to \p EntryVal's type.
583   void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
584                                        Value *Step, Instruction *EntryVal);
585 
586   /// Returns true if an instruction \p I should be scalarized instead of
587   /// vectorized for the chosen vectorization factor.
588   bool shouldScalarizeInstruction(Instruction *I) const;
589 
590   /// Returns true if we should generate a scalar version of \p IV.
591   bool needsScalarInduction(Instruction *IV) const;
592 
593   /// If there is a cast involved in the induction variable \p ID, which should
594   /// be ignored in the vectorized loop body, this function records the
595   /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
596   /// cast. We had already proved that the casted Phi is equal to the uncasted
597   /// Phi in the vectorized loop (under a runtime guard), and therefore
598   /// there is no need to vectorize the cast - the same value can be used in the
599   /// vector loop for both the Phi and the cast.
600   /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
601   /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
602   ///
603   /// \p EntryVal is the value from the original loop that maps to the vector
604   /// phi node and is used to distinguish what is the IV currently being
605   /// processed - original one (if \p EntryVal is a phi corresponding to the
606   /// original IV) or the "newly-created" one based on the proof mentioned above
607   /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the
608   /// latter case \p EntryVal is a TruncInst and we must not record anything for
609   /// that IV, but it's error-prone to expect callers of this routine to care
610   /// about that, hence this explicit parameter.
611   void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
612                                              const Instruction *EntryVal,
613                                              Value *VectorLoopValue,
614                                              unsigned Part,
615                                              unsigned Lane = UINT_MAX);
616 
617   /// Generate a shuffle sequence that will reverse the vector Vec.
618   virtual Value *reverseVector(Value *Vec);
619 
620   /// Returns (and creates if needed) the original loop trip count.
621   Value *getOrCreateTripCount(Loop *NewLoop);
622 
623   /// Returns (and creates if needed) the trip count of the widened loop.
624   Value *getOrCreateVectorTripCount(Loop *NewLoop);
625 
626   /// Returns a bitcasted value to the requested vector type.
627   /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
628   Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
629                                 const DataLayout &DL);
630 
631   /// Emit a bypass check to see if the vector trip count is zero, including if
632   /// it overflows.
633   void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
634 
635   /// Emit a bypass check to see if all of the SCEV assumptions we've
636   /// had to make are correct.
637   void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
638 
639   /// Emit bypass checks to check any memory assumptions we may have made.
640   void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
641 
642   /// Compute the transformed value of Index at offset StartValue using step
643   /// StepValue.
644   /// For integer induction, returns StartValue + Index * StepValue.
645   /// For pointer induction, returns StartValue[Index * StepValue].
646   /// FIXME: The newly created binary instructions should contain nsw/nuw
647   /// flags, which can be found from the original scalar operations.
648   Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
649                               const DataLayout &DL,
650                               const InductionDescriptor &ID) const;
651 
652   /// Add additional metadata to \p To that was not present on \p Orig.
653   ///
654   /// Currently this is used to add the noalias annotations based on the
655   /// inserted memchecks.  Use this for instructions that are *cloned* into the
656   /// vector loop.
657   void addNewMetadata(Instruction *To, const Instruction *Orig);
658 
659   /// Add metadata from one instruction to another.
660   ///
661   /// This includes both the original MDs from \p From and additional ones (\see
662   /// addNewMetadata).  Use this for *newly created* instructions in the vector
663   /// loop.
664   void addMetadata(Instruction *To, Instruction *From);
665 
666   /// Similar to the previous function but it adds the metadata to a
667   /// vector of instructions.
668   void addMetadata(ArrayRef<Value *> To, Instruction *From);
669 
670   /// The original loop.
671   Loop *OrigLoop;
672 
673   /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
674   /// dynamic knowledge to simplify SCEV expressions and converts them to a
675   /// more usable form.
676   PredicatedScalarEvolution &PSE;
677 
678   /// Loop Info.
679   LoopInfo *LI;
680 
681   /// Dominator Tree.
682   DominatorTree *DT;
683 
684   /// Alias Analysis.
685   AliasAnalysis *AA;
686 
687   /// Target Library Info.
688   const TargetLibraryInfo *TLI;
689 
690   /// Target Transform Info.
691   const TargetTransformInfo *TTI;
692 
693   /// Assumption Cache.
694   AssumptionCache *AC;
695 
696   /// Interface to emit optimization remarks.
697   OptimizationRemarkEmitter *ORE;
698 
699   /// LoopVersioning.  It's only set up (non-null) if memchecks were
700   /// used.
701   ///
702   /// This is currently only used to add no-alias metadata based on the
703   /// memchecks.  The actual versioning is performed manually.
704   std::unique_ptr<LoopVersioning> LVer;
705 
706   /// The vectorization SIMD factor to use. Each vector will have this many
707   /// vector elements.
708   unsigned VF;
709 
710   /// The vectorization unroll factor to use. Each scalar is vectorized to this
711   /// many different vector instructions.
712   unsigned UF;
713 
714   /// The builder that we use
715   IRBuilder<> Builder;
716 
717   // --- Vectorization state ---
718 
719   /// The vector-loop preheader.
720   BasicBlock *LoopVectorPreHeader;
721 
722   /// The scalar-loop preheader.
723   BasicBlock *LoopScalarPreHeader;
724 
725   /// Middle Block between the vector and the scalar.
726   BasicBlock *LoopMiddleBlock;
727 
728   /// The ExitBlock of the scalar loop.
729   BasicBlock *LoopExitBlock;
730 
731   /// The vector loop body.
732   BasicBlock *LoopVectorBody;
733 
734   /// The scalar loop body.
735   BasicBlock *LoopScalarBody;
736 
737   /// A list of all bypass blocks. The first block is the entry of the loop.
738   SmallVector<BasicBlock *, 4> LoopBypassBlocks;
739 
740   /// The new Induction variable which was added to the new block.
741   PHINode *Induction = nullptr;
742 
743   /// The induction variable of the old basic block.
744   PHINode *OldInduction = nullptr;
745 
746   /// Maps values from the original loop to their corresponding values in the
747   /// vectorized loop. A key value can map to either vector values, scalar
748   /// values or both kinds of values, depending on whether the key was
749   /// vectorized and scalarized.
750   VectorizerValueMap VectorLoopValueMap;
751 
752   /// Store instructions that were predicated.
753   SmallVector<Instruction *, 4> PredicatedInstructions;
754 
755   /// Trip count of the original loop.
756   Value *TripCount = nullptr;
757 
758   /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
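  /// For example, with TripCount = 100, VF = 4 and UF = 2, this is
  /// 100 - (100 % 8) = 96.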
759   Value *VectorTripCount = nullptr;
760 
761   /// The legality analysis.
762   LoopVectorizationLegality *Legal;
763 
764   /// The profitability analysis.
765   LoopVectorizationCostModel *Cost;
766 
767   // Record whether runtime checks are added.
768   bool AddedSafetyChecks = false;
769 
770   // Holds the end values for each induction variable. We save the end values
771   // so we can later fix-up the external users of the induction variables.
772   DenseMap<PHINode *, Value *> IVEndValues;
773 
774   // Vector of original scalar PHIs whose corresponding widened PHIs need to be
775   // fixed up at the end of vector code generation.
776   SmallVector<PHINode *, 8> OrigPHIsToFix;
777 };
778 
779 class InnerLoopUnroller : public InnerLoopVectorizer {
780 public:
781   InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
782                     LoopInfo *LI, DominatorTree *DT,
783                     const TargetLibraryInfo *TLI,
784                     const TargetTransformInfo *TTI, AssumptionCache *AC,
785                     OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
786                     LoopVectorizationLegality *LVL,
787                     LoopVectorizationCostModel *CM)
788       : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE, 1,
789                             UnrollFactor, LVL, CM) {}
790 
791 private:
792   Value *getBroadcastInstrs(Value *V) override;
793   Value *getStepVector(Value *Val, int StartIdx, Value *Step,
794                        Instruction::BinaryOps Opcode =
795                        Instruction::BinaryOpsEnd) override;
796   Value *reverseVector(Value *Vec) override;
797 };
798 
799 } // end namespace llvm
800 
801 /// Look for a meaningful debug location on the instruction or its
802 /// operands.
803 static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
804   if (!I)
805     return I;
806 
807   DebugLoc Empty;
808   if (I->getDebugLoc() != Empty)
809     return I;
810 
811   for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
812     if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
813       if (OpInst->getDebugLoc() != Empty)
814         return OpInst;
815   }
816 
817   return I;
818 }
819 
820 void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
821   if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
822     const DILocation *DIL = Inst->getDebugLoc();
823     if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
824         !isa<DbgInfoIntrinsic>(Inst)) {
825       auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(UF * VF);
826       if (NewDIL)
827         B.SetCurrentDebugLocation(NewDIL.getValue());
828       else
829         LLVM_DEBUG(dbgs()
830                    << "Failed to create new discriminator: "
831                    << DIL->getFilename() << " Line: " << DIL->getLine());
832     }
833     else
834       B.SetCurrentDebugLocation(DIL);
835   } else
836     B.SetCurrentDebugLocation(DebugLoc());
837 }
838 
839 /// Write a record \p DebugMsg about vectorization failure to the debug
840 /// output stream. If \p I is passed, it is an instruction that prevents
841 /// vectorization.
842 #ifndef NDEBUG
843 static void debugVectorizationFailure(const StringRef DebugMsg,
844     Instruction *I) {
845   dbgs() << "LV: Not vectorizing: " << DebugMsg;
846   if (I != nullptr)
847     dbgs() << " " << *I;
848   else
849     dbgs() << '.';
850   dbgs() << '\n';
851 }
852 #endif
853 
854 /// Create an analysis remark that explains why vectorization failed
855 ///
856 /// \p PassName is the name of the pass (e.g. can be AlwaysPrint).  \p
857 /// RemarkName is the identifier for the remark.  If \p I is passed it is an
858 /// instruction that prevents vectorization.  Otherwise \p TheLoop is used for
859 /// the location of the remark.  \return the remark object that can be
860 /// streamed to.
861 static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
862     StringRef RemarkName, Loop *TheLoop, Instruction *I) {
863   Value *CodeRegion = TheLoop->getHeader();
864   DebugLoc DL = TheLoop->getStartLoc();
865 
866   if (I) {
867     CodeRegion = I->getParent();
868     // If there is no debug location attached to the instruction, revert back to
869     // using the loop's.
870     if (I->getDebugLoc())
871       DL = I->getDebugLoc();
872   }
873 
874   OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
875   R << "loop not vectorized: ";
876   return R;
877 }
878 
879 namespace llvm {
880 
881 void reportVectorizationFailure(const StringRef DebugMsg,
882     const StringRef OREMsg, const StringRef ORETag,
883     OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
884   LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
885   LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
886   ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
887                 ORETag, TheLoop, I) << OREMsg);
888 }
889 
890 } // end namespace llvm
891 
892 #ifndef NDEBUG
893 /// \return string containing a file name and a line # for the given loop.
894 static std::string getDebugLocString(const Loop *L) {
895   std::string Result;
896   if (L) {
897     raw_string_ostream OS(Result);
898     if (const DebugLoc LoopDbgLoc = L->getStartLoc())
899       LoopDbgLoc.print(OS);
900     else
901       // Just print the module name.
902       OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
903     OS.flush();
904   }
905   return Result;
906 }
907 #endif
908 
909 void InnerLoopVectorizer::addNewMetadata(Instruction *To,
910                                          const Instruction *Orig) {
911   // If the loop was versioned with memchecks, add the corresponding no-alias
912   // metadata.
913   if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
914     LVer->annotateInstWithNoAlias(To, Orig);
915 }
916 
917 void InnerLoopVectorizer::addMetadata(Instruction *To,
918                                       Instruction *From) {
919   propagateMetadata(To, From);
920   addNewMetadata(To, From);
921 }
922 
923 void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
924                                       Instruction *From) {
925   for (Value *V : To) {
926     if (Instruction *I = dyn_cast<Instruction>(V))
927       addMetadata(I, From);
928   }
929 }
930 
931 namespace llvm {
932 
933 // Loop vectorization cost-model hints how the scalar epilogue loop should be
934 // lowered.
935 enum ScalarEpilogueLowering {
936 
937   // The default: allowing scalar epilogues.
938   CM_ScalarEpilogueAllowed,
939 
940   // Vectorization with OptForSize: don't allow epilogues.
941   CM_ScalarEpilogueNotAllowedOptSize,
942 
943   // A special case of vectorization with OptForSize: loops with a very small
944   // trip count are considered for vectorization under OptForSize, thereby
945   // making sure the cost of their loop body is dominant, free of runtime
946   // guards and scalar iteration overheads.
947   CM_ScalarEpilogueNotAllowedLowTripLoop,
948 
949   // Loop hint predicate indicating an epilogue is undesired.
950   CM_ScalarEpilogueNotNeededUsePredicate
951 };
952 
953 /// LoopVectorizationCostModel - estimates the expected speedups due to
954 /// vectorization.
955 /// In many cases vectorization is not profitable. This can happen because of
956 /// a number of reasons. In this class we mainly attempt to predict the
957 /// expected speedup/slowdowns due to the supported instruction set. We use the
958 /// TargetTransformInfo to query the different backends for the cost of
959 /// different operations.
960 class LoopVectorizationCostModel {
961 public:
962   LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
963                              PredicatedScalarEvolution &PSE, LoopInfo *LI,
964                              LoopVectorizationLegality *Legal,
965                              const TargetTransformInfo &TTI,
966                              const TargetLibraryInfo *TLI, DemandedBits *DB,
967                              AssumptionCache *AC,
968                              OptimizationRemarkEmitter *ORE, const Function *F,
969                              const LoopVectorizeHints *Hints,
970                              InterleavedAccessInfo &IAI)
971       : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
972         TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
973         Hints(Hints), InterleaveInfo(IAI) {}
974 
975   /// \return An upper bound for the vectorization factor, or None if
976   /// vectorization and interleaving should be avoided up front.
977   Optional<unsigned> computeMaxVF();
978 
979   /// \return True if runtime checks are required for vectorization, and false
980   /// otherwise.
981   bool runtimeChecksRequired();
982 
983   /// \return The most profitable vectorization factor and the cost of that VF.
984   /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
985   /// then this vectorization factor will be selected if vectorization is
986   /// possible.
987   VectorizationFactor selectVectorizationFactor(unsigned MaxVF);
988 
989   /// Setup cost-based decisions for user vectorization factor.
990   void selectUserVectorizationFactor(unsigned UserVF) {
991     collectUniformsAndScalars(UserVF);
992     collectInstsToScalarize(UserVF);
993   }
994 
995   /// \return The size (in bits) of the smallest and widest types in the code
996   /// that needs to be vectorized. We ignore values that remain scalar such as
997   /// 64 bit loop indices.
998   std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
999 
1000   /// \return The desired interleave count.
1001   /// If interleave count has been specified by metadata it will be returned.
1002   /// Otherwise, the interleave count is computed and returned. VF and LoopCost
1003   /// are the selected vectorization factor and the cost of the selected VF.
1004   unsigned selectInterleaveCount(unsigned VF, unsigned LoopCost);
1005 
1006   /// Memory access instruction may be vectorized in more than one way.
1007   /// Form of instruction after vectorization depends on cost.
1008   /// This function takes cost-based decisions for Load/Store instructions
1009   /// and collects them in a map. This decisions map is used for building
1010   /// the lists of loop-uniform and loop-scalar instructions.
1011   /// The calculated cost is saved with the widening decision in order to
1012   /// avoid redundant calculations.
1013   void setCostBasedWideningDecision(unsigned VF);
1014 
1015   /// A struct that represents some properties of the register usage
1016   /// of a loop.
1017   struct RegisterUsage {
1018     /// Holds the number of loop invariant values that are used in the loop.
1019     /// The key is ClassID of target-provided register class.
1020     SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
1021     /// Holds the maximum number of concurrent live intervals in the loop.
1022     /// The key is ClassID of target-provided register class.
1023     SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
1024   };
1025 
1026   /// \return Returns information about the register usages of the loop for the
1027   /// given vectorization factors.
1028   SmallVector<RegisterUsage, 8> calculateRegisterUsage(ArrayRef<unsigned> VFs);
1029 
1030   /// Collect values we want to ignore in the cost model.
1031   void collectValuesToIgnore();
1032 
1033   /// \returns The smallest bitwidth each instruction can be represented with.
1034   /// The vector equivalents of these instructions should be truncated to this
1035   /// type.
1036   const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
1037     return MinBWs;
1038   }
1039 
1040   /// \returns True if it is more profitable to scalarize instruction \p I for
1041   /// vectorization factor \p VF.
1042   bool isProfitableToScalarize(Instruction *I, unsigned VF) const {
1043     assert(VF > 1 && "Profitable to scalarize relevant only for VF > 1.");
1044 
1045     // Cost model is not run in the VPlan-native path - return conservative
1046     // result until this changes.
1047     if (EnableVPlanNativePath)
1048       return false;
1049 
1050     auto Scalars = InstsToScalarize.find(VF);
1051     assert(Scalars != InstsToScalarize.end() &&
1052            "VF not yet analyzed for scalarization profitability");
1053     return Scalars->second.find(I) != Scalars->second.end();
1054   }
1055 
1056   /// Returns true if \p I is known to be uniform after vectorization.
1057   bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
1058     if (VF == 1)
1059       return true;
1060 
1061     // Cost model is not run in the VPlan-native path - return conservative
1062     // result until this changes.
1063     if (EnableVPlanNativePath)
1064       return false;
1065 
1066     auto UniformsPerVF = Uniforms.find(VF);
1067     assert(UniformsPerVF != Uniforms.end() &&
1068            "VF not yet analyzed for uniformity");
1069     return UniformsPerVF->second.find(I) != UniformsPerVF->second.end();
1070   }
1071 
1072   /// Returns true if \p I is known to be scalar after vectorization.
1073   bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
1074     if (VF == 1)
1075       return true;
1076 
1077     // Cost model is not run in the VPlan-native path - return conservative
1078     // result until this changes.
1079     if (EnableVPlanNativePath)
1080       return false;
1081 
1082     auto ScalarsPerVF = Scalars.find(VF);
1083     assert(ScalarsPerVF != Scalars.end() &&
1084            "Scalar values are not calculated for VF");
1085     return ScalarsPerVF->second.find(I) != ScalarsPerVF->second.end();
1086   }
1087 
1088   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
1089   /// for vectorization factor \p VF.
1090   bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
1091     return VF > 1 && MinBWs.find(I) != MinBWs.end() &&
1092            !isProfitableToScalarize(I, VF) &&
1093            !isScalarAfterVectorization(I, VF);
1094   }
1095 
1096   /// Decision that was taken during cost calculation for memory instruction.
1097   enum InstWidening {
1098     CM_Unknown,
1099     CM_Widen,         // For consecutive accesses with stride +1.
1100     CM_Widen_Reverse, // For consecutive accesses with stride -1.
1101     CM_Interleave,
1102     CM_GatherScatter,
1103     CM_Scalarize
1104   };
1105 
1106   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1107   /// instruction \p I and vector width \p VF.
1108   void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
1109                            unsigned Cost) {
1110     assert(VF >= 2 && "Expected VF >=2");
1111     WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1112   }
1113 
1114   /// Save vectorization decision \p W and \p Cost taken by the cost model for
1115   /// interleaving group \p Grp and vector width \p VF.
1116   void setWideningDecision(const InterleaveGroup<Instruction> *Grp, unsigned VF,
1117                            InstWidening W, unsigned Cost) {
1118     assert(VF >= 2 && "Expected VF >=2");
1119     /// Broadcast this decision to all instructions inside the group.
1120     /// But the cost will be assigned to one instruction only.
1121     for (unsigned i = 0; i < Grp->getFactor(); ++i) {
1122       if (auto *I = Grp->getMember(i)) {
1123         if (Grp->getInsertPos() == I)
1124           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
1125         else
1126           WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
1127       }
1128     }
1129   }
1130 
1131   /// Return the cost model decision for the given instruction \p I and vector
1132   /// width \p VF. Return CM_Unknown if this instruction did not pass
1133   /// through the cost modeling.
1134   InstWidening getWideningDecision(Instruction *I, unsigned VF) {
1135     assert(VF >= 2 && "Expected VF >=2");
1136 
1137     // Cost model is not run in the VPlan-native path - return conservative
1138     // result until this changes.
1139     if (EnableVPlanNativePath)
1140       return CM_GatherScatter;
1141 
1142     std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1143     auto Itr = WideningDecisions.find(InstOnVF);
1144     if (Itr == WideningDecisions.end())
1145       return CM_Unknown;
1146     return Itr->second.first;
1147   }
1148 
1149   /// Return the vectorization cost for the given instruction \p I and vector
1150   /// width \p VF.
1151   unsigned getWideningCost(Instruction *I, unsigned VF) {
1152     assert(VF >= 2 && "Expected VF >=2");
1153     std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
1154     assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
1155            "The cost is not calculated");
1156     return WideningDecisions[InstOnVF].second;
1157   }
1158 
1159   /// Return True if instruction \p I is an optimizable truncate whose operand
1160   /// is an induction variable. Such a truncate will be removed by adding a new
1161   /// induction variable with the destination type.
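  /// For example, a truncate such as %t = trunc i64 %iv to i32 of an
  /// induction variable can typically be rewritten as a new i32 induction
  /// variable rather than being widened.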
1162   bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
1163     // If the instruction is not a truncate, return false.
1164     auto *Trunc = dyn_cast<TruncInst>(I);
1165     if (!Trunc)
1166       return false;
1167 
1168     // Get the source and destination types of the truncate.
1169     Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
1170     Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
1171 
1172     // If the truncate is free for the given types, return false. Replacing a
1173     // free truncate with an induction variable would add an induction variable
1174     // update instruction to each iteration of the loop. We exclude from this
1175     // check the primary induction variable since it will need an update
1176     // instruction regardless.
1177     Value *Op = Trunc->getOperand(0);
1178     if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
1179       return false;
1180 
1181     // If the truncated value is not an induction variable, return false.
1182     return Legal->isInductionPhi(Op);
1183   }
1184 
1185   /// Collects the instructions to scalarize for each predicated instruction in
1186   /// the loop.
1187   void collectInstsToScalarize(unsigned VF);
1188 
1189   /// Collect Uniform and Scalar values for the given \p VF.
1190   /// The sets depend on CM decision for Load/Store instructions
1191   /// that may be vectorized as interleave, gather-scatter or scalarized.
1192   void collectUniformsAndScalars(unsigned VF) {
1193     // Do the analysis once.
1194     if (VF == 1 || Uniforms.find(VF) != Uniforms.end())
1195       return;
1196     setCostBasedWideningDecision(VF);
1197     collectLoopUniforms(VF);
1198     collectLoopScalars(VF);
1199   }
1200 
1201   /// Returns true if the target machine supports masked store operation
1202   /// for the given \p DataType and kind of access to \p Ptr.
1203   bool isLegalMaskedStore(Type *DataType, Value *Ptr, MaybeAlign Alignment) {
1204     return Legal->isConsecutivePtr(Ptr) &&
1205            TTI.isLegalMaskedStore(DataType, Alignment);
1206   }
1207 
1208   /// Returns true if the target machine supports masked load operation
1209   /// for the given \p DataType and kind of access to \p Ptr.
1210   bool isLegalMaskedLoad(Type *DataType, Value *Ptr, MaybeAlign Alignment) {
1211     return Legal->isConsecutivePtr(Ptr) &&
1212            TTI.isLegalMaskedLoad(DataType, Alignment);
1213   }
1214 
1215   /// Returns true if the target machine supports masked scatter operation
1216   /// for the given \p DataType.
1217   bool isLegalMaskedScatter(Type *DataType, MaybeAlign Alignment) {
1218     return TTI.isLegalMaskedScatter(DataType, Alignment);
1219   }
1220 
1221   /// Returns true if the target machine supports masked gather operation
1222   /// for the given \p DataType.
1223   bool isLegalMaskedGather(Type *DataType, MaybeAlign Alignment) {
1224     return TTI.isLegalMaskedGather(DataType, Alignment);
1225   }
1226 
1227   /// Returns true if the target machine can represent \p V as a masked gather
1228   /// or scatter operation.
1229   bool isLegalGatherOrScatter(Value *V) {
1230     bool LI = isa<LoadInst>(V);
1231     bool SI = isa<StoreInst>(V);
1232     if (!LI && !SI)
1233       return false;
1234     auto *Ty = getMemInstValueType(V);
1235     MaybeAlign Align = getLoadStoreAlignment(V);
1236     return (LI && isLegalMaskedGather(Ty, Align)) ||
1237            (SI && isLegalMaskedScatter(Ty, Align));
1238   }
1239 
1240   /// Returns true if \p I is an instruction that will be scalarized with
1241   /// predication. Such instructions include conditional stores and
1242   /// instructions that may divide by zero.
1243   /// If a non-zero VF has been calculated, we check if I will be scalarized
1244   /// with predication for that VF.
1245   bool isScalarWithPredication(Instruction *I, unsigned VF = 1);
1246 
1247   // Returns true if \p I is an instruction that will be predicated either
1248   // through scalar predication or masked load/store or masked gather/scatter.
1249   // Superset of instructions that return true for isScalarWithPredication.
1250   bool isPredicatedInst(Instruction *I) {
1251     if (!blockNeedsPredication(I->getParent()))
1252       return false;
1253     // Loads and stores that need some form of masked operation are predicated
1254     // instructions.
1255     if (isa<LoadInst>(I) || isa<StoreInst>(I))
1256       return Legal->isMaskRequired(I);
1257     return isScalarWithPredication(I);
1258   }
1259 
1260   /// Returns true if \p I is a memory instruction with consecutive memory
1261   /// access that can be widened.
1262   bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
1263 
1264   /// Returns true if \p I is a memory instruction in an interleaved-group
1265   /// of memory accesses that can be vectorized with wide vector loads/stores
1266   /// and shuffles.
1267   bool interleavedAccessCanBeWidened(Instruction *I, unsigned VF = 1);
1268 
1269   /// Check if \p Instr belongs to any interleaved access group.
1270   bool isAccessInterleaved(Instruction *Instr) {
1271     return InterleaveInfo.isInterleaved(Instr);
1272   }
1273 
1274   /// Get the interleaved access group that \p Instr belongs to.
1275   const InterleaveGroup<Instruction> *
1276   getInterleavedAccessGroup(Instruction *Instr) {
1277     return InterleaveInfo.getInterleaveGroup(Instr);
1278   }
1279 
1280   /// Returns true if an interleaved group requires a scalar iteration
1281   /// to handle accesses with gaps, and there is nothing preventing us from
1282   /// creating a scalar epilogue.
1283   bool requiresScalarEpilogue() const {
1284     return isScalarEpilogueAllowed() && InterleaveInfo.requiresScalarEpilogue();
1285   }
1286 
1287   /// Returns true if a scalar epilogue is not allowed due to optsize or a
1288   /// loop hint annotation.
1289   bool isScalarEpilogueAllowed() const {
1290     return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1291   }
1292 
1293   /// Returns true if all loop blocks should be masked to fold the tail loop.
1294   bool foldTailByMasking() const { return FoldTailByMasking; }
1295 
1296   bool blockNeedsPredication(BasicBlock *BB) {
1297     return foldTailByMasking() || Legal->blockNeedsPredication(BB);
1298   }
1299 
1300   /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1301   /// with factor VF.  Return the cost of the instruction, including
1302   /// scalarization overhead if it's needed.
1303   unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF);
1304 
1305   /// Estimate cost of a call instruction CI if it were vectorized with factor
1306   /// VF. Return the cost of the instruction, including scalarization overhead
1307   /// if it's needed. The flag NeedToScalarize shows if the call needs to be
1308   /// scalarized, i.e. either a vector version isn't available or it is too
1309   /// expensive.
1310   unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);
1311 
1312 private:
1313   unsigned NumPredStores = 0;
1314 
1315   /// \return An upper bound for the vectorization factor, larger than zero.
1316   /// One is returned if vectorization should best be avoided due to cost.
1317   unsigned computeFeasibleMaxVF(unsigned ConstTripCount);
1318 
1319   /// The vectorization cost is a combination of the cost itself and a boolean
1320   /// indicating whether any of the contributing operations will actually
1321   /// operate on vector values after type legalization in the backend. If this
1322   /// latter value is false, then all operations will be scalarized (i.e. no
1323   /// vectorization has actually taken place).
1326   using VectorizationCostTy = std::pair<unsigned, bool>;
1327 
1328   /// Returns the expected execution cost. The unit of the cost does
1329   /// not matter because we use the 'cost' units to compare different
1330   /// vector widths. The cost that is returned is *not* normalized by
1331   /// the vectorization factor.
1332   VectorizationCostTy expectedCost(unsigned VF);
1333 
1334   /// Returns the execution time cost of an instruction for a given vector
1335   /// width. Vector width of one means scalar.
1336   VectorizationCostTy getInstructionCost(Instruction *I, unsigned VF);
1337 
1338   /// The cost-computation logic from getInstructionCost which provides
1339   /// the vector type as an output parameter.
1340   unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);
1341 
1342   /// Calculate vectorization cost of memory instruction \p I.
1343   unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);
1344 
1345   /// The cost computation for scalarized memory instruction.
1346   unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);
1347 
1348   /// The cost computation for interleaving group of memory instructions.
1349   unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);
1350 
1351   /// The cost computation for Gather/Scatter instruction.
1352   unsigned getGatherScatterCost(Instruction *I, unsigned VF);
1353 
1354   /// The cost computation for widening instruction \p I with consecutive
1355   /// memory access.
1356   unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);
1357 
1358   /// The cost calculation for Load/Store instruction \p I with uniform pointer -
1359   /// Load: scalar load + broadcast.
1360   /// Store: scalar store + (loop invariant value stored? 0 : extract of last
1361   /// element)
1362   unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
1363 
1364   /// Estimate the overhead of scalarizing an instruction. This is a
1365   /// convenience wrapper for the type-based getScalarizationOverhead API.
1366   unsigned getScalarizationOverhead(Instruction *I, unsigned VF);
1367 
1368   /// Returns whether the instruction is a load or store and will be emitted
1369   /// as a vector operation.
1370   bool isConsecutiveLoadOrStore(Instruction *I);
1371 
1372   /// Returns true if an artificially high cost for emulated masked memrefs
1373   /// should be used.
1374   bool useEmulatedMaskMemRefHack(Instruction *I);
1375 
1376   /// Map of scalar integer values to the smallest bitwidth they can be legally
1377   /// represented as. The vector equivalents of these values should be truncated
1378   /// to this type.
1379   MapVector<Instruction *, uint64_t> MinBWs;
1380 
1381   /// A type representing the costs for instructions if they were to be
1382   /// scalarized rather than vectorized. The entries are Instruction-Cost
1383   /// pairs.
1384   using ScalarCostsTy = DenseMap<Instruction *, unsigned>;
1385 
1386   /// A set containing all BasicBlocks that are known to be present after
1387   /// vectorization as predicated blocks.
1388   SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
1389 
1390   /// Records whether it is allowed to have the original scalar loop execute at
1391   /// least once. This may be needed as a fallback loop in case runtime
1392   /// aliasing/dependence checks fail, or to handle the tail/remainder
1393   /// iterations when the trip count is unknown or doesn't divide by the VF,
1394   /// or as a peel-loop to handle gaps in interleave-groups.
1395   /// Under optsize and when the trip count is very small we don't allow any
1396   /// iterations to execute in the scalar loop.
1397   ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
1398 
1399   /// All blocks of the loop are to be masked to fold the tail of scalar iterations.
1400   bool FoldTailByMasking = false;
1401 
1402   /// A map holding scalar costs for different vectorization factors. The
1403   /// presence of a cost for an instruction in the mapping indicates that the
1404   /// instruction will be scalarized when vectorizing with the associated
1405   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
1406   DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;
1407 
1408   /// Holds the instructions known to be uniform after vectorization.
1409   /// The data is collected per VF.
1410   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms;
1411 
1412   /// Holds the instructions known to be scalar after vectorization.
1413   /// The data is collected per VF.
1414   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars;
1415 
1416   /// Holds the instructions (address computations) that are forced to be
1417   /// scalarized.
1418   DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> ForcedScalars;
1419 
1420   /// Returns the expected difference in cost from scalarizing the expression
1421   /// feeding a predicated instruction \p PredInst. The instructions to
1422   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
1423   /// non-negative return value implies the expression will be scalarized.
1424   /// Currently, only single-use chains are considered for scalarization.
1425   int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
1426                               unsigned VF);
1427 
1428   /// Collect the instructions that are uniform after vectorization. An
1429   /// instruction is uniform if we represent it with a single scalar value in
1430   /// the vectorized loop corresponding to each vector iteration. Examples of
1431   /// uniform instructions include pointer operands of consecutive or
1432   /// interleaved memory accesses. Note that although uniformity implies an
1433   /// instruction will be scalar, the reverse is not true. In general, a
1434   /// scalarized instruction will be represented by VF scalar values in the
1435   /// vectorized loop, each corresponding to an iteration of the original
1436   /// scalar loop.
1437   void collectLoopUniforms(unsigned VF);
1438 
1439   /// Collect the instructions that are scalar after vectorization. An
1440   /// instruction is scalar if it is known to be uniform or will be scalarized
1441   /// during vectorization. Non-uniform scalarized instructions will be
1442   /// represented by VF values in the vectorized loop, each corresponding to an
1443   /// iteration of the original scalar loop.
1444   void collectLoopScalars(unsigned VF);
1445 
1446   /// Keeps cost model vectorization decision and cost for instructions.
1447   /// Right now it is used for memory instructions only.
1448   using DecisionList = DenseMap<std::pair<Instruction *, unsigned>,
1449                                 std::pair<InstWidening, unsigned>>;
1450 
1451   DecisionList WideningDecisions;
1452 
1453   /// Returns true if \p V is expected to be vectorized and it needs to be
1454   /// extracted.
1455   bool needsExtract(Value *V, unsigned VF) const {
1456     Instruction *I = dyn_cast<Instruction>(V);
1457     if (VF == 1 || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I))
1458       return false;
1459 
1460     // Assume we can vectorize V (and hence we need extraction) if the
1461     // scalars are not computed yet. This can happen, because it is called
1462     // via getScalarizationOverhead from setCostBasedWideningDecision, before
1463     // the scalars are collected. That should be a safe assumption in most
1464     // cases, because we check if the operands have vectorizable types
1465     // beforehand in LoopVectorizationLegality.
1466     return Scalars.find(VF) == Scalars.end() ||
1467            !isScalarAfterVectorization(I, VF);
1468   };
1469 
1470   /// Returns a range containing only operands needing to be extracted.
1471   SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1472                                                    unsigned VF) {
1473     return SmallVector<Value *, 4>(make_filter_range(
1474         Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1475   }
1476 
1477 public:
1478   /// The loop that we evaluate.
1479   Loop *TheLoop;
1480 
1481   /// Predicated scalar evolution analysis.
1482   PredicatedScalarEvolution &PSE;
1483 
1484   /// Loop Info analysis.
1485   LoopInfo *LI;
1486 
1487   /// Vectorization legality.
1488   LoopVectorizationLegality *Legal;
1489 
1490   /// Vector target information.
1491   const TargetTransformInfo &TTI;
1492 
1493   /// Target Library Info.
1494   const TargetLibraryInfo *TLI;
1495 
1496   /// Demanded bits analysis.
1497   DemandedBits *DB;
1498 
1499   /// Assumption cache.
1500   AssumptionCache *AC;
1501 
1502   /// Interface to emit optimization remarks.
1503   OptimizationRemarkEmitter *ORE;
1504 
1505   const Function *TheFunction;
1506 
1507   /// Loop Vectorize Hint.
1508   const LoopVectorizeHints *Hints;
1509 
1510   /// The interleave access information contains groups of interleaved accesses
1511   /// with the same stride and close to each other.
1512   InterleavedAccessInfo &InterleaveInfo;
1513 
1514   /// Values to ignore in the cost model.
1515   SmallPtrSet<const Value *, 16> ValuesToIgnore;
1516 
1517   /// Values to ignore in the cost model when VF > 1.
1518   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
1519 };
1520 
1521 } // end namespace llvm
1522 
1523 // Return true if \p OuterLp is an outer loop annotated with hints for explicit
1524 // vectorization. The loop needs to be annotated with #pragma omp simd
1525 // simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
1526 // vector length information is not provided, vectorization is not considered
1527 // explicit. Interleave hints are not allowed either. These limitations will be
1528 // relaxed in the future.
1529 // Please note that we are currently forced to abuse the pragma 'clang
1530 // vectorize' semantics. This pragma provides *auto-vectorization hints*
1531 // (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
1532 // provides *explicit vectorization hints* (LV can bypass legal checks and
1533 // assume that vectorization is legal). However, both hints are implemented
1534 // using the same metadata (llvm.loop.vectorize, processed by
1535 // LoopVectorizeHints). This will be fixed in the future when the native IR
1536 // representation for pragma 'omp simd' is introduced.
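// For illustration only (not from the original source), an outer loop hinted
// for explicit vectorization might be written as:
//   #pragma clang loop vectorize(enable) vectorize_width(4)
//   for (int i = 0; i < N; ++i)      // outer loop handled by the VPlan path
//     for (int j = 0; j < M; ++j)
//       A[i][j] += B[i][j];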
1537 static bool isExplicitVecOuterLoop(Loop *OuterLp,
1538                                    OptimizationRemarkEmitter *ORE) {
1539   assert(!OuterLp->empty() && "This is not an outer loop");
1540   LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
1541 
1542   // Only outer loops with an explicit vectorization hint are supported.
1543   // Unannotated outer loops are ignored.
1544   if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
1545     return false;
1546 
1547   Function *Fn = OuterLp->getHeader()->getParent();
1548   if (!Hints.allowVectorization(Fn, OuterLp,
1549                                 true /*VectorizeOnlyWhenForced*/)) {
1550     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
1551     return false;
1552   }
1553 
1554   if (Hints.getInterleave() > 1) {
1555     // TODO: Interleave support is future work.
1556     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
1557                          "outer loops.\n");
1558     Hints.emitRemarkWithHints();
1559     return false;
1560   }
1561 
1562   return true;
1563 }
1564 
1565 static void collectSupportedLoops(Loop &L, LoopInfo *LI,
1566                                   OptimizationRemarkEmitter *ORE,
1567                                   SmallVectorImpl<Loop *> &V) {
1568   // Collect inner loops and outer loops without irreducible control flow. For
1569   // now, only collect outer loops that have explicit vectorization hints. If we
1570   // are stress testing the VPlan H-CFG construction, we collect the outermost
1571   // loop of every loop nest.
1572   if (L.empty() || VPlanBuildStressTest ||
1573       (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
1574     LoopBlocksRPO RPOT(&L);
1575     RPOT.perform(LI);
1576     if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
1577       V.push_back(&L);
1578       // TODO: Collect inner loops inside marked outer loops in case
1579       // vectorization fails for the outer loop. Do not invoke
1580       // 'containsIrreducibleCFG' again for inner loops when the outer loop is
1581       // already known to be reducible. We can use an inherited attribute for
1582       // that.
1583       return;
1584     }
1585   }
1586   for (Loop *InnerL : L)
1587     collectSupportedLoops(*InnerL, LI, ORE, V);
1588 }
1589 
1590 namespace {
1591 
1592 /// The LoopVectorize Pass.
1593 struct LoopVectorize : public FunctionPass {
1594   /// Pass identification, replacement for typeid
1595   static char ID;
1596 
1597   LoopVectorizePass Impl;
1598 
1599   explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
1600                          bool VectorizeOnlyWhenForced = false)
1601       : FunctionPass(ID) {
1602     Impl.InterleaveOnlyWhenForced = InterleaveOnlyWhenForced;
1603     Impl.VectorizeOnlyWhenForced = VectorizeOnlyWhenForced;
1604     initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
1605   }
1606 
1607   bool runOnFunction(Function &F) override {
1608     if (skipFunction(F))
1609       return false;
1610 
1611     auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
1612     auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
1613     auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
1614     auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
1615     auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
1616     auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
1617     auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
1618     auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
1619     auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
1620     auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
1621     auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
1622     auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
1623     auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
1624 
1625     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
1626         [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
1627 
1628     return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
1629                         GetLAA, *ORE, PSI);
1630   }
1631 
1632   void getAnalysisUsage(AnalysisUsage &AU) const override {
1633     AU.addRequired<AssumptionCacheTracker>();
1634     AU.addRequired<BlockFrequencyInfoWrapperPass>();
1635     AU.addRequired<DominatorTreeWrapperPass>();
1636     AU.addRequired<LoopInfoWrapperPass>();
1637     AU.addRequired<ScalarEvolutionWrapperPass>();
1638     AU.addRequired<TargetTransformInfoWrapperPass>();
1639     AU.addRequired<AAResultsWrapperPass>();
1640     AU.addRequired<LoopAccessLegacyAnalysis>();
1641     AU.addRequired<DemandedBitsWrapperPass>();
1642     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
1643 
1644     // We currently do not preserve loopinfo/dominator analyses with outer loop
1645     // vectorization. Until this is addressed, mark these analyses as preserved
1646     // only for non-VPlan-native path.
1647     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
1648     if (!EnableVPlanNativePath) {
1649       AU.addPreserved<LoopInfoWrapperPass>();
1650       AU.addPreserved<DominatorTreeWrapperPass>();
1651     }
1652 
1653     AU.addPreserved<BasicAAWrapperPass>();
1654     AU.addPreserved<GlobalsAAWrapperPass>();
1655     AU.addRequired<ProfileSummaryInfoWrapperPass>();
1656   }
1657 };
1658 
1659 } // end anonymous namespace
1660 
1661 //===----------------------------------------------------------------------===//
1662 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
1663 // LoopVectorizationCostModel and LoopVectorizationPlanner.
1664 //===----------------------------------------------------------------------===//
1665 
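// Illustrative sketch (not from the original source): for VF = 4, broadcasting
// a loop-invariant i32 %x is expected to produce IR along the lines of:
//   %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %x, i32 0
//   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert,
//                                    <4 x i32> undef, <4 x i32> zeroinitializer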
1666 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
1667   // We need to place the broadcast of invariant variables outside the loop,
1668   // but only if it's proven safe to do so. Otherwise, the broadcast will be
1669   // inside the vector loop body.
1670   Instruction *Instr = dyn_cast<Instruction>(V);
1671   bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
1672                      (!Instr ||
1673                       DT->dominates(Instr->getParent(), LoopVectorPreHeader));
1674   // Place the code for broadcasting invariant variables in the new preheader.
1675   IRBuilder<>::InsertPointGuard Guard(Builder);
1676   if (SafeToHoist)
1677     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1678 
1679   // Broadcast the scalar into all locations in the vector.
1680   Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
1681 
1682   return Shuf;
1683 }
1684 
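// Illustrative sketch (not from the original source): for an i32 induction
// starting at 0 with step 1, VF = 4 and UF = 1, the vector IV is roughly:
//   vector.body:
//     %vec.ind = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ],
//                              [ %vec.ind.next, %vector.body ]
//     %vec.ind.next = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>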
1685 void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
1686     const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
1687   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1688          "Expected either an induction phi-node or a truncate of it!");
1689   Value *Start = II.getStartValue();
1690 
1691   // Construct the initial value of the vector IV in the vector loop preheader
1692   auto CurrIP = Builder.saveIP();
1693   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
1694   if (isa<TruncInst>(EntryVal)) {
1695     assert(Start->getType()->isIntegerTy() &&
1696            "Truncation requires an integer type");
1697     auto *TruncType = cast<IntegerType>(EntryVal->getType());
1698     Step = Builder.CreateTrunc(Step, TruncType);
1699     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
1700   }
1701   Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
1702   Value *SteppedStart =
1703       getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
1704 
1705   // We create vector phi nodes for both integer and floating-point induction
1706   // variables. Here, we determine the kind of arithmetic we will perform.
1707   Instruction::BinaryOps AddOp;
1708   Instruction::BinaryOps MulOp;
1709   if (Step->getType()->isIntegerTy()) {
1710     AddOp = Instruction::Add;
1711     MulOp = Instruction::Mul;
1712   } else {
1713     AddOp = II.getInductionOpcode();
1714     MulOp = Instruction::FMul;
1715   }
1716 
1717   // Multiply the vectorization factor by the step using integer or
1718   // floating-point arithmetic as appropriate.
1719   Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF);
1720   Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
1721 
1722   // Create a vector splat to use in the induction update.
1723   //
1724   // FIXME: If the step is non-constant, we create the vector splat with
1725   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
1726   //        handle a constant vector splat.
1727   Value *SplatVF = isa<Constant>(Mul)
1728                        ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
1729                        : Builder.CreateVectorSplat(VF, Mul);
1730   Builder.restoreIP(CurrIP);
1731 
1732   // We may need to add the step a number of times, depending on the unroll
1733   // factor. The last of those goes into the PHI.
1734   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
1735                                     &*LoopVectorBody->getFirstInsertionPt());
1736   VecInd->setDebugLoc(EntryVal->getDebugLoc());
1737   Instruction *LastInduction = VecInd;
1738   for (unsigned Part = 0; Part < UF; ++Part) {
1739     VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
1740 
1741     if (isa<TruncInst>(EntryVal))
1742       addMetadata(LastInduction, EntryVal);
1743     recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);
1744 
1745     LastInduction = cast<Instruction>(addFastMathFlag(
1746         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
1747     LastInduction->setDebugLoc(EntryVal->getDebugLoc());
1748   }
1749 
1750   // Move the last step to the end of the latch block. This ensures consistent
1751   // placement of all induction updates.
1752   auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
1753   auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
1754   auto *ICmp = cast<Instruction>(Br->getCondition());
1755   LastInduction->moveBefore(ICmp);
1756   LastInduction->setName("vec.ind.next");
1757 
1758   VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
1759   VecInd->addIncoming(LastInduction, LoopVectorLatch);
1760 }
1761 
1762 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
1763   return Cost->isScalarAfterVectorization(I, VF) ||
1764          Cost->isProfitableToScalarize(I, VF);
1765 }
1766 
1767 bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
1768   if (shouldScalarizeInstruction(IV))
1769     return true;
1770   auto isScalarInst = [&](User *U) -> bool {
1771     auto *I = cast<Instruction>(U);
1772     return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
1773   };
1774   return llvm::any_of(IV->users(), isScalarInst);
1775 }
1776 
1777 void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
1778     const InductionDescriptor &ID, const Instruction *EntryVal,
1779     Value *VectorLoopVal, unsigned Part, unsigned Lane) {
1780   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
1781          "Expected either an induction phi-node or a truncate of it!");
1782 
1783   // This induction variable is not the phi from the original loop but the
1784   // newly-created IV based on the proof that the casted Phi is equal to the
1785   // uncasted Phi in the vectorized loop (possibly under a runtime guard). It
1786   // re-uses the same InductionDescriptor that the original IV uses, but we
1787   // don't have to do any recording in this case - that is done when the
1788   // original IV is processed.
1789   if (isa<TruncInst>(EntryVal))
1790     return;
1791 
1792   const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
1793   if (Casts.empty())
1794     return;
1795   // Only the first Cast instruction in the Casts vector is of interest.
1796   // The rest of the Casts (if they exist) have no uses outside the
1797   // induction update chain itself.
1798   Instruction *CastInst = *Casts.begin();
1799   if (Lane < UINT_MAX)
1800     VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
1801   else
1802     VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
1803 }
1804 
1805 void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
1806   assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
1807          "Primary induction variable must have an integer type");
1808 
1809   auto II = Legal->getInductionVars()->find(IV);
1810   assert(II != Legal->getInductionVars()->end() && "IV is not an induction");
1811 
1812   auto ID = II->second;
1813   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
1814 
1815   // The scalar value to broadcast. This will be derived from the canonical
1816   // induction variable.
1817   Value *ScalarIV = nullptr;
1818 
1819   // The value from the original loop to which we are mapping the new induction
1820   // variable.
1821   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
1822 
1823   // True if we have vectorized the induction variable.
1824   auto VectorizedIV = false;
1825 
1826   // Determine if we want a scalar version of the induction variable. This is
1827   // true if the induction variable itself is not widened, or if it has at
1828   // least one user in the loop that is not widened.
1829   auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal);
1830 
1831   // Generate code for the induction step. Note that induction steps are
1832   // required to be loop-invariant.
1833   assert(PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) &&
1834          "Induction step should be loop invariant");
1835   auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
1836   Value *Step = nullptr;
1837   if (PSE.getSE()->isSCEVable(IV->getType())) {
1838     SCEVExpander Exp(*PSE.getSE(), DL, "induction");
1839     Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(),
1840                              LoopVectorPreHeader->getTerminator());
1841   } else {
1842     Step = cast<SCEVUnknown>(ID.getStep())->getValue();
1843   }
1844 
1845   // Try to create a new independent vector induction variable. If we can't
1846   // create the phi node, we will splat the scalar induction variable in each
1847   // loop iteration.
1848   if (VF > 1 && !shouldScalarizeInstruction(EntryVal)) {
1849     createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
1850     VectorizedIV = true;
1851   }
1852 
1853   // If we haven't yet vectorized the induction variable, or if we will create
1854   // a scalar one, we need to define the scalar induction variable and step
1855   // values. If we were given a truncation type, truncate the canonical
1856   // induction variable and step. Otherwise, derive these values from the
1857   // induction descriptor.
1858   if (!VectorizedIV || NeedsScalarIV) {
1859     ScalarIV = Induction;
1860     if (IV != OldInduction) {
1861       ScalarIV = IV->getType()->isIntegerTy()
1862                      ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
1863                      : Builder.CreateCast(Instruction::SIToFP, Induction,
1864                                           IV->getType());
1865       ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
1866       ScalarIV->setName("offset.idx");
1867     }
1868     if (Trunc) {
1869       auto *TruncType = cast<IntegerType>(Trunc->getType());
1870       assert(Step->getType()->isIntegerTy() &&
1871              "Truncation requires an integer step");
1872       ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
1873       Step = Builder.CreateTrunc(Step, TruncType);
1874     }
1875   }
1876 
1877   // If we haven't yet vectorized the induction variable, splat the scalar
1878   // induction variable, and build the necessary step vectors.
1879   // TODO: Don't do it unless the vectorized IV is really required.
1880   if (!VectorizedIV) {
1881     Value *Broadcasted = getBroadcastInstrs(ScalarIV);
1882     for (unsigned Part = 0; Part < UF; ++Part) {
1883       Value *EntryPart =
1884           getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());
1885       VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
1886       if (Trunc)
1887         addMetadata(EntryPart, Trunc);
1888       recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
1889     }
1890   }
1891 
1892   // If an induction variable is only used for counting loop iterations or
1893   // calculating addresses, it doesn't need to be widened. Create scalar steps
1894   // that can be used by instructions we will later scalarize. Note that the
1895   // addition of the scalar steps will not increase the number of instructions
1896   // in the loop in the common case prior to InstCombine. We will be trading
1897   // one vector extract for each scalar step.
1898   if (NeedsScalarIV)
1899     buildScalarSteps(ScalarIV, Step, EntryVal, ID);
1900 }
1901 
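// Illustrative example (not from the original source): for Val = <x, x, x, x>,
// StartIdx = 0 and an integer Step s, the result is roughly
//   <x + 0*s, x + 1*s, x + 2*s, x + 3*s>.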
1902 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
1903                                           Instruction::BinaryOps BinOp) {
1904   // Create and check the types.
1905   assert(Val->getType()->isVectorTy() && "Must be a vector");
1906   int VLen = Val->getType()->getVectorNumElements();
1907 
1908   Type *STy = Val->getType()->getScalarType();
1909   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
1910          "Induction Step must be an integer or FP");
1911   assert(Step->getType() == STy && "Step has wrong type");
1912 
1913   SmallVector<Constant *, 8> Indices;
1914 
1915   if (STy->isIntegerTy()) {
1916     // Create a vector of consecutive numbers from zero to VF.
1917     for (int i = 0; i < VLen; ++i)
1918       Indices.push_back(ConstantInt::get(STy, StartIdx + i));
1919 
1920     // Add the consecutive indices to the vector value.
1921     Constant *Cv = ConstantVector::get(Indices);
1922     assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
1923     Step = Builder.CreateVectorSplat(VLen, Step);
1924     assert(Step->getType() == Val->getType() && "Invalid step vec");
1925     // FIXME: The newly created binary instructions should contain nsw/nuw flags,
1926     // which can be found from the original scalar operations.
1927     Step = Builder.CreateMul(Cv, Step);
1928     return Builder.CreateAdd(Val, Step, "induction");
1929   }
1930 
1931   // Floating point induction.
1932   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
1933          "Binary Opcode should be specified for FP induction");
1934   // Create a vector of consecutive numbers from zero to VF.
1935   for (int i = 0; i < VLen; ++i)
1936     Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
1937 
1938   // Add the consecutive indices to the vector value.
1939   Constant *Cv = ConstantVector::get(Indices);
1940 
1941   Step = Builder.CreateVectorSplat(VLen, Step);
1942 
1943   // Floating point operations had to be 'fast' to enable the induction.
1944   FastMathFlags Flags;
1945   Flags.setFast();
1946 
1947   Value *MulOp = Builder.CreateFMul(Cv, Step);
1948   if (isa<Instruction>(MulOp))
1949     // Have to check: MulOp may be a constant.
1950     cast<Instruction>(MulOp)->setFastMathFlags(Flags);
1951 
1952   Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
1953   if (isa<Instruction>(BOp))
1954     cast<Instruction>(BOp)->setFastMathFlags(Flags);
1955   return BOp;
1956 }
1957 
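// Illustrative example (not from the original source): for VF = 4, UF = 2 and
// an integer step S, the scalar step for unroll part P and lane L is
//   ScalarIV + (P * 4 + L) * S
// with only lane 0 generated per part when EntryVal is uniform after
// vectorization.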
1958 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
1959                                            Instruction *EntryVal,
1960                                            const InductionDescriptor &ID) {
1961   // We shouldn't have to build scalar steps if we aren't vectorizing.
1962   assert(VF > 1 && "VF should be greater than one");
1963 
1964   // Get the value type and ensure it and the step have the same integer type.
1965   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
1966   assert(ScalarIVTy == Step->getType() &&
1967          "Val and Step should have the same type");
1968 
1969   // We build scalar steps for both integer and floating-point induction
1970   // variables. Here, we determine the kind of arithmetic we will perform.
1971   Instruction::BinaryOps AddOp;
1972   Instruction::BinaryOps MulOp;
1973   if (ScalarIVTy->isIntegerTy()) {
1974     AddOp = Instruction::Add;
1975     MulOp = Instruction::Mul;
1976   } else {
1977     AddOp = ID.getInductionOpcode();
1978     MulOp = Instruction::FMul;
1979   }
1980 
1981   // Determine the number of scalars we need to generate for each unroll
1982   // iteration. If EntryVal is uniform, we only need to generate the first
1983   // lane. Otherwise, we generate all VF values.
1984   unsigned Lanes =
1985       Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1
1986                                                                          : VF;
1987   // Compute the scalar steps and save the results in VectorLoopValueMap.
1988   for (unsigned Part = 0; Part < UF; ++Part) {
1989     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
1990       auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane);
1991       auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
1992       auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
1993       VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
1994       recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
1995     }
1996   }
1997 }
1998 
1999 Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
2000   assert(V != Induction && "The new induction variable should not be used.");
2001   assert(!V->getType()->isVectorTy() && "Can't widen a vector");
2002   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2003 
2004   // If we have a stride that is replaced by one, do it here. Defer this for
2005   // the VPlan-native path until we start running Legal checks in that path.
2006   if (!EnableVPlanNativePath && Legal->hasStride(V))
2007     V = ConstantInt::get(V->getType(), 1);
2008 
2009   // If we have a vector mapped to this value, return it.
2010   if (VectorLoopValueMap.hasVectorValue(V, Part))
2011     return VectorLoopValueMap.getVectorValue(V, Part);
2012 
2013   // If the value has not been vectorized, check if it has been scalarized
2014   // instead. If it has been scalarized, and we actually need the value in
2015   // vector form, we will construct the vector values on demand.
2016   if (VectorLoopValueMap.hasAnyScalarValue(V)) {
2017     Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});
2018 
2019     // If we've scalarized a value, that value should be an instruction.
2020     auto *I = cast<Instruction>(V);
2021 
2022     // If we aren't vectorizing, we can just copy the scalar map values over to
2023     // the vector map.
2024     if (VF == 1) {
2025       VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
2026       return ScalarValue;
2027     }
2028 
2029     // Get the last scalar instruction we generated for V and Part. If the value
2030     // is known to be uniform after vectorization, this corresponds to lane zero
2031     // of the Part unroll iteration. Otherwise, the last instruction is the one
2032     // we created for the last vector lane of the Part unroll iteration.
2033     unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1;
2034     auto *LastInst = cast<Instruction>(
2035         VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
2036 
2037     // Set the insert point after the last scalarized instruction. This ensures
2038     // the insertelement sequence will directly follow the scalar definitions.
2039     auto OldIP = Builder.saveIP();
2040     auto NewIP = std::next(BasicBlock::iterator(LastInst));
2041     Builder.SetInsertPoint(&*NewIP);
2042 
2043     // However, if we are vectorizing, we need to construct the vector values.
2044     // If the value is known to be uniform after vectorization, we can just
2045     // broadcast the scalar value corresponding to lane zero for each unroll
2046     // iteration. Otherwise, we construct the vector values using insertelement
2047     // instructions. Since the resulting vectors are stored in
2048     // VectorLoopValueMap, we will only generate the insertelements once.
2049     Value *VectorValue = nullptr;
2050     if (Cost->isUniformAfterVectorization(I, VF)) {
2051       VectorValue = getBroadcastInstrs(ScalarValue);
2052       VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
2053     } else {
2054       // Initialize packing with insertelements to start from undef.
2055       Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
2056       VectorLoopValueMap.setVectorValue(V, Part, Undef);
2057       for (unsigned Lane = 0; Lane < VF; ++Lane)
2058         packScalarIntoVectorValue(V, {Part, Lane});
2059       VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
2060     }
2061     Builder.restoreIP(OldIP);
2062     return VectorValue;
2063   }
2064 
2065   // If this scalar is unknown, assume that it is a constant or that it is
2066   // loop invariant. Broadcast V and save the value for future uses.
2067   Value *B = getBroadcastInstrs(V);
2068   VectorLoopValueMap.setVectorValue(V, Part, B);
2069   return B;
2070 }
2071 
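// Illustrative example (not from the original source): if %v was widened and
// lane 1 of unroll part 0 is requested, this is expected to emit roughly
//   %s = extractelement <4 x i32> %v.part0, i32 1
// where %v.part0 is a hypothetical name for the part-0 vector value.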
2072 Value *
2073 InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
2074                                             const VPIteration &Instance) {
2075   // If the value is not an instruction contained in the loop, it should
2076   // already be scalar.
2077   if (OrigLoop->isLoopInvariant(V))
2078     return V;
2079 
2080   assert(Instance.Lane > 0
2081              ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
2082              : true && "Uniform values only have lane zero");
2083 
2084   // If the value from the original loop has not been vectorized, it is
2085   // represented by UF x VF scalar values in the new loop. Return the requested
2086   // scalar value.
2087   if (VectorLoopValueMap.hasScalarValue(V, Instance))
2088     return VectorLoopValueMap.getScalarValue(V, Instance);
2089 
2090   // If the value has not been scalarized, get its entry in VectorLoopValueMap
2091   // for the given unroll part. If this entry is not a vector type (i.e., the
2092   // vectorization factor is one), there is no need to generate an
2093   // extractelement instruction.
2094   auto *U = getOrCreateVectorValue(V, Instance.Part);
2095   if (!U->getType()->isVectorTy()) {
2096     assert(VF == 1 && "Value not scalarized has non-vector type");
2097     return U;
2098   }
2099 
2100   // Otherwise, the value from the original loop has been vectorized and is
2101   // represented by UF vector values. Extract and return the requested scalar
2102   // value from the appropriate vector lane.
2103   return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
2104 }
2105 
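// Illustrative example (not from the original source, hypothetical value
// names): packing lane 2 of part 0 is expected to emit roughly
//   %packed = insertelement <4 x i32> %partial, i32 %scalar, i32 2
// and the result replaces the cached vector value for that part.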
2106 void InnerLoopVectorizer::packScalarIntoVectorValue(
2107     Value *V, const VPIteration &Instance) {
2108   assert(V != Induction && "The new induction variable should not be used.");
2109   assert(!V->getType()->isVectorTy() && "Can't pack a vector");
2110   assert(!V->getType()->isVoidTy() && "Type does not produce a value");
2111 
2112   Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
2113   Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
2114   VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
2115                                             Builder.getInt32(Instance.Lane));
2116   VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
2117 }
2118 
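// Illustrative example (not from the original source): for VF = 4,
// reverseVector turns <a, b, c, d> into <d, c, b, a> via a shuffle with mask
// <3, 2, 1, 0>.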
2119 Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
2120   assert(Vec->getType()->isVectorTy() && "Invalid type");
2121   SmallVector<Constant *, 8> ShuffleMask;
2122   for (unsigned i = 0; i < VF; ++i)
2123     ShuffleMask.push_back(Builder.getInt32(VF - i - 1));
2124 
2125   return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
2126                                      ConstantVector::get(ShuffleMask),
2127                                      "reverse");
2128 }
2129 
2130 // Return whether we allow using masked interleave-groups (for dealing with
2131 // strided loads/stores that reside in predicated blocks, or for dealing
2132 // with gaps).
2133 static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2134   // If an override option has been passed in for interleaved accesses, use it.
2135   if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2136     return EnableMaskedInterleavedMemAccesses;
2137 
2138   return TTI.enableMaskedInterleavedAccessVectorization();
2139 }
2140 
2141 // Try to vectorize the interleave group that \p Instr belongs to.
2142 //
2143 // E.g. Translate following interleaved load group (factor = 3):
2144 //   for (i = 0; i < N; i+=3) {
2145 //     R = Pic[i];             // Member of index 0
2146 //     G = Pic[i+1];           // Member of index 1
2147 //     B = Pic[i+2];           // Member of index 2
2148 //     ... // do something to R, G, B
2149 //   }
2150 // To:
2151 //   %wide.vec = load <12 x i32>                       ; Read 4 tuples of R,G,B
2152 //   %R.vec = shuffle %wide.vec, undef, <0, 3, 6, 9>   ; R elements
2153 //   %G.vec = shuffle %wide.vec, undef, <1, 4, 7, 10>  ; G elements
2154 //   %B.vec = shuffle %wide.vec, undef, <2, 5, 8, 11>  ; B elements
2155 //
2156 // Or translate following interleaved store group (factor = 3):
2157 //   for (i = 0; i < N; i+=3) {
2158 //     ... do something to R, G, B
2159 //     Pic[i]   = R;           // Member of index 0
2160 //     Pic[i+1] = G;           // Member of index 1
2161 //     Pic[i+2] = B;           // Member of index 2
2162 //   }
2163 // To:
2164 //   %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
2165 //   %B_U.vec = shuffle %B.vec, undef, <0, 1, 2, 3, u, u, u, u>
2166 //   %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
2167 //        <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>    ; Interleave R,G,B elements
2168 //   store <12 x i32> %interleaved.vec              ; Write 4 tuples of R,G,B
2169 void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
2170                                                    VPTransformState &State,
2171                                                    VPValue *Addr,
2172                                                    VPValue *BlockInMask) {
2173   const InterleaveGroup<Instruction> *Group =
2174       Cost->getInterleavedAccessGroup(Instr);
2175   assert(Group && "Fail to get an interleaved access group.");
2176 
2177   // Skip if current instruction is not the insert position.
2178   if (Instr != Group->getInsertPos())
2179     return;
2180 
2181   const DataLayout &DL = Instr->getModule()->getDataLayout();
2182 
2183   // Prepare for the vector type of the interleaved load/store.
2184   Type *ScalarTy = getMemInstValueType(Instr);
2185   unsigned InterleaveFactor = Group->getFactor();
2186   Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);
2187 
2188   // Prepare for the new pointers.
2189   SmallVector<Value *, 2> AddrParts;
2190   unsigned Index = Group->getIndex(Instr);
2191 
2192   // TODO: extend the masked interleaved-group support to reversed access.
2193   assert((!BlockInMask || !Group->isReverse()) &&
2194          "Reversed masked interleave-group not supported.");
2195 
2196   // If the group is reverse, adjust the index to refer to the last vector lane
2197   // instead of the first. We adjust the index from the first vector lane,
2198   // rather than directly getting the pointer for lane VF - 1, because the
2199   // pointer operand of the interleaved access is supposed to be uniform. For
2200   // uniform instructions, we're only required to generate a value for the
2201   // first vector lane in each unroll iteration.
2202   if (Group->isReverse())
2203     Index += (VF - 1) * Group->getFactor();
2204 
2205   for (unsigned Part = 0; Part < UF; Part++) {
2206     Value *AddrPart = State.get(Addr, {Part, 0});
2207     setDebugLocFromInst(Builder, AddrPart);
2208 
2209     // Note that the current instruction could be at any member index. We need
2210     // to adjust the address to that of the member at index 0.
2211     //
2212     // E.g.  a = A[i+1];     // Member of index 1 (Current instruction)
2213     //       b = A[i];       // Member of index 0
2214     // The current pointer points to A[i+1]; adjust it to A[i].
2215     //
2216     // E.g.  A[i+1] = a;     // Member of index 1
2217     //       A[i]   = b;     // Member of index 0
2218     //       A[i+2] = c;     // Member of index 2 (Current instruction)
2219     // The current pointer points to A[i+2]; adjust it to A[i].
2220 
2221     bool InBounds = false;
2222     if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
2223       InBounds = gep->isInBounds();
2224     AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
2225     cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
2226 
2227     // Cast to the vector pointer type.
2228     unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
2229     Type *PtrTy = VecTy->getPointerTo(AddressSpace);
2230     AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
2231   }
2232 
2233   setDebugLocFromInst(Builder, Instr);
2234   Value *UndefVec = UndefValue::get(VecTy);
2235 
2236   Value *MaskForGaps = nullptr;
2237   if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
2238     MaskForGaps = createBitMaskForGaps(Builder, VF, *Group);
2239     assert(MaskForGaps && "Mask for Gaps is required but it is null");
2240   }
2241 
2242   // Vectorize the interleaved load group.
2243   if (isa<LoadInst>(Instr)) {
2244     // For each unroll part, create a wide load for the group.
2245     SmallVector<Value *, 2> NewLoads;
2246     for (unsigned Part = 0; Part < UF; Part++) {
2247       Instruction *NewLoad;
2248       if (BlockInMask || MaskForGaps) {
2249         assert(useMaskedInterleavedAccesses(*TTI) &&
2250                "masked interleaved groups are not allowed.");
2251         Value *GroupMask = MaskForGaps;
2252         if (BlockInMask) {
2253           Value *BlockInMaskPart = State.get(BlockInMask, Part);
2254           auto *Undefs = UndefValue::get(BlockInMaskPart->getType());
2255           auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
2256           Value *ShuffledMask = Builder.CreateShuffleVector(
2257               BlockInMaskPart, Undefs, RepMask, "interleaved.mask");
2258           GroupMask = MaskForGaps
2259                           ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
2260                                                 MaskForGaps)
2261                           : ShuffledMask;
2262         }
2263         NewLoad =
2264             Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlignment(),
2265                                      GroupMask, UndefVec, "wide.masked.vec");
2266       }
2267       else
2268         NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
2269                                             Group->getAlignment(), "wide.vec");
2270       Group->addMetadata(NewLoad);
2271       NewLoads.push_back(NewLoad);
2272     }
2273 
2274     // For each member in the group, shuffle out the appropriate data from the
2275     // wide loads.
2276     for (unsigned I = 0; I < InterleaveFactor; ++I) {
2277       Instruction *Member = Group->getMember(I);
2278 
2279       // Skip the gaps in the group.
2280       if (!Member)
2281         continue;
2282 
2283       Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF);
2284       for (unsigned Part = 0; Part < UF; Part++) {
2285         Value *StridedVec = Builder.CreateShuffleVector(
2286             NewLoads[Part], UndefVec, StrideMask, "strided.vec");
2287 
2288         // If this member has a different type, cast the result to that type.
2289         if (Member->getType() != ScalarTy) {
2290           VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
2291           StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
2292         }
2293 
2294         if (Group->isReverse())
2295           StridedVec = reverseVector(StridedVec);
2296 
2297         VectorLoopValueMap.setVectorValue(Member, Part, StridedVec);
2298       }
2299     }
2300     return;
2301   }
2302 
2303   // The sub vector type for the current instruction.
2304   VectorType *SubVT = VectorType::get(ScalarTy, VF);
2305 
2306   // Vectorize the interleaved store group.
2307   for (unsigned Part = 0; Part < UF; Part++) {
2308     // Collect the stored vector from each member.
2309     SmallVector<Value *, 4> StoredVecs;
2310     for (unsigned i = 0; i < InterleaveFactor; i++) {
2311       // An interleaved store group doesn't allow gaps, so each index has a member.
2312       Instruction *Member = Group->getMember(i);
2313       assert(Member && "Failed to get a member from an interleaved store group");
2314 
2315       Value *StoredVec = getOrCreateVectorValue(
2316           cast<StoreInst>(Member)->getValueOperand(), Part);
2317       if (Group->isReverse())
2318         StoredVec = reverseVector(StoredVec);
2319 
2320       // If this member has a different type, cast it to a unified type.
2321 
2322       if (StoredVec->getType() != SubVT)
2323         StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
2324 
2325       StoredVecs.push_back(StoredVec);
2326     }
2327 
2328     // Concatenate all vectors into a wide vector.
2329     Value *WideVec = concatenateVectors(Builder, StoredVecs);
2330 
2331     // Interleave the elements in the wide vector.
2332     Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor);
2333     Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask,
2334                                               "interleaved.vec");
2335 
2336     Instruction *NewStoreInstr;
2337     if (BlockInMask) {
2338       Value *BlockInMaskPart = State.get(BlockInMask, Part);
2339       auto *Undefs = UndefValue::get(BlockInMaskPart->getType());
2340       auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
2341       Value *ShuffledMask = Builder.CreateShuffleVector(
2342           BlockInMaskPart, Undefs, RepMask, "interleaved.mask");
2343       NewStoreInstr = Builder.CreateMaskedStore(
2344           IVec, AddrParts[Part], Group->getAlignment(), ShuffledMask);
2345     }
2346     else
2347       NewStoreInstr = Builder.CreateAlignedStore(IVec, AddrParts[Part],
2348                                                  Group->getAlignment());
2349 
2350     Group->addMetadata(NewStoreInstr);
2351   }
2352 }
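// Note on the lowering performed above (an illustrative sketch only; VF = 4,
// i32 elements, and an interleave factor of 2 are assumed for the example).
// A load group such as
//
//   for (i = 0; i < n; i++) {
//     x = A[2*i];      // member 0
//     y = A[2*i + 1];  // member 1
//   }
//
// is emitted, per unroll part, as one wide load followed by one strided
// shuffle per member:
//
//   %wide.vec     = load <8 x i32>, <8 x i32>* %ptr
//   %strided.vec0 = shufflevector %wide.vec, undef, <0, 2, 4, 6>  ; member 0
//   %strided.vec1 = shufflevector %wide.vec, undef, <1, 3, 5, 7>  ; member 1
//
// A store group is handled as the mirror image: the member vectors are
// concatenated and shuffled with the interleave mask <0, 4, 1, 5, 2, 6, 3, 7>
// before a single wide store.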
2353 
2354 void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
2355                                                      VPTransformState &State,
2356                                                      VPValue *Addr,
2357                                                      VPValue *BlockInMask) {
2358   // Attempt to issue a wide load.
2359   LoadInst *LI = dyn_cast<LoadInst>(Instr);
2360   StoreInst *SI = dyn_cast<StoreInst>(Instr);
2361 
2362   assert((LI || SI) && "Invalid Load/Store instruction");
2363 
2364   LoopVectorizationCostModel::InstWidening Decision =
2365       Cost->getWideningDecision(Instr, VF);
2366   assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
2367          "CM decision should be taken at this point");
2368   if (Decision == LoopVectorizationCostModel::CM_Interleave)
2369     return vectorizeInterleaveGroup(Instr, State, Addr, BlockInMask);
2370 
2371   Type *ScalarDataTy = getMemInstValueType(Instr);
2372   Type *DataTy = VectorType::get(ScalarDataTy, VF);
2373   // An alignment of 0 means target ABI alignment. We need to use the scalar's
2374   // target ABI alignment in such a case.
2375   const DataLayout &DL = Instr->getModule()->getDataLayout();
2376   const Align Alignment =
2377       DL.getValueOrABITypeAlignment(getLoadStoreAlignment(Instr), ScalarDataTy);
2378 
2379   // Determine if the pointer operand of the access is either consecutive or
2380   // reverse consecutive.
2381   bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
2382   bool ConsecutiveStride =
2383       Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
2384   bool CreateGatherScatter =
2385       (Decision == LoopVectorizationCostModel::CM_GatherScatter);
2386 
2387   // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
2388   // gather/scatter. Otherwise Decision should have been to Scalarize.
2389   assert((ConsecutiveStride || CreateGatherScatter) &&
2390          "The instruction should be scalarized");
2391   (void)ConsecutiveStride;
2392 
2393   VectorParts BlockInMaskParts(UF);
2394   bool isMaskRequired = BlockInMask;
2395   if (isMaskRequired)
2396     for (unsigned Part = 0; Part < UF; ++Part)
2397       BlockInMaskParts[Part] = State.get(BlockInMask, Part);
2398 
2399   const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
2400     // Calculate the pointer for the specific unroll-part.
2401     GetElementPtrInst *PartPtr = nullptr;
2402 
2403     bool InBounds = false;
2404     if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
2405       InBounds = gep->isInBounds();
2406 
2407     if (Reverse) {
2408       // If the address is consecutive but reversed, then the
2409       // wide store needs to start at the last vector element.
2410       PartPtr = cast<GetElementPtrInst>(
2411           Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(-Part * VF)));
2412       PartPtr->setIsInBounds(InBounds);
2413       PartPtr = cast<GetElementPtrInst>(
2414           Builder.CreateGEP(ScalarDataTy, PartPtr, Builder.getInt32(1 - VF)));
2415       PartPtr->setIsInBounds(InBounds);
2416       if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
2417         BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
2418     } else {
2419       PartPtr = cast<GetElementPtrInst>(
2420           Builder.CreateGEP(ScalarDataTy, Ptr, Builder.getInt32(Part * VF)));
2421       PartPtr->setIsInBounds(InBounds);
2422     }
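    // Illustrative sketch of the arithmetic above (assuming VF = 4): for a
    // reverse access, Ptr points at the element of the part's first scalar
    // iteration, which is the highest address touched, so part 0 uses
    // &Ptr[-3] (covering Ptr[-3..0]) and part 1 uses &Ptr[-7]. The loaded or
    // stored vectors (and the mask) are reversed so that lane 0 still
    // corresponds to the first scalar iteration.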
2423 
2424     unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
2425     return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
2426   };
2427 
2428   // Handle Stores:
2429   if (SI) {
2430     setDebugLocFromInst(Builder, SI);
2431 
2432     for (unsigned Part = 0; Part < UF; ++Part) {
2433       Instruction *NewSI = nullptr;
2434       Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part);
2435       if (CreateGatherScatter) {
2436         Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2437         Value *VectorGep = State.get(Addr, Part);
2438         NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep,
2439                                             Alignment.value(), MaskPart);
2440       } else {
2441         if (Reverse) {
2442           // If we store to reverse consecutive memory locations, then we need
2443           // to reverse the order of elements in the stored value.
2444           StoredVal = reverseVector(StoredVal);
2445           // We don't want to update the value in the map as it might be used in
2446           // another expression. So don't call resetVectorValue(StoredVal).
2447         }
2448         auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
2449         if (isMaskRequired)
2450           NewSI = Builder.CreateMaskedStore(
2451               StoredVal, VecPtr, Alignment.value(), BlockInMaskParts[Part]);
2452         else
2453           NewSI =
2454               Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment.value());
2455       }
2456       addMetadata(NewSI, SI);
2457     }
2458     return;
2459   }
2460 
2461   // Handle loads.
2462   assert(LI && "Must have a load instruction");
2463   setDebugLocFromInst(Builder, LI);
2464   for (unsigned Part = 0; Part < UF; ++Part) {
2465     Value *NewLI;
2466     if (CreateGatherScatter) {
2467       Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
2468       Value *VectorGep = State.get(Addr, Part);
2469       NewLI = Builder.CreateMaskedGather(VectorGep, Alignment.value(), MaskPart,
2470                                          nullptr, "wide.masked.gather");
2471       addMetadata(NewLI, LI);
2472     } else {
2473       auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
2474       if (isMaskRequired)
2475         NewLI = Builder.CreateMaskedLoad(
2476             VecPtr, Alignment.value(), BlockInMaskParts[Part],
2477             UndefValue::get(DataTy), "wide.masked.load");
2478       else
2479         NewLI = Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment.value(),
2480                                           "wide.load");
2481 
2482       // Add metadata to the load, but setVectorValue to the reverse shuffle.
2483       addMetadata(NewLI, LI);
2484       if (Reverse)
2485         NewLI = reverseVector(NewLI);
2486     }
2487     VectorLoopValueMap.setVectorValue(Instr, Part, NewLI);
2488   }
2489 }
2490 
2491 void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
2492                                                const VPIteration &Instance,
2493                                                bool IfPredicateInstr) {
2494   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
2495 
2496   setDebugLocFromInst(Builder, Instr);
2497 
2498   // Does this instruction return a value?
2499   bool IsVoidRetTy = Instr->getType()->isVoidTy();
2500 
2501   Instruction *Cloned = Instr->clone();
2502   if (!IsVoidRetTy)
2503     Cloned->setName(Instr->getName() + ".cloned");
2504 
2505   // Replace the operands of the cloned instructions with their scalar
2506   // equivalents in the new loop.
2507   for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
2508     auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance);
2509     Cloned->setOperand(op, NewOp);
2510   }
2511   addNewMetadata(Cloned, Instr);
2512 
2513   // Place the cloned scalar in the new loop.
2514   Builder.Insert(Cloned);
2515 
2516   // Add the cloned scalar to the scalar map entry.
2517   VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
2518 
2519   // If we just cloned a new assumption, add it to the assumption cache.
2520   if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
2521     if (II->getIntrinsicID() == Intrinsic::assume)
2522       AC->registerAssumption(II);
2523 
2524   // End if-block.
2525   if (IfPredicateInstr)
2526     PredicatedInstructions.push_back(Cloned);
2527 }
2528 
2529 PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
2530                                                       Value *End, Value *Step,
2531                                                       Instruction *DL) {
2532   BasicBlock *Header = L->getHeader();
2533   BasicBlock *Latch = L->getLoopLatch();
2534   // As we're just creating this loop, it's possible no latch exists
2535   // yet. If so, use the header as this will be a single block loop.
2536   if (!Latch)
2537     Latch = Header;
2538 
2539   IRBuilder<> Builder(&*Header->getFirstInsertionPt());
2540   Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
2541   setDebugLocFromInst(Builder, OldInst);
2542   auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
2543 
2544   Builder.SetInsertPoint(Latch->getTerminator());
2545   setDebugLocFromInst(Builder, OldInst);
2546 
2547   // Create i+1 and fill the PHINode.
2548   Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
2549   Induction->addIncoming(Start, L->getLoopPreheader());
2550   Induction->addIncoming(Next, Latch);
2551   // Create the compare.
2552   Value *ICmp = Builder.CreateICmpEQ(Next, End);
2553   Builder.CreateCondBr(ICmp, L->getExitBlock(), Header);
2554 
2555   // Now we have two terminators. Remove the old one from the block.
2556   Latch->getTerminator()->eraseFromParent();
2557 
2558   return Induction;
2559 }
2560 
2561 Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
2562   if (TripCount)
2563     return TripCount;
2564 
2565   assert(L && "Create Trip Count for null loop.");
2566   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2567   // Find the loop boundaries.
2568   ScalarEvolution *SE = PSE.getSE();
2569   const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
2570   assert(BackedgeTakenCount != SE->getCouldNotCompute() &&
2571          "Invalid loop count");
2572 
2573   Type *IdxTy = Legal->getWidestInductionType();
2574   assert(IdxTy && "No type for induction");
2575 
2576   // The exit count might have type i64 while the phi has type i32. This can
2577   // happen if we have an induction variable that is sign-extended before the
2578   // compare. The only way we can get a backedge-taken count in that case is
2579   // that the induction variable was signed and therefore does not overflow,
2580   // so the truncation is legal.
2581   if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() >
2582       IdxTy->getPrimitiveSizeInBits())
2583     BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
2584   BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
2585 
2586   // Get the total trip count from the count by adding 1.
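  // For example (illustrative): for a loop running i = 0 .. n-1 the
  // backedge-taken count is n-1, so the trip count expanded below is n.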
2587   const SCEV *ExitCount = SE->getAddExpr(
2588       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
2589 
2590   const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
2591 
2592   // Expand the trip count and place the new instructions in the preheader.
2593   // Notice that the pre-header does not change, only the loop body.
2594   SCEVExpander Exp(*SE, DL, "induction");
2595 
2596   // Count holds the overall loop count (N).
2597   TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
2598                                 L->getLoopPreheader()->getTerminator());
2599 
2600   if (TripCount->getType()->isPointerTy())
2601     TripCount =
2602         CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
2603                                     L->getLoopPreheader()->getTerminator());
2604 
2605   return TripCount;
2606 }
2607 
2608 Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
2609   if (VectorTripCount)
2610     return VectorTripCount;
2611 
2612   Value *TC = getOrCreateTripCount(L);
2613   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
2614 
2615   Type *Ty = TC->getType();
2616   Constant *Step = ConstantInt::get(Ty, VF * UF);
2617 
2618   // If the tail is to be folded by masking, round the number of iterations N
2619   // up to a multiple of Step instead of rounding down. This is done by first
2620   // adding Step-1 and then rounding down. Note that it's ok if this addition
2621   // overflows: the vector induction variable will eventually wrap to zero given
2622   // that it starts at zero and its Step is a power of two; the loop will then
2623   // exit, with the last early-exit vector comparison also producing all-true.
2624   if (Cost->foldTailByMasking()) {
2625     assert(isPowerOf2_32(VF * UF) &&
2626            "VF*UF must be a power of 2 when folding tail by masking");
2627     TC = Builder.CreateAdd(TC, ConstantInt::get(Ty, VF * UF - 1), "n.rnd.up");
2628   }
2629 
2630   // Now we need to generate the expression for the part of the loop that the
2631   // vectorized body will execute. This is equal to N - (N % Step) if scalar
2632   // iterations are not required for correctness, or N - Step, otherwise. Step
2633   // is equal to the vectorization factor (number of SIMD elements) times the
2634   // unroll factor (number of SIMD instructions).
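  // For example (an illustrative sketch): with VF = 4 and UF = 2 (Step = 8)
  // and N = 21, n.mod.vf = 5 and n.vec = 16. If the tail is folded by
  // masking, N was already rounded up to 28 above, giving n.mod.vf = 4 and
  // n.vec = 24.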
2635   Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
2636 
2637   // If there is a non-reversed interleaved group that may speculatively access
2638   // memory out-of-bounds, we need to ensure that there will be at least one
2639   // iteration of the scalar epilogue loop. Thus, if the step evenly divides
2640   // the trip count, we set the remainder to be equal to the step. If the step
2641   // does not evenly divide the trip count, no adjustment is necessary since
2642   // there will already be scalar iterations. Note that the minimum iterations
2643   // check ensures that N >= Step.
2644   if (VF > 1 && Cost->requiresScalarEpilogue()) {
2645     auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
2646     R = Builder.CreateSelect(IsZero, Step, R);
2647   }
2648 
2649   VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
2650 
2651   return VectorTripCount;
2652 }
2653 
2654 Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
2655                                                    const DataLayout &DL) {
2656   // Verify that V is a vector type with same number of elements as DstVTy.
2657   unsigned VF = DstVTy->getNumElements();
2658   VectorType *SrcVecTy = cast<VectorType>(V->getType());
2659   assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
2660   Type *SrcElemTy = SrcVecTy->getElementType();
2661   Type *DstElemTy = DstVTy->getElementType();
2662   assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
2663          "Vector elements must have same size");
2664 
2665   // Do a direct cast if element types are castable.
2666   if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
2667     return Builder.CreateBitOrPointerCast(V, DstVTy);
2668   }
2669   // V cannot be directly casted to desired vector type.
2670   // May happen when V is a floating point vector but DstVTy is a vector of
2671   // pointers or vice-versa. Handle this using a two-step bitcast using an
2672   // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
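  // For example (illustrative, assuming 64-bit pointers): casting <2 x double>
  // to <2 x i8*> goes through the intermediate type <2 x i64>.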
2673   assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
2674          "Only one type should be a pointer type");
2675   assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
2676          "Only one type should be a floating point type");
2677   Type *IntTy =
2678       IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
2679   VectorType *VecIntTy = VectorType::get(IntTy, VF);
2680   Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
2681   return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
2682 }
2683 
2684 void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
2685                                                          BasicBlock *Bypass) {
2686   Value *Count = getOrCreateTripCount(L);
2687   // Reuse existing vector loop preheader for TC checks.
2688   // Note that new preheader block is generated for vector loop.
2689   BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
2690   IRBuilder<> Builder(TCCheckBlock->getTerminator());
2691 
2692   // Generate code to check if the loop's trip count is less than VF * UF, or
2693   // equal to it in case a scalar epilogue is required; this implies that the
2694   // vector trip count is zero. This check also covers the case where adding one
2695   // to the backedge-taken count overflowed leading to an incorrect trip count
2696   // of zero. In this case we will also jump to the scalar loop.
2697   auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
2698                                           : ICmpInst::ICMP_ULT;
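  // For example (illustrative): with VF = 4 and UF = 2 the scalar loop is
  // entered when Count < 8, or when Count <= 8 if a scalar epilogue is
  // required.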
2699 
2700   // If tail is to be folded, vector loop takes care of all iterations.
2701   Value *CheckMinIters = Builder.getFalse();
2702   if (!Cost->foldTailByMasking())
2703     CheckMinIters = Builder.CreateICmp(
2704         P, Count, ConstantInt::get(Count->getType(), VF * UF),
2705         "min.iters.check");
2706 
2707   // Create new preheader for vector loop.
2708   LoopVectorPreHeader =
2709       SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
2710                  "vector.ph");
2711 
2712   assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
2713                                DT->getNode(Bypass)->getIDom()) &&
2714          "TC check is expected to dominate Bypass");
2715 
2716   // Update dominator for Bypass & LoopExit.
2717   DT->changeImmediateDominator(Bypass, TCCheckBlock);
2718   DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
2719 
2720   ReplaceInstWithInst(
2721       TCCheckBlock->getTerminator(),
2722       BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
2723   LoopBypassBlocks.push_back(TCCheckBlock);
2724 }
2725 
2726 void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
2727   // Reuse existing vector loop preheader for SCEV checks.
2728   // Note that new preheader block is generated for vector loop.
2729   BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader;
2730 
2731   // Generate the code to check the SCEV assumptions that we have made.
2732   // We want the new basic block to start at the first instruction in a
2733   // sequence of instructions that form a check.
2734   SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
2735                    "scev.check");
2736   Value *SCEVCheck = Exp.expandCodeForPredicate(
2737       &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator());
2738 
2739   if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
2740     if (C->isZero())
2741       return;
2742 
2743   assert(!SCEVCheckBlock->getParent()->hasOptSize() &&
2744          "Cannot SCEV check stride or overflow when optimizing for size");
2745 
2746   SCEVCheckBlock->setName("vector.scevcheck");
2747   // Create new preheader for vector loop.
2748   LoopVectorPreHeader =
2749       SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI,
2750                  nullptr, "vector.ph");
2751 
2752   // Update dominator only if this is the first RT check.
2753   if (LoopBypassBlocks.empty()) {
2754     DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
2755     DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
2756   }
2757 
2758   ReplaceInstWithInst(
2759       SCEVCheckBlock->getTerminator(),
2760       BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck));
2761   LoopBypassBlocks.push_back(SCEVCheckBlock);
2762   AddedSafetyChecks = true;
2763 }
2764 
2765 void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
2766   // VPlan-native path does not do any analysis for runtime checks currently.
2767   if (EnableVPlanNativePath)
2768     return;
2769 
2770   // Reuse existing vector loop preheader for runtime memory checks.
2771   // Note that new preheader block is generated for vector loop.
2772   BasicBlock *const MemCheckBlock = L->getLoopPreheader();
2773 
2774   // Generate the code that checks at runtime whether arrays overlap. We put the
2775   // checks into a separate block to make the more common case of few elements
2776   // faster.
2777   Instruction *FirstCheckInst;
2778   Instruction *MemRuntimeCheck;
2779   std::tie(FirstCheckInst, MemRuntimeCheck) =
2780       Legal->getLAI()->addRuntimeChecks(MemCheckBlock->getTerminator());
2781   if (!MemRuntimeCheck)
2782     return;
2783 
2784   if (MemCheckBlock->getParent()->hasOptSize()) {
2785     assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
2786            "Cannot emit memory checks when optimizing for size, unless forced "
2787            "to vectorize.");
2788     ORE->emit([&]() {
2789       return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
2790                                         L->getStartLoc(), L->getHeader())
2791              << "Code-size may be reduced by not forcing "
2792                 "vectorization, or by source-code modifications "
2793                 "eliminating the need for runtime checks "
2794                 "(e.g., adding 'restrict').";
2795     });
2796   }
2797 
2798   MemCheckBlock->setName("vector.memcheck");
2799   // Create new preheader for vector loop.
2800   LoopVectorPreHeader =
2801       SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr,
2802                  "vector.ph");
2803 
2804   // Update dominator only if this is the first RT check.
2805   if (LoopBypassBlocks.empty()) {
2806     DT->changeImmediateDominator(Bypass, MemCheckBlock);
2807     DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock);
2808   }
2809 
2810   ReplaceInstWithInst(
2811       MemCheckBlock->getTerminator(),
2812       BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheck));
2813   LoopBypassBlocks.push_back(MemCheckBlock);
2814   AddedSafetyChecks = true;
2815 
2816   // We currently don't use LoopVersioning for the actual loop cloning but we
2817   // still use it to add the noalias metadata.
2818   LVer = std::make_unique<LoopVersioning>(*Legal->getLAI(), OrigLoop, LI, DT,
2819                                           PSE.getSE());
2820   LVer->prepareNoAliasMetadata();
2821 }
2822 
2823 Value *InnerLoopVectorizer::emitTransformedIndex(
2824     IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
2825     const InductionDescriptor &ID) const {
2826 
2827   SCEVExpander Exp(*SE, DL, "induction");
2828   auto Step = ID.getStep();
2829   auto StartValue = ID.getStartValue();
2830   assert(Index->getType() == Step->getType() &&
2831          "Index type does not match StepValue type");
2832 
2833   // Note: the IR at this point is broken. We cannot use SE to create any new
2834   // SCEV and then expand it, hoping that SCEV's simplification will give us
2835   // more optimal code. Unfortunately, attempting to do so on invalid IR may
2836   // lead to various SCEV crashes. So all we can do is use the builder and
2837   // rely on InstCombine for future simplifications. Here we handle only some
2838   // trivial cases.
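  // In effect, every case below computes StartValue op (Index * Step): a
  // plain add/sub for integer inductions, a GEP for pointer inductions, and
  // the recorded FAdd/FSub for floating-point inductions.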
2839   auto CreateAdd = [&B](Value *X, Value *Y) {
2840     assert(X->getType() == Y->getType() && "Types don't match!");
2841     if (auto *CX = dyn_cast<ConstantInt>(X))
2842       if (CX->isZero())
2843         return Y;
2844     if (auto *CY = dyn_cast<ConstantInt>(Y))
2845       if (CY->isZero())
2846         return X;
2847     return B.CreateAdd(X, Y);
2848   };
2849 
2850   auto CreateMul = [&B](Value *X, Value *Y) {
2851     assert(X->getType() == Y->getType() && "Types don't match!");
2852     if (auto *CX = dyn_cast<ConstantInt>(X))
2853       if (CX->isOne())
2854         return Y;
2855     if (auto *CY = dyn_cast<ConstantInt>(Y))
2856       if (CY->isOne())
2857         return X;
2858     return B.CreateMul(X, Y);
2859   };
2860 
2861   switch (ID.getKind()) {
2862   case InductionDescriptor::IK_IntInduction: {
2863     assert(Index->getType() == StartValue->getType() &&
2864            "Index type does not match StartValue type");
2865     if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
2866       return B.CreateSub(StartValue, Index);
2867     auto *Offset = CreateMul(
2868         Index, Exp.expandCodeFor(Step, Index->getType(), &*B.GetInsertPoint()));
2869     return CreateAdd(StartValue, Offset);
2870   }
2871   case InductionDescriptor::IK_PtrInduction: {
2872     assert(isa<SCEVConstant>(Step) &&
2873            "Expected constant step for pointer induction");
2874     return B.CreateGEP(
2875         StartValue->getType()->getPointerElementType(), StartValue,
2876         CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(),
2877                                            &*B.GetInsertPoint())));
2878   }
2879   case InductionDescriptor::IK_FpInduction: {
2880     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
2881     auto InductionBinOp = ID.getInductionBinOp();
2882     assert(InductionBinOp &&
2883            (InductionBinOp->getOpcode() == Instruction::FAdd ||
2884             InductionBinOp->getOpcode() == Instruction::FSub) &&
2885            "Original bin op should be defined for FP induction");
2886 
2887     Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
2888 
2889     // Floating point operations had to be 'fast' to enable the induction.
2890     FastMathFlags Flags;
2891     Flags.setFast();
2892 
2893     Value *MulExp = B.CreateFMul(StepValue, Index);
2894     if (isa<Instruction>(MulExp))
2895       // We have to check because MulExp may be a constant.
2896       cast<Instruction>(MulExp)->setFastMathFlags(Flags);
2897 
2898     Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
2899                                "induction");
2900     if (isa<Instruction>(BOp))
2901       cast<Instruction>(BOp)->setFastMathFlags(Flags);
2902 
2903     return BOp;
2904   }
2905   case InductionDescriptor::IK_NoInduction:
2906     return nullptr;
2907   }
2908   llvm_unreachable("invalid enum");
2909 }
2910 
2911 BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
2912   /*
2913    In this function we generate a new loop. The new loop will contain
2914    the vectorized instructions while the old loop will continue to run the
2915    scalar remainder.
2916 
2917        [ ] <-- loop iteration number check.
2918     /   |
2919    /    v
2920   |    [ ] <-- vector loop bypass (may consist of multiple blocks).
2921   |  /  |
2922   | /   v
2923   ||   [ ]     <-- vector pre header.
2924   |/    |
2925   |     v
2926   |    [  ] \
2927   |    [  ]_|   <-- vector loop.
2928   |     |
2929   |     v
2930   |   -[ ]   <--- middle-block.
2931   |  /  |
2932   | /   v
2933   -|- >[ ]     <--- new preheader.
2934    |    |
2935    |    v
2936    |   [ ] \
2937    |   [ ]_|   <-- old scalar loop to handle remainder.
2938     \   |
2939      \  v
2940       >[ ]     <-- exit block.
2941    ...
2942    */
2943 
2944   MDNode *OrigLoopID = OrigLoop->getLoopID();
2945 
2946   // Some loops have a single integer induction variable, while other loops
2947   // don't. One example is loops over C++ iterators, which often have multiple
2948   // pointer induction variables. In the code below we also support the case
2949   // where we don't have a single induction variable.
2950   //
2951   // We try to obtain an induction variable from the original loop as hard
2952   // as possible. However if we don't find one that:
2953   //   - is an integer
2954   //   - counts from zero, stepping by one
2955   //   - is the size of the widest induction variable type
2956   // then we create a new one.
2957   OldInduction = Legal->getPrimaryInduction();
2958   Type *IdxTy = Legal->getWidestInductionType();
2959 
2960   // Split the single block loop into the two loop structure described above.
2961   LoopScalarBody = OrigLoop->getHeader();
2962   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
2963   LoopExitBlock = OrigLoop->getExitBlock();
2964   assert(LoopExitBlock && "Must have an exit block");
2965   assert(LoopVectorPreHeader && "Invalid loop structure");
2966 
2967   LoopMiddleBlock =
2968       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
2969                  LI, nullptr, "middle.block");
2970   LoopScalarPreHeader =
2971       SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
2972                  nullptr, "scalar.ph");
2973   // We intentionally don't let SplitBlock update LoopInfo since
2974   // LoopVectorBody should belong to a different loop than LoopVectorPreHeader.
2975   // LoopVectorBody is explicitly added to the correct place a few lines later.
2976   LoopVectorBody =
2977       SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
2978                  nullptr, nullptr, "vector.body");
2979 
2980   // Update dominator for loop exit.
2981   DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
2982 
2983   // Create and register the new vector loop.
2984   Loop *Lp = LI->AllocateLoop();
2985   Loop *ParentLoop = OrigLoop->getParentLoop();
2986 
2987   // Insert the new loop into the loop nest and register the new basic blocks
2988   // before calling any utilities such as SCEV that require valid LoopInfo.
2989   if (ParentLoop) {
2990     ParentLoop->addChildLoop(Lp);
2991   } else {
2992     LI->addTopLevelLoop(Lp);
2993   }
2994   Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
2995 
2996   // Find the loop boundaries.
2997   Value *Count = getOrCreateTripCount(Lp);
2998 
2999   Value *StartIdx = ConstantInt::get(IdxTy, 0);
3000 
3001   // Now, compare the new count to zero. If it is zero skip the vector loop and
3002   // jump to the scalar loop. This check also covers the case where the
3003   // backedge-taken count is uint##_max: adding one to it will overflow leading
3004   // to an incorrect trip count of zero. In this (rare) case we will also jump
3005   // to the scalar loop.
3006   emitMinimumIterationCountCheck(Lp, LoopScalarPreHeader);
3007 
3008   // Generate the code to check any assumptions that we've made for SCEV
3009   // expressions.
3010   emitSCEVChecks(Lp, LoopScalarPreHeader);
3011 
3012   // Generate the code that checks at runtime whether arrays overlap. We put the
3013   // checks into a separate block to make the more common case of few elements
3014   // faster.
3015   emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
3016 
3017   // Generate the induction variable.
3018   // The loop step is equal to the vectorization factor (num of SIMD elements)
3019   // times the unroll factor (num of SIMD instructions).
3020   Value *CountRoundDown = getOrCreateVectorTripCount(Lp);
3021   Constant *Step = ConstantInt::get(IdxTy, VF * UF);
3022   Induction =
3023       createInductionVariable(Lp, StartIdx, CountRoundDown, Step,
3024                               getDebugLocFromInstOrOperands(OldInduction));
3025 
3026   // We are going to resume the execution of the scalar loop.
3027   // Go over all of the induction variables that we found and fix the
3028   // PHIs that are left in the scalar version of the loop.
3029   // The starting values of PHI nodes depend on the counter of the last
3030   // iteration in the vectorized loop.
3031   // If we come from a bypass edge then we need to start from the original
3032   // start value.
3033 
3034   // This variable saves the new starting index for the scalar loop. It is used
3035   // to test if there are any tail iterations left once the vector loop has
3036   // completed.
3037   LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
3038   for (auto &InductionEntry : *List) {
3039     PHINode *OrigPhi = InductionEntry.first;
3040     InductionDescriptor II = InductionEntry.second;
3041 
3042     // Create phi nodes to merge from the backedge-taken check block.
3043     PHINode *BCResumeVal =
3044         PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
3045                         LoopScalarPreHeader->getTerminator());
3046     // Copy original phi DL over to the new one.
3047     BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
3048     Value *&EndValue = IVEndValues[OrigPhi];
3049     if (OrigPhi == OldInduction) {
3050       // We know what the end value is.
3051       EndValue = CountRoundDown;
3052     } else {
3053       IRBuilder<> B(Lp->getLoopPreheader()->getTerminator());
3054       Type *StepType = II.getStep()->getType();
3055       Instruction::CastOps CastOp =
3056           CastInst::getCastOpcode(CountRoundDown, true, StepType, true);
3057       Value *CRD = B.CreateCast(CastOp, CountRoundDown, StepType, "cast.crd");
3058       const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
3059       EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
3060       EndValue->setName("ind.end");
3061     }
3062 
3063     // The new PHI merges the original incoming value, in case of a bypass,
3064     // or the value at the end of the vectorized loop.
3065     BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
3066 
3067     // Fix the scalar body counter (PHI node).
3068     // The old induction's phi node in the scalar body needs the truncated
3069     // value.
3070     for (BasicBlock *BB : LoopBypassBlocks)
3071       BCResumeVal->addIncoming(II.getStartValue(), BB);
3072     OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
3073   }
3074 
3075   // We need the OrigLoop (scalar loop part) latch terminator to help
3076   // produce correct debug info for the middle block BB instructions.
3077   // The legality check stage guarantees that the loop will have a single
3078   // latch.
3079   assert(isa<BranchInst>(OrigLoop->getLoopLatch()->getTerminator()) &&
3080          "Scalar loop latch terminator isn't a branch");
3081   BranchInst *ScalarLatchBr =
3082       cast<BranchInst>(OrigLoop->getLoopLatch()->getTerminator());
3083 
3084   // Add a check in the middle block to see if we have completed
3085   // all of the iterations in the first vector loop.
3086   // If (N - N%VF) == N, then we *don't* need to run the remainder.
3087   // If tail is to be folded, we know we don't need to run the remainder.
3088   Value *CmpN = Builder.getTrue();
3089   if (!Cost->foldTailByMasking()) {
3090     CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
3091                            CountRoundDown, "cmp.n",
3092                            LoopMiddleBlock->getTerminator());
3093 
3094     // Here we use the same DebugLoc as the scalar loop latch branch instead
3095     // of the corresponding compare because they may have ended up with
3096     // different line numbers and we want to avoid awkward line stepping while
3097     // debugging. E.g., if the compare has a line number inside the loop.
3098     cast<Instruction>(CmpN)->setDebugLoc(ScalarLatchBr->getDebugLoc());
3099   }
3100 
3101   BranchInst *BrInst =
3102       BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, CmpN);
3103   BrInst->setDebugLoc(ScalarLatchBr->getDebugLoc());
3104   ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
3105 
3106   // Get ready to start creating new instructions into the vectorized body.
3107   assert(LoopVectorPreHeader == Lp->getLoopPreheader() &&
3108          "Inconsistent vector loop preheader");
3109   Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
3110 
3111   Optional<MDNode *> VectorizedLoopID =
3112       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
3113                                       LLVMLoopVectorizeFollowupVectorized});
3114   if (VectorizedLoopID.hasValue()) {
3115     Lp->setLoopID(VectorizedLoopID.getValue());
3116 
3117     // Do not setAlreadyVectorized if loop attributes have been defined
3118     // explicitly.
3119     return LoopVectorPreHeader;
3120   }
3121 
3122   // Keep all loop hints from the original loop on the vector loop (we'll
3123   // replace the vectorizer-specific hints below).
3124   if (MDNode *LID = OrigLoop->getLoopID())
3125     Lp->setLoopID(LID);
3126 
3127   LoopVectorizeHints Hints(Lp, true, *ORE);
3128   Hints.setAlreadyVectorized();
3129 
3130 #ifdef EXPENSIVE_CHECKS
3131   assert(DT->verify(DominatorTree::VerificationLevel::Fast));
3132   LI->verify(*DT);
3133 #endif
3134 
3135   return LoopVectorPreHeader;
3136 }
3137 
3138 // Fix up external users of the induction variable. At this point, we are
3139 // in LCSSA form, with all external PHIs that use the IV having one input value,
3140 // coming from the remainder loop. We need those PHIs to also have a correct
3141 // value for the IV when arriving directly from the middle block.
3142 void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
3143                                        const InductionDescriptor &II,
3144                                        Value *CountRoundDown, Value *EndValue,
3145                                        BasicBlock *MiddleBlock) {
3146   // There are two kinds of external IV usages - those that use the value
3147   // computed in the last iteration (the PHI) and those that use the penultimate
3148   // value (the value that feeds into the phi from the loop latch).
3149   // We allow both, but they, obviously, have different values.
3150 
3151   assert(OrigLoop->getExitBlock() && "Expected a single exit block");
3152 
3153   DenseMap<Value *, Value *> MissingVals;
3154 
3155   // An external user of the last iteration's value should see the value that
3156   // the remainder loop uses to initialize its own IV.
3157   Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
3158   for (User *U : PostInc->users()) {
3159     Instruction *UI = cast<Instruction>(U);
3160     if (!OrigLoop->contains(UI)) {
3161       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3162       MissingVals[UI] = EndValue;
3163     }
3164   }
3165 
3166   // An external user of the penultimate value needs to see EndValue - Step.
3167   // The simplest way to get this is to recompute it from the constituent SCEVs,
3168   // that is Start + (Step * (CRD - 1)).
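  // For example (illustrative): if the IV counts 0, 1, ..., n-1 with step 1,
  // an external user of the pre-increment value must see Start + (CRD - 1)
  // when control arrives directly from the middle block.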
3169   for (User *U : OrigPhi->users()) {
3170     auto *UI = cast<Instruction>(U);
3171     if (!OrigLoop->contains(UI)) {
3172       const DataLayout &DL =
3173           OrigLoop->getHeader()->getModule()->getDataLayout();
3174       assert(isa<PHINode>(UI) && "Expected LCSSA form");
3175 
3176       IRBuilder<> B(MiddleBlock->getTerminator());
3177       Value *CountMinusOne = B.CreateSub(
3178           CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
3179       Value *CMO =
3180           !II.getStep()->getType()->isIntegerTy()
3181               ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
3182                              II.getStep()->getType())
3183               : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
3184       CMO->setName("cast.cmo");
3185       Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
3186       Escape->setName("ind.escape");
3187       MissingVals[UI] = Escape;
3188     }
3189   }
3190 
3191   for (auto &I : MissingVals) {
3192     PHINode *PHI = cast<PHINode>(I.first);
3193     // One corner case we have to handle is two IVs "chasing" each other,
3194     // that is %IV2 = phi [...], [ %IV1, %latch ]
3195     // In this case, if IV1 has an external use, we need to avoid adding both
3196     // "last value of IV1" and "penultimate value of IV2". So, verify that we
3197     // don't already have an incoming value for the middle block.
3198     if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
3199       PHI->addIncoming(I.second, MiddleBlock);
3200   }
3201 }
3202 
3203 namespace {
3204 
3205 struct CSEDenseMapInfo {
3206   static bool canHandle(const Instruction *I) {
3207     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
3208            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
3209   }
3210 
3211   static inline Instruction *getEmptyKey() {
3212     return DenseMapInfo<Instruction *>::getEmptyKey();
3213   }
3214 
3215   static inline Instruction *getTombstoneKey() {
3216     return DenseMapInfo<Instruction *>::getTombstoneKey();
3217   }
3218 
3219   static unsigned getHashValue(const Instruction *I) {
3220     assert(canHandle(I) && "Unknown instruction!");
3221     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
3222                                                            I->value_op_end()));
3223   }
3224 
3225   static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
3226     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
3227         LHS == getTombstoneKey() || RHS == getTombstoneKey())
3228       return LHS == RHS;
3229     return LHS->isIdenticalTo(RHS);
3230   }
3231 };
3232 
3233 } // end anonymous namespace
3234 
3235 /// Perform CSE of induction variable instructions.
3236 static void cse(BasicBlock *BB) {
3237   // Perform simple cse.
3238   SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
3239   for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
3240     Instruction *In = &*I++;
3241 
3242     if (!CSEDenseMapInfo::canHandle(In))
3243       continue;
3244 
3245     // Check if we can replace this instruction with any of the
3246     // visited instructions.
3247     if (Instruction *V = CSEMap.lookup(In)) {
3248       In->replaceAllUsesWith(V);
3249       In->eraseFromParent();
3250       continue;
3251     }
3252 
3253     CSEMap[In] = In;
3254   }
3255 }
3256 
3257 unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
3258                                                        unsigned VF,
3259                                                        bool &NeedToScalarize) {
3260   Function *F = CI->getCalledFunction();
3261   StringRef FnName = CI->getCalledFunction()->getName();
3262   Type *ScalarRetTy = CI->getType();
3263   SmallVector<Type *, 4> Tys, ScalarTys;
3264   for (auto &ArgOp : CI->arg_operands())
3265     ScalarTys.push_back(ArgOp->getType());
3266 
3267   // Estimate cost of scalarized vector call. The source operands are assumed
3268   // to be vectors, so we need to extract individual elements from there,
3269   // execute VF scalar calls, and then gather the result into the vector return
3270   // value.
3271   unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys);
3272   if (VF == 1)
3273     return ScalarCallCost;
3274 
3275   // Compute corresponding vector type for return value and arguments.
3276   Type *RetTy = ToVectorTy(ScalarRetTy, VF);
3277   for (Type *ScalarTy : ScalarTys)
3278     Tys.push_back(ToVectorTy(ScalarTy, VF));
3279 
3280   // Compute costs of unpacking argument values for the scalar calls and
3281   // packing the return values to a vector.
3282   unsigned ScalarizationCost = getScalarizationOverhead(CI, VF);
3283 
3284   unsigned Cost = ScalarCallCost * VF + ScalarizationCost;
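  // An illustrative reading of the formula above: with VF = 4 the cost covers
  // four scalar calls plus the overhead of extracting the scalar arguments
  // from vectors and packing the scalar results back into one. It is compared
  // against the cost of a single vector call below when a vectorized variant
  // of the function is available.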
3285 
3286   // If we can't emit a vector call for this function, then the currently found
3287   // cost is the cost we need to return.
3288   NeedToScalarize = true;
3289   if (!TLI || !TLI->isFunctionVectorizable(FnName, VF) || CI->isNoBuiltin())
3290     return Cost;
3291 
3292   // If the corresponding vector cost is cheaper, return its cost.
3293   unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys);
3294   if (VectorCallCost < Cost) {
3295     NeedToScalarize = false;
3296     return VectorCallCost;
3297   }
3298   return Cost;
3299 }
3300 
3301 unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
3302                                                             unsigned VF) {
3303   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
3304   assert(ID && "Expected intrinsic call!");
3305 
3306   FastMathFlags FMF;
3307   if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
3308     FMF = FPMO->getFastMathFlags();
3309 
3310   SmallVector<Value *, 4> Operands(CI->arg_operands());
3311   return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF);
3312 }
3313 
3314 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
3315   auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3316   auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3317   return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
3318 }
3319 static Type *largestIntegerVectorType(Type *T1, Type *T2) {
3320   auto *I1 = cast<IntegerType>(T1->getVectorElementType());
3321   auto *I2 = cast<IntegerType>(T2->getVectorElementType());
3322   return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
3323 }
3324 
3325 void InnerLoopVectorizer::truncateToMinimalBitwidths() {
3326   // For every instruction `I` in MinBWs, truncate the operands, create a
3327   // truncated version of `I` and reextend its result. InstCombine runs
3328   // later and will remove any ext/trunc pairs.
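  // For example (an illustrative sketch): if MinBWs records 8 bits for a value
  // widened into a <4 x i32> add, the add is re-emitted on <4 x i8> operands
  // (via zext/trunc of its inputs) and the result is zero-extended back to
  // <4 x i32>.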
3329   SmallPtrSet<Value *, 4> Erased;
3330   for (const auto &KV : Cost->getMinimalBitwidths()) {
3331     // If the value wasn't vectorized, we must maintain the original scalar
3332     // type. The absence of the value from VectorLoopValueMap indicates that it
3333     // wasn't vectorized.
3334     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3335       continue;
3336     for (unsigned Part = 0; Part < UF; ++Part) {
3337       Value *I = getOrCreateVectorValue(KV.first, Part);
3338       if (Erased.find(I) != Erased.end() || I->use_empty() ||
3339           !isa<Instruction>(I))
3340         continue;
3341       Type *OriginalTy = I->getType();
3342       Type *ScalarTruncatedTy =
3343           IntegerType::get(OriginalTy->getContext(), KV.second);
3344       Type *TruncatedTy = VectorType::get(ScalarTruncatedTy,
3345                                           OriginalTy->getVectorNumElements());
3346       if (TruncatedTy == OriginalTy)
3347         continue;
3348 
3349       IRBuilder<> B(cast<Instruction>(I));
3350       auto ShrinkOperand = [&](Value *V) -> Value * {
3351         if (auto *ZI = dyn_cast<ZExtInst>(V))
3352           if (ZI->getSrcTy() == TruncatedTy)
3353             return ZI->getOperand(0);
3354         return B.CreateZExtOrTrunc(V, TruncatedTy);
3355       };
3356 
3357       // The actual instruction modification depends on the instruction type,
3358       // unfortunately.
3359       Value *NewI = nullptr;
3360       if (auto *BO = dyn_cast<BinaryOperator>(I)) {
3361         NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
3362                              ShrinkOperand(BO->getOperand(1)));
3363 
3364         // Any wrapping introduced by shrinking this operation shouldn't be
3365         // considered undefined behavior. So, we can't unconditionally copy
3366         // arithmetic wrapping flags to NewI.
3367         cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
3368       } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
3369         NewI =
3370             B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
3371                          ShrinkOperand(CI->getOperand(1)));
3372       } else if (auto *SI = dyn_cast<SelectInst>(I)) {
3373         NewI = B.CreateSelect(SI->getCondition(),
3374                               ShrinkOperand(SI->getTrueValue()),
3375                               ShrinkOperand(SI->getFalseValue()));
3376       } else if (auto *CI = dyn_cast<CastInst>(I)) {
3377         switch (CI->getOpcode()) {
3378         default:
3379           llvm_unreachable("Unhandled cast!");
3380         case Instruction::Trunc:
3381           NewI = ShrinkOperand(CI->getOperand(0));
3382           break;
3383         case Instruction::SExt:
3384           NewI = B.CreateSExtOrTrunc(
3385               CI->getOperand(0),
3386               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3387           break;
3388         case Instruction::ZExt:
3389           NewI = B.CreateZExtOrTrunc(
3390               CI->getOperand(0),
3391               smallestIntegerVectorType(OriginalTy, TruncatedTy));
3392           break;
3393         }
3394       } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
3395         auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements();
3396         auto *O0 = B.CreateZExtOrTrunc(
3397             SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
3398         auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements();
3399         auto *O1 = B.CreateZExtOrTrunc(
3400             SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
3401 
3402         NewI = B.CreateShuffleVector(O0, O1, SI->getMask());
3403       } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
3404         // Don't do anything with the operands, just extend the result.
3405         continue;
3406       } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
3407         auto Elements = IE->getOperand(0)->getType()->getVectorNumElements();
3408         auto *O0 = B.CreateZExtOrTrunc(
3409             IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3410         auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
3411         NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
3412       } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
3413         auto Elements = EE->getOperand(0)->getType()->getVectorNumElements();
3414         auto *O0 = B.CreateZExtOrTrunc(
3415             EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
3416         NewI = B.CreateExtractElement(O0, EE->getOperand(2));
3417       } else {
3418         // If we don't know what to do, be conservative and don't do anything.
3419         continue;
3420       }
3421 
3422       // Lastly, extend the result.
3423       NewI->takeName(cast<Instruction>(I));
3424       Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
3425       I->replaceAllUsesWith(Res);
3426       cast<Instruction>(I)->eraseFromParent();
3427       Erased.insert(I);
3428       VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
3429     }
3430   }
3431 
3432   // We'll have created a bunch of ZExts that are now parentless. Clean up.
3433   for (const auto &KV : Cost->getMinimalBitwidths()) {
3434     // If the value wasn't vectorized, we must maintain the original scalar
3435     // type. The absence of the value from VectorLoopValueMap indicates that it
3436     // wasn't vectorized.
3437     if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
3438       continue;
3439     for (unsigned Part = 0; Part < UF; ++Part) {
3440       Value *I = getOrCreateVectorValue(KV.first, Part);
3441       ZExtInst *Inst = dyn_cast<ZExtInst>(I);
3442       if (Inst && Inst->use_empty()) {
3443         Value *NewI = Inst->getOperand(0);
3444         Inst->eraseFromParent();
3445         VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
3446       }
3447     }
3448   }
3449 }
3450 
3451 void InnerLoopVectorizer::fixVectorizedLoop() {
3452   // Insert truncates and extends for any truncated instructions as hints to
3453   // InstCombine.
3454   if (VF > 1)
3455     truncateToMinimalBitwidths();
3456 
3457   // Fix widened non-induction PHIs by setting up the PHI operands.
3458   if (OrigPHIsToFix.size()) {
3459     assert(EnableVPlanNativePath &&
3460            "Unexpected non-induction PHIs for fixup in non VPlan-native path");
3461     fixNonInductionPHIs();
3462   }
3463 
3464   // At this point every instruction in the original loop is widened to a
3465   // vector form. Now we need to fix the recurrences in the loop. These PHI
3466   // nodes are currently empty because we did not want to introduce cycles.
3467   // This is the second stage of vectorizing recurrences.
3468   fixCrossIterationPHIs();
3469 
3470   // Forget the original basic block.
3471   PSE.getSE()->forgetLoop(OrigLoop);
3472 
3473   // Fix-up external users of the induction variables.
3474   for (auto &Entry : *Legal->getInductionVars())
3475     fixupIVUsers(Entry.first, Entry.second,
3476                  getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3477                  IVEndValues[Entry.first], LoopMiddleBlock);
3478 
3479   fixLCSSAPHIs();
3480   for (Instruction *PI : PredicatedInstructions)
3481     sinkScalarOperands(&*PI);
3482 
3483   // Remove redundant induction instructions.
3484   cse(LoopVectorBody);
3485 }
3486 
3487 void InnerLoopVectorizer::fixCrossIterationPHIs() {
3488   // In order to support recurrences we need to be able to vectorize Phi nodes.
3489   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
3490   // stage #2: We now need to fix the recurrences by adding incoming edges to
3491   // the currently empty PHI nodes. At this point every instruction in the
3492   // original loop is widened to a vector form so we can use them to construct
3493   // the incoming edges.
3494   for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
3495     // Handle first-order recurrences and reductions that need to be fixed.
3496     if (Legal->isFirstOrderRecurrence(&Phi))
3497       fixFirstOrderRecurrence(&Phi);
3498     else if (Legal->isReductionVariable(&Phi))
3499       fixReduction(&Phi);
3500   }
3501 }
3502 
3503 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
3504   // This is the second phase of vectorizing first-order recurrences. An
3505   // overview of the transformation is described below. Suppose we have the
3506   // following loop.
3507   //
3508   //   for (int i = 0; i < n; ++i)
3509   //     b[i] = a[i] - a[i - 1];
3510   //
3511   // There is a first-order recurrence on "a". For this loop, the shorthand
3512   // scalar IR looks like:
3513   //
3514   //   scalar.ph:
3515   //     s_init = a[-1]
3516   //     br scalar.body
3517   //
3518   //   scalar.body:
3519   //     i = phi [0, scalar.ph], [i+1, scalar.body]
3520   //     s1 = phi [s_init, scalar.ph], [s2, scalar.body]
3521   //     s2 = a[i]
3522   //     b[i] = s2 - s1
3523   //     br cond, scalar.body, ...
3524   //
3525   // In this example, s1 is a recurrence because its value depends on the
3526   // previous iteration. In the first phase of vectorization, we created a
3527   // temporary value for s1. We now complete the vectorization and produce the
3528   // shorthand vector IR shown below (for VF = 4, UF = 1).
3529   //
3530   //   vector.ph:
3531   //     v_init = vector(..., ..., ..., a[-1])
3532   //     br vector.body
3533   //
3534   //   vector.body
3535   //     i = phi [0, vector.ph], [i+4, vector.body]
3536   //     v1 = phi [v_init, vector.ph], [v2, vector.body]
3537   //     v2 = a[i, i+1, i+2, i+3];
3538   //     v3 = vector(v1(3), v2(0, 1, 2))
3539   //     b[i, i+1, i+2, i+3] = v2 - v3
3540   //     br cond, vector.body, middle.block
3541   //
3542   //   middle.block:
3543   //     x = v2(3)
3544   //     br scalar.ph
3545   //
3546   //   scalar.ph:
3547   //     s_init = phi [x, middle.block], [a[-1], otherwise]
3548   //     br scalar.body
3549   //
3550   // After the vector loop finishes executing, we extract the next value of
3551   // the recurrence (x) to use as the initial value in the scalar loop.
3552 
3553   // Get the original loop preheader and single loop latch.
3554   auto *Preheader = OrigLoop->getLoopPreheader();
3555   auto *Latch = OrigLoop->getLoopLatch();
3556 
3557   // Get the initial and previous values of the scalar recurrence.
3558   auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
3559   auto *Previous = Phi->getIncomingValueForBlock(Latch);
3560 
3561   // Create a vector from the initial value.
3562   auto *VectorInit = ScalarInit;
3563   if (VF > 1) {
3564     Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3565     VectorInit = Builder.CreateInsertElement(
3566         UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
3567         Builder.getInt32(VF - 1), "vector.recur.init");
3568   }
3569 
3570   // We constructed a temporary phi node in the first phase of vectorization.
3571   // This phi node will eventually be deleted.
3572   Builder.SetInsertPoint(
3573       cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
3574 
3575   // Create a phi node for the new recurrence. The current value will either be
3576   // the initial value inserted into a vector or loop-varying vector value.
3577   auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
3578   VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
3579 
3580   // Get the vectorized previous value of the last part UF - 1. It appears last
3581   // among all unrolled iterations, due to the order of their construction.
3582   Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
3583 
3584   // Find and set the insertion point after the previous value if it is an
3585   // instruction.
3586   BasicBlock::iterator InsertPt;
3587   // Note that the previous value may have been constant-folded so it is not
3588   // guaranteed to be an instruction in the vector loop.
3589   // FIXME: Loop invariant values do not form recurrences. We should deal with
3590   //        them earlier.
3591   if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart))
3592     InsertPt = LoopVectorBody->getFirstInsertionPt();
3593   else {
3594     Instruction *PreviousInst = cast<Instruction>(PreviousLastPart);
3595     if (isa<PHINode>(PreviousLastPart))
3596       // If the previous value is a phi node, we should insert after all the phi
3597       // nodes in the block containing the PHI to avoid breaking basic block
3598       // verification. Note that the basic block may be different to
3599       // LoopVectorBody, in case we predicate the loop.
3600       InsertPt = PreviousInst->getParent()->getFirstInsertionPt();
3601     else
3602       InsertPt = ++PreviousInst->getIterator();
3603   }
3604   Builder.SetInsertPoint(&*InsertPt);
3605 
3606   // We will construct a vector for the recurrence by combining the values for
3607   // the current and previous iterations. This is the required shuffle mask.
3608   SmallVector<Constant *, 8> ShuffleMask(VF);
3609   ShuffleMask[0] = Builder.getInt32(VF - 1);
3610   for (unsigned I = 1; I < VF; ++I)
3611     ShuffleMask[I] = Builder.getInt32(I + VF - 1);
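  // Illustrative note (added sketch, assuming VF = 4): the mask is
  // <3, 4, 5, 6>, i.e. the last element of the first input vector followed by
  // the first VF - 1 elements of the second input vector.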
3612 
3613   // The vector from which to take the initial value for the current iteration
3614   // (actual or unrolled). Initially, this is the vector phi node.
3615   Value *Incoming = VecPhi;
3616 
3617   // Shuffle the current and previous vector and update the vector parts.
3618   for (unsigned Part = 0; Part < UF; ++Part) {
3619     Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
3620     Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
3621     auto *Shuffle =
3622         VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart,
3623                                              ConstantVector::get(ShuffleMask))
3624                : Incoming;
3625     PhiPart->replaceAllUsesWith(Shuffle);
3626     cast<Instruction>(PhiPart)->eraseFromParent();
3627     VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
3628     Incoming = PreviousPart;
3629   }
3630 
3631   // Fix the latch value of the new recurrence in the vector loop.
3632   VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3633 
3634   // Extract the last vector element in the middle block. This will be the
3635   // initial value for the recurrence when jumping to the scalar loop.
3636   auto *ExtractForScalar = Incoming;
3637   if (VF > 1) {
3638     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3639     ExtractForScalar = Builder.CreateExtractElement(
3640         ExtractForScalar, Builder.getInt32(VF - 1), "vector.recur.extract");
3641   }
3642   // Extract the second-to-last element in the middle block if the
3643   // Phi is used outside the loop. We need to extract the phi itself
3644   // and not the last element (the phi update in the current iteration). This
3645   // will be the value when jumping to the exit block from the LoopMiddleBlock,
3646   // when the scalar loop is not run at all.
3647   Value *ExtractForPhiUsedOutsideLoop = nullptr;
3648   if (VF > 1)
3649     ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
3650         Incoming, Builder.getInt32(VF - 2), "vector.recur.extract.for.phi");
3651   // When the loop is unrolled without vectorizing, initialize
3652   // ExtractForPhiUsedOutsideLoop with the unrolled value just prior to
3653   // `Incoming`. This is analogous to the vectorized case above: extracting the
3654   // second-to-last element when VF > 1.
3655   else if (UF > 1)
3656     ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
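  // Illustrative note (added sketch, assuming VF = 4): ExtractForScalar is
  // element 3 of the last part, the most recent value of the recurrence, which
  // seeds the scalar loop's phi, while ExtractForPhiUsedOutsideLoop is element
  // 2, the value the phi itself held in the final vector iteration, which is
  // what LCSSA users outside the loop expect.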
3657 
3658   // Fix the initial value of the original recurrence in the scalar loop.
3659   Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
3660   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
3661   for (auto *BB : predecessors(LoopScalarPreHeader)) {
3662     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
3663     Start->addIncoming(Incoming, BB);
3664   }
3665 
3666   Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
3667   Phi->setName("scalar.recur");
3668 
3669   // Finally, fix users of the recurrence outside the loop. The users will need
3670   // either the last value of the scalar recurrence or the last value of the
3671   // vector recurrence we extracted in the middle block. Since the loop is in
3672   // LCSSA form, we just need to find all the phi nodes for the original scalar
3673   // recurrence in the exit block, and then add an edge for the middle block.
3674   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3675     if (LCSSAPhi.getIncomingValue(0) == Phi) {
3676       LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
3677     }
3678   }
3679 }
3680 
3681 void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
3682   Constant *Zero = Builder.getInt32(0);
3683 
3684   // Get its reduction variable descriptor.
3685   assert(Legal->isReductionVariable(Phi) &&
3686          "Unable to find the reduction variable");
3687   RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[Phi];
3688 
3689   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3690   TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
3691   Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
3692   RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
3693     RdxDesc.getMinMaxRecurrenceKind();
3694   setDebugLocFromInst(Builder, ReductionStartValue);
3695 
3696   // We need to generate a reduction vector from the incoming scalar.
3697   // To do so, we need to generate the 'identity' vector and override
3698   // one of the elements with the incoming scalar reduction. We need
3699   // to do it in the vector-loop preheader.
3700   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
3701 
3702   // This is the vector-clone of the value that leaves the loop.
3703   Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
3704 
3705   // Find the reduction identity value: zero for addition, or, and xor;
3706   // one for multiplication; -1 for and.
3707   Value *Identity;
3708   Value *VectorStart;
3709   if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
3710       RK == RecurrenceDescriptor::RK_FloatMinMax) {
3711     // MinMax reductions have the start value as their identity.
3712     if (VF == 1) {
3713       VectorStart = Identity = ReductionStartValue;
3714     } else {
3715       VectorStart = Identity =
3716         Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
3717     }
3718   } else {
3719     // Handle other reduction kinds:
3720     Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
3721         RK, VecTy->getScalarType());
3722     if (VF == 1) {
3723       Identity = Iden;
3724       // This vector is the Identity vector where the first element is the
3725       // incoming scalar reduction.
3726       VectorStart = ReductionStartValue;
3727     } else {
3728       Identity = ConstantVector::getSplat(VF, Iden);
3729 
3730       // This vector is the Identity vector where the first element is the
3731       // incoming scalar reduction.
3732       VectorStart =
3733         Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
3734     }
3735   }
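  // Illustrative note (added sketch, assuming an integer add reduction with
  // VF = 4 and start value %init): Identity is <0, 0, 0, 0> and VectorStart is
  // <%init, 0, 0, 0>, so the start value is folded into the reduction exactly
  // once (only the part-0 phi is seeded with VectorStart below).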
3736 
3737   // Wrap flags are in general invalid after vectorization, clear them.
3738   clearReductionWrapFlags(RdxDesc);
3739 
3740   // Fix the vector-loop phi.
3741 
3742   // Reductions do not have to start at zero. They can start with
3743   // any loop invariant values.
3744   BasicBlock *Latch = OrigLoop->getLoopLatch();
3745   Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
3746 
3747   for (unsigned Part = 0; Part < UF; ++Part) {
3748     Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
3749     Value *Val = getOrCreateVectorValue(LoopVal, Part);
3750     // Make sure to add the reduction start value only to the
3751     // first unroll part.
3752     Value *StartVal = (Part == 0) ? VectorStart : Identity;
3753     cast<PHINode>(VecRdxPhi)->addIncoming(StartVal, LoopVectorPreHeader);
3754     cast<PHINode>(VecRdxPhi)
3755       ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
3756   }
3757 
3758   // Before each round, move the insertion point right between
3759   // the PHIs and the values we are going to write.
3760   // This allows us to write both PHINodes and the extractelement
3761   // instructions.
3762   Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3763 
3764   setDebugLocFromInst(Builder, LoopExitInst);
3765 
3766   // If tail is folded by masking, the vector value to leave the loop should be
3767   // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
3768   // instead of the former.
3769   if (Cost->foldTailByMasking()) {
3770     for (unsigned Part = 0; Part < UF; ++Part) {
3771       Value *VecLoopExitInst =
3772           VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3773       Value *Sel = nullptr;
3774       for (User *U : VecLoopExitInst->users()) {
3775         if (isa<SelectInst>(U)) {
3776           assert(!Sel && "Reduction exit feeding two selects");
3777           Sel = U;
3778         } else
3779           assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
3780       }
3781       assert(Sel && "Reduction exit feeds no select");
3782       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);
3783     }
3784   }
3785 
3786   // If the vector reduction can be performed in a smaller type, we truncate
3787   // then extend the loop exit value to enable InstCombine to evaluate the
3788   // entire expression in the smaller type.
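  // Illustrative note (added sketch): if an i32 reduction was found to fit in
  // i8, each unrolled part of the exit value is truncated to <VF x i8> and
  // then sign- or zero-extended back, giving InstCombine the hint it needs to
  // narrow the whole reduction chain.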
3789   if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
3790     Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
3791     Builder.SetInsertPoint(
3792         LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
3793     VectorParts RdxParts(UF);
3794     for (unsigned Part = 0; Part < UF; ++Part) {
3795       RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3796       Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3797       Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
3798                                         : Builder.CreateZExt(Trunc, VecTy);
3799       for (Value::user_iterator UI = RdxParts[Part]->user_begin();
3800            UI != RdxParts[Part]->user_end();)
3801         if (*UI != Trunc) {
3802           (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
3803           RdxParts[Part] = Extnd;
3804         } else {
3805           ++UI;
3806         }
3807     }
3808     Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
3809     for (unsigned Part = 0; Part < UF; ++Part) {
3810       RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
3811       VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
3812     }
3813   }
3814 
3815   // Reduce all of the unrolled parts into a single vector.
3816   Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
3817   unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
3818 
3819   // The middle block terminator has already been assigned a DebugLoc here (the
3820   // OrigLoop's single latch terminator). We want the whole middle block to
3821   // appear to execute on this line because: (a) it is all compiler generated,
3822   // (b) these instructions are always executed after evaluating the latch
3823   // conditional branch, and (c) other passes may add new predecessors which
3824   // terminate on this line. This is the easiest way to ensure we don't
3825   // accidentally cause an extra step back into the loop while debugging.
3826   setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
3827   for (unsigned Part = 1; Part < UF; ++Part) {
3828     Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
3829     if (Op != Instruction::ICmp && Op != Instruction::FCmp)
3830       // Floating point operations had to be 'fast' to enable the reduction.
3831       ReducedPartRdx = addFastMathFlag(
3832           Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
3833                               ReducedPartRdx, "bin.rdx"),
3834           RdxDesc.getFastMathFlags());
3835     else
3836       ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
3837                                       RdxPart);
3838   }
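  // Illustrative note (added sketch, assuming UF = 2 and an integer add
  // reduction): the loop above combines the two part values with a single
  // 'bin.rdx' add; min/max reductions are instead combined with a
  // compare-and-select via createMinMaxOp.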
3839 
3840   if (VF > 1) {
3841     bool NoNaN = Legal->hasFunNoNaNAttr();
3842     ReducedPartRdx =
3843         createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN);
3844     // If the reduction can be performed in a smaller type, we need to extend
3845     // the reduction to the wider type before we branch to the original loop.
3846     if (Phi->getType() != RdxDesc.getRecurrenceType())
3847       ReducedPartRdx =
3848         RdxDesc.isSigned()
3849         ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
3850         : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
3851   }
3852 
3853   // Create a phi node that merges control-flow from the backedge-taken check
3854   // block and the middle block.
3855   PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
3856                                         LoopScalarPreHeader->getTerminator());
3857   for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
3858     BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
3859   BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
3860 
3861   // Now, we need to fix the users of the reduction variable
3862   // inside and outside of the scalar remainder loop.
3863   // We know that the loop is in LCSSA form. We need to update the
3864   // PHI nodes in the exit blocks.
3865   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3866     // All PHINodes need to have a single entry edge, or two if
3867     // we already fixed them.
3868     assert(LCSSAPhi.getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
3869 
3870     // We found a reduction value exit-PHI. Update it with the
3871     // incoming bypass edge.
3872     if (LCSSAPhi.getIncomingValue(0) == LoopExitInst)
3873       LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
3874   } // end of the LCSSA phi scan.
3875 
3876     // Fix the scalar loop reduction variable with the incoming reduction sum
3877     // from the vector body and from the backedge value.
3878   int IncomingEdgeBlockIdx =
3879     Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
3880   assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
3881   // Pick the other block.
3882   int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
3883   Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
3884   Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
3885 }
3886 
3887 void InnerLoopVectorizer::clearReductionWrapFlags(
3888     RecurrenceDescriptor &RdxDesc) {
3889   RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
3890   if (RK != RecurrenceDescriptor::RK_IntegerAdd &&
3891       RK != RecurrenceDescriptor::RK_IntegerMult)
3892     return;
3893 
3894   Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
3895   assert(LoopExitInstr && "null loop exit instruction");
3896   SmallVector<Instruction *, 8> Worklist;
3897   SmallPtrSet<Instruction *, 8> Visited;
3898   Worklist.push_back(LoopExitInstr);
3899   Visited.insert(LoopExitInstr);
3900 
3901   while (!Worklist.empty()) {
3902     Instruction *Cur = Worklist.pop_back_val();
3903     if (isa<OverflowingBinaryOperator>(Cur))
3904       for (unsigned Part = 0; Part < UF; ++Part) {
3905         Value *V = getOrCreateVectorValue(Cur, Part);
3906         cast<Instruction>(V)->dropPoisonGeneratingFlags();
3907       }
3908 
3909     for (User *U : Cur->users()) {
3910       Instruction *UI = cast<Instruction>(U);
3911       if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
3912           Visited.insert(UI).second)
3913         Worklist.push_back(UI);
3914     }
3915   }
3916 }
3917 
3918 void InnerLoopVectorizer::fixLCSSAPHIs() {
3919   for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
3920     if (LCSSAPhi.getNumIncomingValues() == 1) {
3921       auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
3922       // Non-instruction incoming values have a single copy; use lane zero.
3923       unsigned LastLane = 0;
3924       if (isa<Instruction>(IncomingValue))
3925           LastLane = Cost->isUniformAfterVectorization(
3926                          cast<Instruction>(IncomingValue), VF)
3927                          ? 0
3928                          : VF - 1;
3929       // Can be a loop invariant incoming value or the last scalar value to be
3930       // extracted from the vectorized loop.
3931       Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
3932       Value *lastIncomingValue =
3933           getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
3934       LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
3935     }
3936   }
3937 }
3938 
3939 void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
3940   // The basic block and loop containing the predicated instruction.
3941   auto *PredBB = PredInst->getParent();
3942   auto *VectorLoop = LI->getLoopFor(PredBB);
3943 
3944   // Initialize a worklist with the operands of the predicated instruction.
3945   SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
3946 
3947   // Holds instructions that we need to analyze again. An instruction may be
3948   // reanalyzed if we don't yet know if we can sink it or not.
3949   SmallVector<Instruction *, 8> InstsToReanalyze;
3950 
3951   // Returns true if a given use occurs in the predicated block. Phi nodes use
3952   // their operands in their corresponding predecessor blocks.
3953   auto isBlockOfUsePredicated = [&](Use &U) -> bool {
3954     auto *I = cast<Instruction>(U.getUser());
3955     BasicBlock *BB = I->getParent();
3956     if (auto *Phi = dyn_cast<PHINode>(I))
3957       BB = Phi->getIncomingBlock(
3958           PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
3959     return BB == PredBB;
3960   };
3961 
3962   // Iteratively sink the scalarized operands of the predicated instruction
3963   // into the block we created for it. When an instruction is sunk, its
3964   // operands are then added to the worklist. The algorithm ends once a full
3965   // pass through the worklist sinks no instruction.
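  // Illustrative note (added sketch): if the predicated block contains a
  // scalarized store whose address is computed by a GEP used only by that
  // store, the GEP is moved into the block and its own operands become
  // sinking candidates in turn.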
3966   bool Changed;
3967   do {
3968     // Add the instructions that need to be reanalyzed to the worklist, and
3969     // reset the changed indicator.
3970     Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
3971     InstsToReanalyze.clear();
3972     Changed = false;
3973 
3974     while (!Worklist.empty()) {
3975       auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
3976 
3977       // We can't sink an instruction if it is a phi node, is already in the
3978       // predicated block, is not in the loop, or may have side effects.
3979       if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
3980           !VectorLoop->contains(I) || I->mayHaveSideEffects())
3981         continue;
3982 
3983       // It's legal to sink the instruction if all its uses occur in the
3984       // predicated block. Otherwise, there's nothing to do yet, and we may
3985       // need to reanalyze the instruction.
3986       if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
3987         InstsToReanalyze.push_back(I);
3988         continue;
3989       }
3990 
3991       // Move the instruction to the beginning of the predicated block, and add
3992       // its operands to the worklist.
3993       I->moveBefore(&*PredBB->getFirstInsertionPt());
3994       Worklist.insert(I->op_begin(), I->op_end());
3995 
3996       // The sinking may have enabled other instructions to be sunk, so we will
3997       // need to iterate.
3998       Changed = true;
3999     }
4000   } while (Changed);
4001 }
4002 
4003 void InnerLoopVectorizer::fixNonInductionPHIs() {
4004   for (PHINode *OrigPhi : OrigPHIsToFix) {
4005     PHINode *NewPhi =
4006         cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
4007     unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
4008 
4009     SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
4010         predecessors(OrigPhi->getParent()));
4011     SmallVector<BasicBlock *, 2> VectorBBPredecessors(
4012         predecessors(NewPhi->getParent()));
4013     assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
4014            "Scalar and Vector BB should have the same number of predecessors");
4015 
4016     // The insertion point in Builder may be invalidated by the time we get
4017     // here. Force the Builder insertion point to something valid so that we do
4018     // not run into issues during insertion point restore in
4019     // getOrCreateVectorValue calls below.
4020     Builder.SetInsertPoint(NewPhi);
4021 
4022     // The predecessor order is preserved and we can rely on mapping between
4023     // scalar and vector block predecessors.
4024     for (unsigned i = 0; i < NumIncomingValues; ++i) {
4025       BasicBlock *NewPredBB = VectorBBPredecessors[i];
4026 
4027       // When looking up the new scalar/vector values to fix up, use incoming
4028       // values from original phi.
4029       Value *ScIncV =
4030           OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
4031 
4032       // The scalar incoming value may need to be broadcast.
4033       Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
4034       NewPhi->addIncoming(NewIncV, NewPredBB);
4035     }
4036   }
4037 }
4038 
4039 void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, unsigned UF,
4040                                    unsigned VF, bool IsPtrLoopInvariant,
4041                                    SmallBitVector &IsIndexLoopInvariant) {
4042   // Construct a vector GEP by widening the operands of the scalar GEP as
4043   // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
4044   // results in a vector of pointers when at least one operand of the GEP
4045   // is vector-typed. Thus, to keep the representation compact, we only use
4046   // vector-typed operands for loop-varying values.
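  // Illustrative note (added sketch, assuming VF = 4): a GEP such as
  // "getelementptr %A, %i" with a loop-invariant base %A and loop-varying
  // index %i becomes a single GEP whose index operand is the widened
  // 4-element vector of %i, yielding a vector of four pointers.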
4047 
4048   if (VF > 1 && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
4049     // If we are vectorizing, but the GEP has only loop-invariant operands,
4050     // the GEP we build (by only using vector-typed operands for
4051     // loop-varying values) would be a scalar pointer. Thus, to ensure we
4052     // produce a vector of pointers, we need to either arbitrarily pick an
4053     // operand to broadcast, or broadcast a clone of the original GEP.
4054     // Here, we broadcast a clone of the original.
4055     //
4056     // TODO: If at some point we decide to scalarize instructions having
4057     //       loop-invariant operands, this special case will no longer be
4058     //       required. We would add the scalarization decision to
4059     //       collectLoopScalars() and teach getVectorValue() to broadcast
4060     //       the lane-zero scalar value.
4061     auto *Clone = Builder.Insert(GEP->clone());
4062     for (unsigned Part = 0; Part < UF; ++Part) {
4063       Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
4064       VectorLoopValueMap.setVectorValue(GEP, Part, EntryPart);
4065       addMetadata(EntryPart, GEP);
4066     }
4067   } else {
4068     // If the GEP has at least one loop-varying operand, we are sure to
4069     // produce a vector of pointers. But if we are only unrolling, we want
4070     // to produce a scalar GEP for each unroll part. Thus, the GEP we
4071     // produce with the code below will be scalar (if VF == 1) or vector
4072     // (otherwise). Note that for the unroll-only case, we still maintain
4073     // values in the vector mapping with initVector, as we do for other
4074     // instructions.
4075     for (unsigned Part = 0; Part < UF; ++Part) {
4076       // The pointer operand of the new GEP. If it's loop-invariant, we
4077       // won't broadcast it.
4078       auto *Ptr = IsPtrLoopInvariant
4079                       ? GEP->getPointerOperand()
4080                       : getOrCreateVectorValue(GEP->getPointerOperand(), Part);
4081 
4082       // Collect all the indices for the new GEP. If any index is
4083       // loop-invariant, we won't broadcast it.
4084       SmallVector<Value *, 4> Indices;
4085       for (auto Index : enumerate(GEP->indices())) {
4086         Value *User = Index.value().get();
4087         if (IsIndexLoopInvariant[Index.index()])
4088           Indices.push_back(User);
4089         else
4090           Indices.push_back(getOrCreateVectorValue(User, Part));
4091       }
4092 
4093       // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
4094       // but it should be a vector, otherwise.
4095       auto *NewGEP =
4096           GEP->isInBounds()
4097               ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
4098                                           Indices)
4099               : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
4100       assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
4101              "NewGEP is not a pointer vector");
4102       VectorLoopValueMap.setVectorValue(GEP, Part, NewGEP);
4103       addMetadata(NewGEP, GEP);
4104     }
4105   }
4106 }
4107 
4108 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
4109                                               unsigned VF) {
4110   PHINode *P = cast<PHINode>(PN);
4111   if (EnableVPlanNativePath) {
4112     // Currently we enter here in the VPlan-native path for non-induction
4113     // PHIs where all control flow is uniform. We simply widen these PHIs.
4114     // Create a vector phi with no operands - the vector phi operands will be
4115     // set at the end of vector code generation.
4116     Type *VecTy =
4117         (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
4118     Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
4119     VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
4120     OrigPHIsToFix.push_back(P);
4121 
4122     return;
4123   }
4124 
4125   assert(PN->getParent() == OrigLoop->getHeader() &&
4126          "Non-header phis should have been handled elsewhere");
4127 
4128   // In order to support recurrences we need to be able to vectorize Phi nodes.
4129   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
4130   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
4131   // this value when we vectorize all of the instructions that use the PHI.
4132   if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
4133     for (unsigned Part = 0; Part < UF; ++Part) {
4134       // This is phase one of vectorizing PHIs.
4135       Type *VecTy =
4136           (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
4137       Value *EntryPart = PHINode::Create(
4138           VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
4139       VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
4140     }
4141     return;
4142   }
4143 
4144   setDebugLocFromInst(Builder, P);
4145 
4146   // This PHINode must be an induction variable.
4147   // Make sure that we know about it.
4148   assert(Legal->getInductionVars()->count(P) && "Not an induction variable");
4149 
4150   InductionDescriptor II = Legal->getInductionVars()->lookup(P);
4151   const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
4152 
4153   // FIXME: The newly created binary instructions should contain nsw/nuw flags,
4154   // which can be found from the original scalar operations.
4155   switch (II.getKind()) {
4156   case InductionDescriptor::IK_NoInduction:
4157     llvm_unreachable("Unknown induction");
4158   case InductionDescriptor::IK_IntInduction:
4159   case InductionDescriptor::IK_FpInduction:
4160     llvm_unreachable("Integer/fp induction is handled elsewhere.");
4161   case InductionDescriptor::IK_PtrInduction: {
4162     // Handle the pointer induction variable case.
4163     assert(P->getType()->isPointerTy() && "Unexpected type.");
4164     // This is the normalized GEP that starts counting at zero.
4165     Value *PtrInd = Induction;
4166     PtrInd = Builder.CreateSExtOrTrunc(PtrInd, II.getStep()->getType());
4167     // Determine the number of scalars we need to generate for each unroll
4168     // iteration. If the instruction is uniform, we only need to generate the
4169     // first lane. Otherwise, we generate all VF values.
4170     unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF;
4171     // These are the scalar results. Notice that we don't generate vector GEPs
4172     // because scalar GEPs result in better code.
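    // Illustrative note (added sketch, assuming VF = 4, UF = 2 and a
    // non-uniform pointer induction): eight scalar "next.gep" values are
    // emitted below, one per normalized index PtrInd + 0 .. PtrInd + 7, each
    // mapped through emitTransformedIndex to apply the induction's start and
    // step.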
4173     for (unsigned Part = 0; Part < UF; ++Part) {
4174       for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
4175         Constant *Idx = ConstantInt::get(PtrInd->getType(), Lane + Part * VF);
4176         Value *GlobalIdx = Builder.CreateAdd(PtrInd, Idx);
4177         Value *SclrGep =
4178             emitTransformedIndex(Builder, GlobalIdx, PSE.getSE(), DL, II);
4179         SclrGep->setName("next.gep");
4180         VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
4181       }
4182     }
4183     return;
4184   }
4185   }
4186 }
4187 
4188 /// A helper function for checking whether an integer division-related
4189 /// instruction may divide by zero (in which case it must be predicated if
4190 /// executed conditionally in the scalar code).
4191 /// TODO: It may be worthwhile to generalize and check isKnownNonZero().
4192 /// Non-zero divisors that are not compile-time constants will not be
4193 /// converted into multiplication, so we will still end up scalarizing
4194 /// the division, but can do so without predication.
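/// For example (added illustration): a divisor that is a non-zero constant,
/// as in "x / 7", never needs predication, whereas "x / n" with an unknown n
/// makes this function return true, so the division must be predicated when
/// executed conditionally.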
4195 static bool mayDivideByZero(Instruction &I) {
4196   assert((I.getOpcode() == Instruction::UDiv ||
4197           I.getOpcode() == Instruction::SDiv ||
4198           I.getOpcode() == Instruction::URem ||
4199           I.getOpcode() == Instruction::SRem) &&
4200          "Unexpected instruction");
4201   Value *Divisor = I.getOperand(1);
4202   auto *CInt = dyn_cast<ConstantInt>(Divisor);
4203   return !CInt || CInt->isZero();
4204 }
4205 
4206 void InnerLoopVectorizer::widenInstruction(Instruction &I) {
4207   switch (I.getOpcode()) {
4208   case Instruction::Br:
4209   case Instruction::PHI:
4210   case Instruction::GetElementPtr:
4211     llvm_unreachable("This instruction is handled by a different recipe.");
4212   case Instruction::UDiv:
4213   case Instruction::SDiv:
4214   case Instruction::SRem:
4215   case Instruction::URem:
4216   case Instruction::Add:
4217   case Instruction::FAdd:
4218   case Instruction::Sub:
4219   case Instruction::FSub:
4220   case Instruction::FNeg:
4221   case Instruction::Mul:
4222   case Instruction::FMul:
4223   case Instruction::FDiv:
4224   case Instruction::FRem:
4225   case Instruction::Shl:
4226   case Instruction::LShr:
4227   case Instruction::AShr:
4228   case Instruction::And:
4229   case Instruction::Or:
4230   case Instruction::Xor: {
4231     // Just widen unops and binops.
4232     setDebugLocFromInst(Builder, &I);
4233 
4234     for (unsigned Part = 0; Part < UF; ++Part) {
4235       SmallVector<Value *, 2> Ops;
4236       for (Value *Op : I.operands())
4237         Ops.push_back(getOrCreateVectorValue(Op, Part));
4238 
4239       Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
4240 
4241       if (auto *VecOp = dyn_cast<Instruction>(V))
4242         VecOp->copyIRFlags(&I);
4243 
4244       // Use this vector value for all users of the original instruction.
4245       VectorLoopValueMap.setVectorValue(&I, Part, V);
4246       addMetadata(V, &I);
4247     }
4248 
4249     break;
4250   }
4251   case Instruction::Select: {
4252     // Widen selects.
4253     // If the selector is loop invariant we can create a select
4254     // instruction with a scalar condition. Otherwise, use vector-select.
4255     auto *SE = PSE.getSE();
4256     bool InvariantCond =
4257         SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop);
4258     setDebugLocFromInst(Builder, &I);
4259 
4260     // The condition can be loop invariant but still defined inside the
4261     // loop. This means that we can't just use the original 'cond' value.
4262     // We have to take the 'vectorized' value and pick the first lane.
4263     // Instcombine will make this a no-op.
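    // Illustrative note (added sketch, assuming VF = 4): with an invariant
    // condition we emit "select i1 %cond, <4 x i32> %a, <4 x i32> %b";
    // otherwise the condition itself is widened and the select is made on a
    // <4 x i1> condition.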
4264 
4265     auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0});
4266 
4267     for (unsigned Part = 0; Part < UF; ++Part) {
4268       Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part);
4269       Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part);
4270       Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part);
4271       Value *Sel =
4272           Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1);
4273       VectorLoopValueMap.setVectorValue(&I, Part, Sel);
4274       addMetadata(Sel, &I);
4275     }
4276 
4277     break;
4278   }
4279 
4280   case Instruction::ICmp:
4281   case Instruction::FCmp: {
4282     // Widen compares. Generate vector compares.
4283     bool FCmp = (I.getOpcode() == Instruction::FCmp);
4284     auto *Cmp = cast<CmpInst>(&I);
4285     setDebugLocFromInst(Builder, Cmp);
4286     for (unsigned Part = 0; Part < UF; ++Part) {
4287       Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part);
4288       Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part);
4289       Value *C = nullptr;
4290       if (FCmp) {
4291         // Propagate fast math flags.
4292         IRBuilder<>::FastMathFlagGuard FMFG(Builder);
4293         Builder.setFastMathFlags(Cmp->getFastMathFlags());
4294         C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
4295       } else {
4296         C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
4297       }
4298       VectorLoopValueMap.setVectorValue(&I, Part, C);
4299       addMetadata(C, &I);
4300     }
4301 
4302     break;
4303   }
4304 
4305   case Instruction::ZExt:
4306   case Instruction::SExt:
4307   case Instruction::FPToUI:
4308   case Instruction::FPToSI:
4309   case Instruction::FPExt:
4310   case Instruction::PtrToInt:
4311   case Instruction::IntToPtr:
4312   case Instruction::SIToFP:
4313   case Instruction::UIToFP:
4314   case Instruction::Trunc:
4315   case Instruction::FPTrunc:
4316   case Instruction::BitCast: {
4317     auto *CI = cast<CastInst>(&I);
4318     setDebugLocFromInst(Builder, CI);
4319 
4320     /// Vectorize casts.
4321     Type *DestTy =
4322         (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF);
4323 
4324     for (unsigned Part = 0; Part < UF; ++Part) {
4325       Value *A = getOrCreateVectorValue(CI->getOperand(0), Part);
4326       Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
4327       VectorLoopValueMap.setVectorValue(&I, Part, Cast);
4328       addMetadata(Cast, &I);
4329     }
4330     break;
4331   }
4332 
4333   case Instruction::Call: {
4334     // Ignore dbg intrinsics.
4335     if (isa<DbgInfoIntrinsic>(I))
4336       break;
4337     setDebugLocFromInst(Builder, &I);
4338 
4339     Module *M = I.getParent()->getParent()->getParent();
4340     auto *CI = cast<CallInst>(&I);
4341 
4342     StringRef FnName = CI->getCalledFunction()->getName();
4343     Function *F = CI->getCalledFunction();
4344     Type *RetTy = ToVectorTy(CI->getType(), VF);
4345     SmallVector<Type *, 4> Tys;
4346     for (Value *ArgOperand : CI->arg_operands())
4347       Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
4348 
4349     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
4350 
4351     // This flag shows whether we use an intrinsic or an ordinary call for the
4352     // vectorized version of the instruction, i.e. whether the intrinsic call
4353     // is cheaper than the library call.
4354     bool NeedToScalarize;
4355     unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
4356     bool UseVectorIntrinsic =
4357         ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
4358     assert((UseVectorIntrinsic || !NeedToScalarize) &&
4359            "Instruction should be scalarized elsewhere.");
4360 
4361     for (unsigned Part = 0; Part < UF; ++Part) {
4362       SmallVector<Value *, 4> Args;
4363       for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
4364         Value *Arg = CI->getArgOperand(i);
4365         // Some intrinsics have a scalar argument - don't replace it with a
4366         // vector.
4367         if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i))
4368           Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part);
4369         Args.push_back(Arg);
4370       }
4371 
4372       Function *VectorF;
4373       if (UseVectorIntrinsic) {
4374         // Use vector version of the intrinsic.
4375         Type *TysForDecl[] = {CI->getType()};
4376         if (VF > 1)
4377           TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
4378         VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
4379       } else {
4380         // Use vector version of the library call.
4381         StringRef VFnName = TLI->getVectorizedFunction(FnName, VF);
4382         assert(!VFnName.empty() && "Vector function name is empty.");
4383         VectorF = M->getFunction(VFnName);
4384         if (!VectorF) {
4385           // Generate a declaration
4386           FunctionType *FTy = FunctionType::get(RetTy, Tys, false);
4387           VectorF =
4388               Function::Create(FTy, Function::ExternalLinkage, VFnName, M);
4389           VectorF->copyAttributesFrom(F);
4390         }
4391       }
4392       assert(VectorF && "Can't create vector function.");
4393 
4394       SmallVector<OperandBundleDef, 1> OpBundles;
4395       CI->getOperandBundlesAsDefs(OpBundles);
4396       CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
4397 
4398       if (isa<FPMathOperator>(V))
4399         V->copyFastMathFlags(CI);
4400 
4401       VectorLoopValueMap.setVectorValue(&I, Part, V);
4402       addMetadata(V, &I);
4403     }
4404 
4405     break;
4406   }
4407 
4408   default:
4409     // This instruction is not vectorized by simple widening.
4410     LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
4411     llvm_unreachable("Unhandled instruction!");
4412   } // end of switch.
4413 }
4414 
4415 void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
4416   // We should not collect Scalars more than once per VF. Right now, this
4417   // function is called from collectUniformsAndScalars(), which already does
4418   // this check. Collecting Scalars for VF=1 does not make any sense.
4419   assert(VF >= 2 && Scalars.find(VF) == Scalars.end() &&
4420          "This function should not be visited twice for the same VF");
4421 
4422   SmallSetVector<Instruction *, 8> Worklist;
4423 
4424   // These sets are used to seed the analysis with pointers used by memory
4425   // accesses that will remain scalar.
4426   SmallSetVector<Instruction *, 8> ScalarPtrs;
4427   SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
4428 
4429   // A helper that returns true if the use of Ptr by MemAccess will be scalar.
4430   // The pointer operands of loads and stores will be scalar as long as the
4431   // memory access is not a gather or scatter operation. The value operand of a
4432   // store will remain scalar if the store is scalarized.
4433   auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
4434     InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
4435     assert(WideningDecision != CM_Unknown &&
4436            "Widening decision should be ready at this moment");
4437     if (auto *Store = dyn_cast<StoreInst>(MemAccess))
4438       if (Ptr == Store->getValueOperand())
4439         return WideningDecision == CM_Scalarize;
4440     assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
4441            "Ptr is neither a value or pointer operand");
4442     return WideningDecision != CM_GatherScatter;
4443   };
4444 
4445   // A helper that returns true if the given value is a bitcast or
4446   // getelementptr instruction contained in the loop.
4447   auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
4448     return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
4449             isa<GetElementPtrInst>(V)) &&
4450            !TheLoop->isLoopInvariant(V);
4451   };
4452 
4453   // A helper that evaluates a memory access's use of a pointer. If the use
4454   // will be a scalar use, and the pointer is only used by memory accesses, we
4455   // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in
4456   // PossibleNonScalarPtrs.
4457   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
4458     // We only care about bitcast and getelementptr instructions contained in
4459     // the loop.
4460     if (!isLoopVaryingBitCastOrGEP(Ptr))
4461       return;
4462 
4463     // If the pointer has already been identified as scalar (e.g., if it was
4464     // also identified as uniform), there's nothing to do.
4465     auto *I = cast<Instruction>(Ptr);
4466     if (Worklist.count(I))
4467       return;
4468 
4469     // If the use of the pointer will be a scalar use, and all users of the
4470     // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
4471     // place the pointer in PossibleNonScalarPtrs.
4472     if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
4473           return isa<LoadInst>(U) || isa<StoreInst>(U);
4474         }))
4475       ScalarPtrs.insert(I);
4476     else
4477       PossibleNonScalarPtrs.insert(I);
4478   };
4479 
4480   // We seed the scalars analysis with three classes of instructions: (1)
4481   // instructions marked uniform-after-vectorization, (2) bitcast and
4482   // getelementptr instructions used by memory accesses requiring a scalar use,
4483   // and (3) pointer induction variables and their update instructions (we
4484   // currently only scalarize these).
4485   //
4486   // (1) Add to the worklist all instructions that have been identified as
4487   // uniform-after-vectorization.
4488   Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
4489 
4490   // (2) Add to the worklist all bitcast and getelementptr instructions used by
4491   // memory accesses requiring a scalar use. The pointer operands of loads and
4492   // stores will be scalar as long as the memory access is not a gather or
4493   // scatter operation. The value operand of a store will remain scalar if the
4494   // store is scalarized.
4495   for (auto *BB : TheLoop->blocks())
4496     for (auto &I : *BB) {
4497       if (auto *Load = dyn_cast<LoadInst>(&I)) {
4498         evaluatePtrUse(Load, Load->getPointerOperand());
4499       } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
4500         evaluatePtrUse(Store, Store->getPointerOperand());
4501         evaluatePtrUse(Store, Store->getValueOperand());
4502       }
4503     }
4504   for (auto *I : ScalarPtrs)
4505     if (PossibleNonScalarPtrs.find(I) == PossibleNonScalarPtrs.end()) {
4506       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
4507       Worklist.insert(I);
4508     }
4509 
4510   // (3) Add to the worklist all pointer induction variables and their update
4511   // instructions.
4512   //
4513   // TODO: Once we are able to vectorize pointer induction variables we should
4514   //       no longer insert them into the worklist here.
4515   auto *Latch = TheLoop->getLoopLatch();
4516   for (auto &Induction : *Legal->getInductionVars()) {
4517     auto *Ind = Induction.first;
4518     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4519     if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction)
4520       continue;
4521     Worklist.insert(Ind);
4522     Worklist.insert(IndUpdate);
4523     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4524     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4525                       << "\n");
4526   }
4527 
4528   // Insert the forced scalars.
4529   // FIXME: Currently widenPHIInstruction() often creates a dead vector
4530   // induction variable when the PHI user is scalarized.
4531   auto ForcedScalar = ForcedScalars.find(VF);
4532   if (ForcedScalar != ForcedScalars.end())
4533     for (auto *I : ForcedScalar->second)
4534       Worklist.insert(I);
4535 
4536   // Expand the worklist by looking through any bitcasts and getelementptr
4537   // instructions we've already identified as scalar. This is similar to the
4538   // expansion step in collectLoopUniforms(); however, here we're only
4539   // expanding to include additional bitcasts and getelementptr instructions.
4540   unsigned Idx = 0;
4541   while (Idx != Worklist.size()) {
4542     Instruction *Dst = Worklist[Idx++];
4543     if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
4544       continue;
4545     auto *Src = cast<Instruction>(Dst->getOperand(0));
4546     if (llvm::all_of(Src->users(), [&](User *U) -> bool {
4547           auto *J = cast<Instruction>(U);
4548           return !TheLoop->contains(J) || Worklist.count(J) ||
4549                  ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
4550                   isScalarUse(J, Src));
4551         })) {
4552       Worklist.insert(Src);
4553       LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
4554     }
4555   }
4556 
4557   // An induction variable will remain scalar if all users of the induction
4558   // variable and induction variable update remain scalar.
4559   for (auto &Induction : *Legal->getInductionVars()) {
4560     auto *Ind = Induction.first;
4561     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4562 
4563     // We already considered pointer induction variables, so there's no reason
4564     // to look at their users again.
4565     //
4566     // TODO: Once we are able to vectorize pointer induction variables we
4567     //       should no longer skip over them here.
4568     if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction)
4569       continue;
4570 
4571     // Determine if all users of the induction variable are scalar after
4572     // vectorization.
4573     auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4574       auto *I = cast<Instruction>(U);
4575       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
4576     });
4577     if (!ScalarInd)
4578       continue;
4579 
4580     // Determine if all users of the induction variable update instruction are
4581     // scalar after vectorization.
4582     auto ScalarIndUpdate =
4583         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4584           auto *I = cast<Instruction>(U);
4585           return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
4586         });
4587     if (!ScalarIndUpdate)
4588       continue;
4589 
4590     // The induction variable and its update instruction will remain scalar.
4591     Worklist.insert(Ind);
4592     Worklist.insert(IndUpdate);
4593     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
4594     LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
4595                       << "\n");
4596   }
4597 
4598   Scalars[VF].insert(Worklist.begin(), Worklist.end());
4599 }
4600 
4601 bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigned VF) {
4602   if (!blockNeedsPredication(I->getParent()))
4603     return false;
4604   switch(I->getOpcode()) {
4605   default:
4606     break;
4607   case Instruction::Load:
4608   case Instruction::Store: {
4609     if (!Legal->isMaskRequired(I))
4610       return false;
4611     auto *Ptr = getLoadStorePointerOperand(I);
4612     auto *Ty = getMemInstValueType(I);
4613     // We have already decided how to vectorize this instruction, get that
4614     // result.
4615     if (VF > 1) {
4616       InstWidening WideningDecision = getWideningDecision(I, VF);
4617       assert(WideningDecision != CM_Unknown &&
4618              "Widening decision should be ready at this moment");
4619       return WideningDecision == CM_Scalarize;
4620     }
4621     const MaybeAlign Alignment = getLoadStoreAlignment(I);
4622     return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
4623                                 isLegalMaskedGather(Ty, Alignment))
4624                             : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
4625                                 isLegalMaskedScatter(Ty, Alignment));
4626   }
4627   case Instruction::UDiv:
4628   case Instruction::SDiv:
4629   case Instruction::SRem:
4630   case Instruction::URem:
4631     return mayDivideByZero(*I);
4632   }
4633   return false;
4634 }
4635 
4636 bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
4637                                                                unsigned VF) {
4638   assert(isAccessInterleaved(I) && "Expecting interleaved access.");
4639   assert(getWideningDecision(I, VF) == CM_Unknown &&
4640          "Decision should not be set yet.");
4641   auto *Group = getInterleavedAccessGroup(I);
4642   assert(Group && "Must have a group.");
4643 
4644   // If the instruction's allocated size doesn't equal its type size, it
4645   // requires padding and will be scalarized.
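  // Illustrative note (added example): a type such as x86_fp80 is 80 bits
  // wide but is allocated 96 or 128 bits depending on the target, so vectors
  // of it cannot be loaded or stored as a single contiguous block.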
4646   auto &DL = I->getModule()->getDataLayout();
4647   auto *ScalarTy = getMemInstValueType(I);
4648   if (hasIrregularType(ScalarTy, DL, VF))
4649     return false;
4650 
4651   // Check if masking is required.
4652   // A Group may need masking for one of two reasons: it resides in a block that
4653   // needs predication, or it was decided to use masking to deal with gaps.
4654   bool PredicatedAccessRequiresMasking =
4655       Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
4656   bool AccessWithGapsRequiresMasking =
4657       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
4658   if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
4659     return true;
4660 
4661   // If masked interleaving is required, we expect that the user/target had
4662   // enabled it, because otherwise it either wouldn't have been created or
4663   // it should have been invalidated by the CostModel.
4664   assert(useMaskedInterleavedAccesses(TTI) &&
4665          "Masked interleave-groups for predicated accesses are not enabled.");
4666 
4667   auto *Ty = getMemInstValueType(I);
4668   const MaybeAlign Alignment = getLoadStoreAlignment(I);
4669   return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
4670                           : TTI.isLegalMaskedStore(Ty, Alignment);
4671 }
4672 
4673 bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(Instruction *I,
4674                                                                unsigned VF) {
4675   // Get and ensure we have a valid memory instruction.
4676   LoadInst *LI = dyn_cast<LoadInst>(I);
4677   StoreInst *SI = dyn_cast<StoreInst>(I);
4678   assert((LI || SI) && "Invalid memory instruction");
4679 
4680   auto *Ptr = getLoadStorePointerOperand(I);
4681 
4682   // In order to be widened, the pointer should be consecutive, first of all.
4683   if (!Legal->isConsecutivePtr(Ptr))
4684     return false;
4685 
4686   // If the instruction is a store located in a predicated block, it will be
4687   // scalarized.
4688   if (isScalarWithPredication(I))
4689     return false;
4690 
4691   // If the instruction's allocated size doesn't equal its type size, it
4692   // requires padding and will be scalarized.
4693   auto &DL = I->getModule()->getDataLayout();
4694   auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
4695   if (hasIrregularType(ScalarTy, DL, VF))
4696     return false;
4697 
4698   return true;
4699 }
4700 
4701 void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
4702   // We should not collect Uniforms more than once per VF. Right now,
4703   // this function is called from collectUniformsAndScalars(), which
4704   // already does this check. Collecting Uniforms for VF=1 does not make any
4705   // sense.
4706 
4707   assert(VF >= 2 && Uniforms.find(VF) == Uniforms.end() &&
4708          "This function should not be visited twice for the same VF");
4709 
4710   // Visit the list of Uniforms. Even if we find no uniform value, we will not
4711   // analyze this VF again: Uniforms.count(VF) will return 1.
4712   Uniforms[VF].clear();
4713 
4714   // We now know that the loop is vectorizable!
4715   // Collect instructions inside the loop that will remain uniform after
4716   // vectorization.
4717 
4718   // Global values, params and instructions outside of current loop are out of
4719   // scope.
4720   auto isOutOfScope = [&](Value *V) -> bool {
4721     Instruction *I = dyn_cast<Instruction>(V);
4722     return (!I || !TheLoop->contains(I));
4723   };
4724 
4725   SetVector<Instruction *> Worklist;
4726   BasicBlock *Latch = TheLoop->getLoopLatch();
4727 
4728   // Instructions that are scalar with predication must not be considered
4729   // uniform after vectorization, because that would create an erroneous
4730   // replicating region where only a single instance out of VF should be formed.
4731   // TODO: optimize such seldom cases if found important, see PR40816.
4732   auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
4733     if (isScalarWithPredication(I, VF)) {
4734       LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
4735                         << *I << "\n");
4736       return;
4737     }
4738     LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
4739     Worklist.insert(I);
4740   };
4741 
4742   // Start with the conditional branch. If the branch condition is an
4743   // instruction contained in the loop that is only used by the branch, it is
4744   // uniform.
4745   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
4746   if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
4747     addToWorklistIfAllowed(Cmp);
4748 
4749   // Holds consecutive and consecutive-like pointers. Consecutive-like pointers
4750   // are pointers that are treated like consecutive pointers during
4751   // vectorization. The pointer operands of interleaved accesses are an
4752   // example.
4753   SmallSetVector<Instruction *, 8> ConsecutiveLikePtrs;
4754 
4755   // Holds pointer operands of instructions that are possibly non-uniform.
4756   SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;
4757 
4758   auto isUniformDecision = [&](Instruction *I, unsigned VF) {
4759     InstWidening WideningDecision = getWideningDecision(I, VF);
4760     assert(WideningDecision != CM_Unknown &&
4761            "Widening decision should be ready at this moment");
4762 
4763     return (WideningDecision == CM_Widen ||
4764             WideningDecision == CM_Widen_Reverse ||
4765             WideningDecision == CM_Interleave);
4766   };
4767   // Iterate over the instructions in the loop, and collect all
4768   // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
4769   // that a consecutive-like pointer operand will be scalarized, we collect it
4770   // in PossibleNonUniformPtrs instead. We use two sets here because a single
4771   // getelementptr instruction can be used by both vectorized and scalarized
4772   // memory instructions. For example, if a loop loads and stores from the same
4773   // location, but the store is conditional, the store will be scalarized, and
4774   // the getelementptr won't remain uniform.
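  // Illustrative sketch (hypothetical IR, shown only to make the case above
  // concrete):
  //   %gep = getelementptr i32, i32* %a, i64 %iv
  //   %v   = load i32, i32* %gep      ; consecutive access, will be widened
  //   store i32 %v, i32* %gep         ; executed under a condition, scalarized
  // When the load is visited, %gep is added to ConsecutiveLikePtrs; when the
  // predicated store is visited, %gep is added to PossibleNonUniformPtrs and
  // is therefore not treated as uniform below.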
4775   for (auto *BB : TheLoop->blocks())
4776     for (auto &I : *BB) {
4777       // If there's no pointer operand, there's nothing to do.
4778       auto *Ptr = dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
4779       if (!Ptr)
4780         continue;
4781 
4782       // True if all users of Ptr are memory accesses that have Ptr as their
4783       // pointer operand.
4784       auto UsersAreMemAccesses =
4785           llvm::all_of(Ptr->users(), [&](User *U) -> bool {
4786             return getLoadStorePointerOperand(U) == Ptr;
4787           });
4788 
4789       // Ensure the memory instruction will not be scalarized or used by
4790       // gather/scatter, making its pointer operand non-uniform. If the pointer
4791       // operand is used by any instruction other than a memory access, we
4792       // conservatively assume the pointer operand may be non-uniform.
4793       if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
4794         PossibleNonUniformPtrs.insert(Ptr);
4795 
4796       // If the memory instruction will be vectorized and its pointer operand
4797       // is consecutive-like, or interleaving - the pointer operand should
4798       // remain uniform.
4799       else
4800         ConsecutiveLikePtrs.insert(Ptr);
4801     }
4802 
4803   // Add to the Worklist all consecutive and consecutive-like pointers that
4804   // aren't also identified as possibly non-uniform.
4805   for (auto *V : ConsecutiveLikePtrs)
4806     if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end())
4807       addToWorklistIfAllowed(V);
4808 
4809   // Expand Worklist in topological order: whenever a new instruction
4810   // is added, its users should already be inside Worklist. This ensures that
4811   // a uniform instruction will only be used by uniform instructions.
4812   unsigned idx = 0;
4813   while (idx != Worklist.size()) {
4814     Instruction *I = Worklist[idx++];
4815 
4816     for (auto OV : I->operand_values()) {
4817       // isOutOfScope operands cannot be uniform instructions.
4818       if (isOutOfScope(OV))
4819         continue;
4820       // First order recurrence Phi's should typically be considered
4821       // non-uniform.
4822       auto *OP = dyn_cast<PHINode>(OV);
4823       if (OP && Legal->isFirstOrderRecurrence(OP))
4824         continue;
4825       // If all the users of the operand are uniform, then add the
4826       // operand into the uniform worklist.
4827       auto *OI = cast<Instruction>(OV);
4828       if (llvm::all_of(OI->users(), [&](User *U) -> bool {
4829             auto *J = cast<Instruction>(U);
4830             return Worklist.count(J) ||
4831                    (OI == getLoadStorePointerOperand(J) &&
4832                     isUniformDecision(J, VF));
4833           }))
4834         addToWorklistIfAllowed(OI);
4835     }
4836   }
4837 
4838   // Returns true if Ptr is the pointer operand of a memory access instruction
4839   // I, and I is known to not require scalarization.
4840   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
4841     return getLoadStorePointerOperand(I) == Ptr && isUniformDecision(I, VF);
4842   };
4843 
4844   // For an instruction to be added into Worklist above, all its users inside
4845   // the loop should also be in Worklist. However, this condition cannot be
4846   // true for phi nodes that form a cyclic dependence. We must process phi
4847   // nodes separately. An induction variable will remain uniform if all users
4848   // of the induction variable and induction variable update remain uniform.
4849   // The code below handles both pointer and non-pointer induction variables.
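  // Illustrative example (assumed source, not from this module): in a loop
  // like "for (i = 0; i < n; ++i) a[i] = b[i];" the induction variable i and
  // its increment feed only the latch compare and consecutive addresses, so
  // both remain uniform. If i were also stored element-wise (e.g. a[i] = i),
  // that use would need a distinct value per lane and i would not be uniform.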
4850   for (auto &Induction : *Legal->getInductionVars()) {
4851     auto *Ind = Induction.first;
4852     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
4853 
4854     // Determine if all users of the induction variable are uniform after
4855     // vectorization.
4856     auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
4857       auto *I = cast<Instruction>(U);
4858       return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
4859              isVectorizedMemAccessUse(I, Ind);
4860     });
4861     if (!UniformInd)
4862       continue;
4863 
4864     // Determine if all users of the induction variable update instruction are
4865     // uniform after vectorization.
4866     auto UniformIndUpdate =
4867         llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
4868           auto *I = cast<Instruction>(U);
4869           return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
4870                  isVectorizedMemAccessUse(I, IndUpdate);
4871         });
4872     if (!UniformIndUpdate)
4873       continue;
4874 
4875     // The induction variable and its update instruction will remain uniform.
4876     addToWorklistIfAllowed(Ind);
4877     addToWorklistIfAllowed(IndUpdate);
4878   }
4879 
4880   Uniforms[VF].insert(Worklist.begin(), Worklist.end());
4881 }
4882 
4883 bool LoopVectorizationCostModel::runtimeChecksRequired() {
4884   LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
4885 
4886   if (Legal->getRuntimePointerChecking()->Need) {
4887     reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
4888         "runtime pointer checks needed. Enable vectorization of this "
4889         "loop with '#pragma clang loop vectorize(enable)' when "
4890         "compiling with -Os/-Oz",
4891         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4892     return true;
4893   }
4894 
4895   if (!PSE.getUnionPredicate().getPredicates().empty()) {
4896     reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
4897         "runtime SCEV checks needed. Enable vectorization of this "
4898         "loop with '#pragma clang loop vectorize(enable)' when "
4899         "compiling with -Os/-Oz",
4900         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4901     return true;
4902   }
4903 
4904   // FIXME: Avoid specializing for stride==1 instead of bailing out.
4905   if (!Legal->getLAI()->getSymbolicStrides().empty()) {
4906     reportVectorizationFailure("Runtime stride check is required with -Os/-Oz",
4907         "runtime stride == 1 checks needed. Enable vectorization of "
4908         "this loop with '#pragma clang loop vectorize(enable)' when "
4909         "compiling with -Os/-Oz",
4910         "CantVersionLoopWithOptForSize", ORE, TheLoop);
4911     return true;
4912   }
4913 
4914   return false;
4915 }
4916 
4917 Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() {
4918   if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
4919     // TODO: It may be useful to do this, since it's still likely to be
4920     // dynamically uniform if the target can skip.
4921     reportVectorizationFailure(
4922         "Not inserting runtime ptr check for divergent target",
4923         "runtime pointer checks needed. Not enabled for divergent target",
4924         "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
4925     return None;
4926   }
4927 
4928   unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
4929   LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
4930   if (TC == 1) {
4931     reportVectorizationFailure("Single iteration (non) loop",
4932         "loop trip count is one, irrelevant for vectorization",
4933         "SingleIterationLoop", ORE, TheLoop);
4934     return None;
4935   }
4936 
4937   switch (ScalarEpilogueStatus) {
4938   case CM_ScalarEpilogueAllowed:
4939     return computeFeasibleMaxVF(TC);
4940   case CM_ScalarEpilogueNotNeededUsePredicate:
4941     LLVM_DEBUG(
4942         dbgs() << "LV: vector predicate hint/switch found.\n"
4943                << "LV: Not allowing scalar epilogue, creating predicated "
4944                << "vector loop.\n");
4945     break;
4946   case CM_ScalarEpilogueNotAllowedLowTripLoop:
4947     // fallthrough as a special case of OptForSize
4948   case CM_ScalarEpilogueNotAllowedOptSize:
4949     if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
4950       LLVM_DEBUG(
4951           dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
4952     else
4953       LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
4954                         << "count.\n");
4955 
4956     // Bail if runtime checks are required, which are not good when optimising
4957     // for size.
4958     if (runtimeChecksRequired())
4959       return None;
4960     break;
4961   }
4962 
4963   // Now try the tail folding
4964 
4965   // Invalidate interleave groups that require an epilogue if we can't mask
4966   // the interleave-group.
4967   if (!useMaskedInterleavedAccesses(TTI))
4968     InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
4969 
4970   unsigned MaxVF = computeFeasibleMaxVF(TC);
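  // E.g. (numbers purely illustrative): a known trip count of 64 with
  // MaxVF == 8 gives 64 % 8 == 0, so no scalar tail is needed and MaxVF is
  // accepted below without folding the tail.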
4971   if (TC > 0 && TC % MaxVF == 0) {
4972     // Accept MaxVF if we do not have a tail.
4973     LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
4974     return MaxVF;
4975   }
4976 
4977   // If we don't know the precise trip count, or if the trip count that we
4978   // found modulo the vectorization factor is not zero, try to fold the tail
4979   // by masking.
4980   // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
4981   if (Legal->prepareToFoldTailByMasking()) {
4982     FoldTailByMasking = true;
4983     return MaxVF;
4984   }
4985 
4986   if (TC == 0) {
4987     reportVectorizationFailure(
4988         "Unable to calculate the loop count due to complex control flow",
4989         "unable to calculate the loop count due to complex control flow",
4990         "UnknownLoopCountComplexCFG", ORE, TheLoop);
4991     return None;
4992   }
4993 
4994   reportVectorizationFailure(
4995       "Cannot optimize for size and vectorize at the same time.",
4996       "cannot optimize for size and vectorize at the same time. "
4997       "Enable vectorization of this loop with '#pragma clang loop "
4998       "vectorize(enable)' when compiling with -Os/-Oz",
4999       "NoTailLoopWithOptForSize", ORE, TheLoop);
5000   return None;
5001 }
5002 
5003 unsigned
5004 LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
5005   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
5006   unsigned SmallestType, WidestType;
5007   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
5008   unsigned WidestRegister = TTI.getRegisterBitWidth(true);
5009 
5010   // Get the maximum safe dependence distance in bits computed by LAA.
5011   // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
5012   // the memory access that is most restrictive (involved in the smallest
5013   // dependence distance).
5014   unsigned MaxSafeRegisterWidth = Legal->getMaxSafeRegisterWidth();
5015 
5016   WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);
5017 
5018   unsigned MaxVectorSize = WidestRegister / WidestType;
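  // E.g. (illustrative only): 256-bit wide registers and a widest scalar type
  // of i32 give MaxVectorSize = 256 / 32 = 8 lanes.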
5019 
5020   LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
5021                     << " / " << WidestType << " bits.\n");
5022   LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
5023                     << WidestRegister << " bits.\n");
5024 
5025   assert(MaxVectorSize <= 256 && "Did not expect to pack so many elements"
5026                                  " into one vector!");
5027   if (MaxVectorSize == 0) {
5028     LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
5029     MaxVectorSize = 1;
5030     return MaxVectorSize;
5031   } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
5032              isPowerOf2_32(ConstTripCount)) {
5033     // We need to clamp the VF to be the ConstTripCount. There is no point in
5034     // choosing a higher viable VF as done in the loop below.
5035     LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
5036                       << ConstTripCount << "\n");
5037     MaxVectorSize = ConstTripCount;
5038     return MaxVectorSize;
5039   }
5040 
5041   unsigned MaxVF = MaxVectorSize;
5042   if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
5043       (MaximizeBandwidth && isScalarEpilogueAllowed())) {
5044     // Collect all viable vectorization factors larger than the default MaxVF
5045     // (i.e. MaxVectorSize).
5046     SmallVector<unsigned, 8> VFs;
5047     unsigned NewMaxVectorSize = WidestRegister / SmallestType;
5048     for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
5049       VFs.push_back(VS);
5050 
5051     // For each VF calculate its register usage.
5052     auto RUs = calculateRegisterUsage(VFs);
5053 
5054     // Select the largest VF which doesn't require more registers than existing
5055     // ones.
5056     for (int i = RUs.size() - 1; i >= 0; --i) {
5057       bool Selected = true;
5058       for (auto& pair : RUs[i].MaxLocalUsers) {
5059         unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5060         if (pair.second > TargetNumRegisters)
5061           Selected = false;
5062       }
5063       if (Selected) {
5064         MaxVF = VFs[i];
5065         break;
5066       }
5067     }
5068     if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
5069       if (MaxVF < MinVF) {
5070         LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
5071                           << ") with target's minimum: " << MinVF << '\n');
5072         MaxVF = MinVF;
5073       }
5074     }
5075   }
5076   return MaxVF;
5077 }
5078 
5079 VectorizationFactor
5080 LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
5081   float Cost = expectedCost(1).first;
5082   const float ScalarCost = Cost;
5083   unsigned Width = 1;
5084   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
5085 
5086   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
5087   if (ForceVectorization && MaxVF > 1) {
5088     // Ignore scalar width, because the user explicitly wants vectorization.
5089     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
5090     // evaluation.
5091     Cost = std::numeric_limits<float>::max();
5092   }
5093 
5094   for (unsigned i = 2; i <= MaxVF; i *= 2) {
5095     // Notice that the vector loop needs to be executed fewer times, so
5096     // we need to divide the cost of the vector loop by the width of
5097     // the vector elements.
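    // E.g. (costs assumed for illustration): if the scalar loop costs 8 and
    // the VF == 4 vector body costs 20, the per-lane cost is 20 / 4 = 5,
    // which beats the scalar cost of 8 and makes VF == 4 the current choice.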
5098     VectorizationCostTy C = expectedCost(i);
5099     float VectorCost = C.first / (float)i;
5100     LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
5101                       << " costs: " << (int)VectorCost << ".\n");
5102     if (!C.second && !ForceVectorization) {
5103       LLVM_DEBUG(
5104           dbgs() << "LV: Not considering vector loop of width " << i
5105                  << " because it will not generate any vector instructions.\n");
5106       continue;
5107     }
5108     if (VectorCost < Cost) {
5109       Cost = VectorCost;
5110       Width = i;
5111     }
5112   }
5113 
5114   if (!EnableCondStoresVectorization && NumPredStores) {
5115     reportVectorizationFailure("There are conditional stores.",
5116         "store that is conditionally executed prevents vectorization",
5117         "ConditionalStore", ORE, TheLoop);
5118     Width = 1;
5119     Cost = ScalarCost;
5120   }
5121 
5122   LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
5123              << "LV: Vectorization seems to be not beneficial, "
5124              << "but was forced by a user.\n");
5125   LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
5126   VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)};
5127   return Factor;
5128 }
5129 
5130 std::pair<unsigned, unsigned>
5131 LoopVectorizationCostModel::getSmallestAndWidestTypes() {
5132   unsigned MinWidth = -1U;
5133   unsigned MaxWidth = 8;
5134   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5135 
5136   // For each block.
5137   for (BasicBlock *BB : TheLoop->blocks()) {
5138     // For each instruction in the loop.
5139     for (Instruction &I : BB->instructionsWithoutDebug()) {
5140       Type *T = I.getType();
5141 
5142       // Skip ignored values.
5143       if (ValuesToIgnore.find(&I) != ValuesToIgnore.end())
5144         continue;
5145 
5146       // Only examine Loads, Stores and PHINodes.
5147       if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
5148         continue;
5149 
5150       // Examine PHI nodes that are reduction variables. Update the type to
5151       // account for the recurrence type.
5152       if (auto *PN = dyn_cast<PHINode>(&I)) {
5153         if (!Legal->isReductionVariable(PN))
5154           continue;
5155         RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[PN];
5156         T = RdxDesc.getRecurrenceType();
5157       }
5158 
5159       // Examine the stored values.
5160       if (auto *ST = dyn_cast<StoreInst>(&I))
5161         T = ST->getValueOperand()->getType();
5162 
5163       // Ignore loaded pointer types and stored pointer types that are not
5164       // vectorizable.
5165       //
5166       // FIXME: The check here attempts to predict whether a load or store will
5167       //        be vectorized. We only know this for certain after a VF has
5168       //        been selected. Here, we assume that if an access can be
5169       //        vectorized, it will be. We should also look at extending this
5170       //        optimization to non-pointer types.
5171       //
5172       if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
5173           !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
5174         continue;
5175 
5176       MinWidth = std::min(MinWidth,
5177                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5178       MaxWidth = std::max(MaxWidth,
5179                           (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
5180     }
5181   }
5182 
5183   return {MinWidth, MaxWidth};
5184 }
5185 
5186 unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF,
5187                                                            unsigned LoopCost) {
5188   // -- The interleave heuristics --
5189   // We interleave the loop in order to expose ILP and reduce the loop overhead.
5190   // There are many micro-architectural considerations that we can't predict
5191   // at this level. For example, frontend pressure (on decode or fetch) due to
5192   // code size, or the number and capabilities of the execution ports.
5193   //
5194   // We use the following heuristics to select the interleave count:
5195   // 1. If the code has reductions, then we interleave to break the cross
5196   // iteration dependency.
5197   // 2. If the loop is really small, then we interleave to reduce the loop
5198   // overhead.
5199   // 3. We don't interleave if we think that we will spill registers to memory
5200   // due to the increased register pressure.
5201 
5202   if (!isScalarEpilogueAllowed())
5203     return 1;
5204 
5205   // We used the distance for the interleave count.
5206   if (Legal->getMaxSafeDepDistBytes() != -1U)
5207     return 1;
5208 
5209   // Do not interleave loops with a relatively small known or estimated trip
5210   // count.
5211   auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
5212   if (BestKnownTC && *BestKnownTC < TinyTripCountInterleaveThreshold)
5213     return 1;
5214 
5215   RegisterUsage R = calculateRegisterUsage({VF})[0];
5216   // We divide by these constants, so assume that we have at least one
5217   // instruction that uses at least one register.
5218   for (auto& pair : R.MaxLocalUsers) {
5219     pair.second = std::max(pair.second, 1U);
5220   }
5221 
5222   // We calculate the interleave count using the following formula.
5223   // Subtract the number of loop invariants from the number of available
5224   // registers. These registers are used by all of the interleaved instances.
5225   // Next, divide the remaining registers by the number of registers that is
5226   // required by the loop, in order to estimate how many parallel instances
5227   // fit without causing spills. All of this is rounded down if necessary to be
5228   // a power of two. We want power of two interleave count to simplify any
5229   // addressing operations or alignment considerations.
5230   // We also want power of two interleave counts to ensure that the induction
5231   // variable of the vector loop wraps to zero, when tail is folded by masking;
5232   // this currently happens when OptForSize, in which case IC is set to 1 above.
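  // Illustrative example (register counts assumed, not queried from a real
  // target): with 32 available registers, 2 loop-invariant values and at most
  // 6 values live at once, the estimate is
  //   PowerOf2Floor((32 - 2) / 6) = PowerOf2Floor(5) = 4.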
5233   unsigned IC = UINT_MAX;
5234 
5235   for (auto& pair : R.MaxLocalUsers) {
5236     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
5237     LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
5238                       << " registers of "
5239                       << TTI.getRegisterClassName(pair.first) << " register class\n");
5240     if (VF == 1) {
5241       if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
5242         TargetNumRegisters = ForceTargetNumScalarRegs;
5243     } else {
5244       if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
5245         TargetNumRegisters = ForceTargetNumVectorRegs;
5246     }
5247     unsigned MaxLocalUsers = pair.second;
5248     unsigned LoopInvariantRegs = 0;
5249     if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
5250       LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
5251 
5252     unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
5253     // Don't count the induction variable as interleaved.
5254     if (EnableIndVarRegisterHeur) {
5255       TmpIC =
5256           PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
5257                         std::max(1U, (MaxLocalUsers - 1)));
5258     }
5259 
5260     IC = std::min(IC, TmpIC);
5261   }
5262 
5263   // Clamp the interleave ranges to reasonable counts.
5264   unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);
5265 
5266   // Check if the user has overridden the max.
5267   if (VF == 1) {
5268     if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
5269       MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
5270   } else {
5271     if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
5272       MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
5273   }
5274 
5275   // If trip count is known or estimated compile time constant, limit the
5276   // interleave count to be less than the trip count divided by VF.
5277   if (BestKnownTC) {
5278     MaxInterleaveCount = std::min(*BestKnownTC / VF, MaxInterleaveCount);
5279   }
5280 
5281   // If we did not calculate the cost for VF (because the user selected the VF)
5282   // then we calculate the cost of VF here.
5283   if (LoopCost == 0)
5284     LoopCost = expectedCost(VF).first;
5285 
5286   assert(LoopCost && "Non-zero loop cost expected");
5287 
5288   // Clamp the calculated IC to be between 1 and the max interleave count
5289   // that the target and trip count allow.
5290   if (IC > MaxInterleaveCount)
5291     IC = MaxInterleaveCount;
5292   else if (IC < 1)
5293     IC = 1;
5294 
5295   // Interleave if we vectorized this loop and there is a reduction that could
5296   // benefit from interleaving.
5297   if (VF > 1 && !Legal->getReductionVars()->empty()) {
5298     LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
5299     return IC;
5300   }
5301 
5302   // Note that if we've already vectorized the loop we will have done the
5303   // runtime check and so interleaving won't require further checks.
5304   bool InterleavingRequiresRuntimePointerCheck =
5305       (VF == 1 && Legal->getRuntimePointerChecking()->Need);
5306 
5307   // We want to interleave small loops in order to reduce the loop overhead and
5308   // potentially expose ILP opportunities.
5309   LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
5310   if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
5311     // We assume that the cost overhead is 1 and we use the cost model
5312     // to estimate the cost of the loop and interleave until the cost of the
5313     // loop overhead is about 5% of the cost of the loop.
5314     unsigned SmallIC =
5315         std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
5316 
5317     // Interleave until store/load ports (estimated by max interleave count) are
5318     // saturated.
5319     unsigned NumStores = Legal->getNumStores();
5320     unsigned NumLoads = Legal->getNumLoads();
5321     unsigned StoresIC = IC / (NumStores ? NumStores : 1);
5322     unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
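    // E.g. (counts assumed for illustration): with IC == 8, 2 stores and 4
    // loads in the loop body, StoresIC == 8 / 2 == 4 and LoadsIC == 8 / 4 == 2.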
5323 
5324     // If we have a scalar reduction (vector reductions are already dealt with
5325     // by this point), we can increase the critical path length if the loop
5326     // we're interleaving is inside another loop. Limit, by default to 2, so the
5327     // critical path only gets increased by one reduction operation.
5328     if (!Legal->getReductionVars()->empty() && TheLoop->getLoopDepth() > 1) {
5329       unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
5330       SmallIC = std::min(SmallIC, F);
5331       StoresIC = std::min(StoresIC, F);
5332       LoadsIC = std::min(LoadsIC, F);
5333     }
5334 
5335     if (EnableLoadStoreRuntimeInterleave &&
5336         std::max(StoresIC, LoadsIC) > SmallIC) {
5337       LLVM_DEBUG(
5338           dbgs() << "LV: Interleaving to saturate store or load ports.\n");
5339       return std::max(StoresIC, LoadsIC);
5340     }
5341 
5342     LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
5343     return SmallIC;
5344   }
5345 
5346   // Interleave if this is a large loop (small loops are already dealt with by
5347   // this point) that could benefit from interleaving.
5348   bool HasReductions = !Legal->getReductionVars()->empty();
5349   if (TTI.enableAggressiveInterleaving(HasReductions)) {
5350     LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
5351     return IC;
5352   }
5353 
5354   LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
5355   return 1;
5356 }
5357 
5358 SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
5359 LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
5360   // This function calculates the register usage by measuring the highest number
5361   // of values that are alive at a single location. Obviously, this is a very
5362   // rough estimation. We scan the loop in a topological order in order to
5363   // assign a number to each instruction. We use RPO to ensure that defs are
5364   // met before their users. We assume that each instruction that has in-loop
5365   // users starts an interval. We record every time that an in-loop value is
5366   // used, so we have a list of the first and last occurrences of each
5367   // instruction. Next, we transpose this data structure into a multi map that
5368   // holds the list of intervals that *end* at a specific location. This multi
5369   // map allows us to perform a linear search. We scan the instructions linearly
5370   // and record each time that a new interval starts, by placing it in a set.
5371   // If we find this value in the multi-map then we remove it from the set.
5372   // The max register usage is the maximum size of the set.
5373   // We also search for instructions that are defined outside the loop, but are
5374   // used inside the loop. We need this number separately from the max-interval
5375   // usage number because when we unroll, loop-invariant values do not take
5376   // more registers.
5377   LoopBlocksDFS DFS(TheLoop);
5378   DFS.perform(LI);
5379 
5380   RegisterUsage RU;
5381 
5382   // Each 'key' in the map opens a new interval. The values
5383   // of the map are the index of the 'last seen' usage of the
5384   // instruction that is the key.
5385   using IntervalMap = DenseMap<Instruction *, unsigned>;
5386 
5387   // Maps instruction to its index.
5388   SmallVector<Instruction *, 64> IdxToInstr;
5389   // Marks the end of each interval.
5390   IntervalMap EndPoint;
5391   // Saves the list of instruction indices that are used in the loop.
5392   SmallPtrSet<Instruction *, 8> Ends;
5393   // Saves the list of values that are used in the loop but are
5394   // defined outside the loop, such as arguments and constants.
5395   SmallPtrSet<Value *, 8> LoopInvariants;
5396 
5397   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
5398     for (Instruction &I : BB->instructionsWithoutDebug()) {
5399       IdxToInstr.push_back(&I);
5400 
5401       // Save the end location of each USE.
5402       for (Value *U : I.operands()) {
5403         auto *Instr = dyn_cast<Instruction>(U);
5404 
5405         // Ignore non-instruction values such as arguments, constants, etc.
5406         if (!Instr)
5407           continue;
5408 
5409         // If this instruction is outside the loop then record it and continue.
5410         if (!TheLoop->contains(Instr)) {
5411           LoopInvariants.insert(Instr);
5412           continue;
5413         }
5414 
5415         // Overwrite previous end points.
5416         EndPoint[Instr] = IdxToInstr.size();
5417         Ends.insert(Instr);
5418       }
5419     }
5420   }
5421 
5422   // Saves the list of intervals that end with the index in 'key'.
5423   using InstrList = SmallVector<Instruction *, 2>;
5424   DenseMap<unsigned, InstrList> TransposeEnds;
5425 
5426   // Transpose the EndPoints to a list of values that end at each index.
5427   for (auto &Interval : EndPoint)
5428     TransposeEnds[Interval.second].push_back(Interval.first);
5429 
5430   SmallPtrSet<Instruction *, 8> OpenIntervals;
5431 
5432   // Get the size of the widest register.
5433   unsigned MaxSafeDepDist = -1U;
5434   if (Legal->getMaxSafeDepDistBytes() != -1U)
5435     MaxSafeDepDist = Legal->getMaxSafeDepDistBytes() * 8;
5436   unsigned WidestRegister =
5437       std::min(TTI.getRegisterBitWidth(true), MaxSafeDepDist);
5438   const DataLayout &DL = TheFunction->getParent()->getDataLayout();
5439 
5440   SmallVector<RegisterUsage, 8> RUs(VFs.size());
5441   SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
5442 
5443   LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
5444 
5445   // A lambda that gets the register usage for the given type and VF.
5446   auto GetRegUsage = [&DL, WidestRegister](Type *Ty, unsigned VF) {
5447     if (Ty->isTokenTy())
5448       return 0U;
5449     unsigned TypeSize = DL.getTypeSizeInBits(Ty->getScalarType());
5450     return std::max<unsigned>(1, VF * TypeSize / WidestRegister);
5451   };
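  // E.g. (illustrative): i32 elements at VF == 8 against a 128-bit register
  // need 8 * 32 / 128 = 2 registers; anything smaller than one full register
  // still counts as 1 because of the std::max above.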
5452 
5453   for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
5454     Instruction *I = IdxToInstr[i];
5455 
5456     // Remove all of the instructions that end at this location.
5457     InstrList &List = TransposeEnds[i];
5458     for (Instruction *ToRemove : List)
5459       OpenIntervals.erase(ToRemove);
5460 
5461     // Ignore instructions that are never used within the loop.
5462     if (Ends.find(I) == Ends.end())
5463       continue;
5464 
5465     // Skip ignored values.
5466     if (ValuesToIgnore.find(I) != ValuesToIgnore.end())
5467       continue;
5468 
5469     // For each VF find the maximum usage of registers.
5470     for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
5471       // Count the number of live intervals.
5472       SmallMapVector<unsigned, unsigned, 4> RegUsage;
5473 
5474       if (VFs[j] == 1) {
5475         for (auto Inst : OpenIntervals) {
5476           unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
5477           if (RegUsage.find(ClassID) == RegUsage.end())
5478             RegUsage[ClassID] = 1;
5479           else
5480             RegUsage[ClassID] += 1;
5481         }
5482       } else {
5483         collectUniformsAndScalars(VFs[j]);
5484         for (auto Inst : OpenIntervals) {
5485           // Skip ignored values for VF > 1.
5486           if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end())
5487             continue;
5488           if (isScalarAfterVectorization(Inst, VFs[j])) {
5489             unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
5490             if (RegUsage.find(ClassID) == RegUsage.end())
5491               RegUsage[ClassID] = 1;
5492             else
5493               RegUsage[ClassID] += 1;
5494           } else {
5495             unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType());
5496             if (RegUsage.find(ClassID) == RegUsage.end())
5497               RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]);
5498             else
5499               RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
5500           }
5501         }
5502       }
5503 
5504       for (auto& pair : RegUsage) {
5505         if (MaxUsages[j].find(pair.first) != MaxUsages[j].end())
5506           MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second);
5507         else
5508           MaxUsages[j][pair.first] = pair.second;
5509       }
5510     }
5511 
5512     LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
5513                       << OpenIntervals.size() << '\n');
5514 
5515     // Add the current instruction to the list of open intervals.
5516     OpenIntervals.insert(I);
5517   }
5518 
5519   for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
5520     SmallMapVector<unsigned, unsigned, 4> Invariant;
5521 
5522     for (auto Inst : LoopInvariants) {
5523       unsigned Usage = VFs[i] == 1 ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
5524       unsigned ClassID = TTI.getRegisterClassForType(VFs[i] > 1, Inst->getType());
5525       if (Invariant.find(ClassID) == Invariant.end())
5526         Invariant[ClassID] = Usage;
5527       else
5528         Invariant[ClassID] += Usage;
5529     }
5530 
5531     LLVM_DEBUG({
5532       dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
5533       dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
5534              << " item\n";
5535       for (const auto &pair : MaxUsages[i]) {
5536         dbgs() << "LV(REG): RegisterClass: "
5537                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5538                << " registers\n";
5539       }
5540       dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
5541              << " item\n";
5542       for (const auto &pair : Invariant) {
5543         dbgs() << "LV(REG): RegisterClass: "
5544                << TTI.getRegisterClassName(pair.first) << ", " << pair.second
5545                << " registers\n";
5546       }
5547     });
5548 
5549     RU.LoopInvariantRegs = Invariant;
5550     RU.MaxLocalUsers = MaxUsages[i];
5551     RUs[i] = RU;
5552   }
5553 
5554   return RUs;
5555 }
5556 
5557 bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I) {
5558   // TODO: Cost model for emulated masked load/store is completely
5559   // broken. This hack guides the cost model to use an artificially
5560   // high enough value to practically disable vectorization with such
5561   // operations, except where previously deployed legality hack allowed
5562   // using very low cost values. This is to avoid regressions coming simply
5563   // from moving "masked load/store" check from legality to cost model.
5564   // Masked Load/Gather emulation was previously never allowed.
5565   // Limited number of Masked Store/Scatter emulation was allowed.
5566   assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
5567   return isa<LoadInst>(I) ||
5568          (isa<StoreInst>(I) &&
5569           NumPredStores > NumberOfStoresToPredicate);
5570 }
5571 
5572 void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
5573   // If we aren't vectorizing the loop, or if we've already collected the
5574   // instructions to scalarize, there's nothing to do. Collection may already
5575   // have occurred if we have a user-selected VF and are now computing the
5576   // expected cost for interleaving.
5577   if (VF < 2 || InstsToScalarize.find(VF) != InstsToScalarize.end())
5578     return;
5579 
5580   // Initialize a mapping for VF in InstsToScalarize. If we find that it's
5581   // not profitable to scalarize any instructions, the presence of VF in the
5582   // map will indicate that we've analyzed it already.
5583   ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
5584 
5585   // Find all the instructions that are scalar with predication in the loop and
5586   // determine if it would be better to not if-convert the blocks they are in.
5587   // If so, we also record the instructions to scalarize.
5588   for (BasicBlock *BB : TheLoop->blocks()) {
5589     if (!blockNeedsPredication(BB))
5590       continue;
5591     for (Instruction &I : *BB)
5592       if (isScalarWithPredication(&I)) {
5593         ScalarCostsTy ScalarCosts;
5594         // Do not apply discount logic if hacked cost is needed
5595         // for emulated masked memrefs.
5596         if (!useEmulatedMaskMemRefHack(&I) &&
5597             computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
5598           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
5599         // Remember that BB will remain after vectorization.
5600         PredicatedBBsAfterVectorization.insert(BB);
5601       }
5602   }
5603 }
5604 
5605 int LoopVectorizationCostModel::computePredInstDiscount(
5606     Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
5607     unsigned VF) {
5608   assert(!isUniformAfterVectorization(PredInst, VF) &&
5609          "Instruction marked uniform-after-vectorization will be predicated");
5610 
5611   // Initialize the discount to zero, meaning that the scalar version and the
5612   // vector version cost the same.
5613   int Discount = 0;
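  // Illustrative example (costs assumed): if the vector form of the chain
  // ends up costing 12 while the probability-scaled scalar form costs 8, the
  // accumulated Discount is +4 and scalarizing the chain is deemed profitable.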
5614 
5615   // Holds instructions to analyze. The instructions we visit are mapped in
5616   // ScalarCosts. Those instructions are the ones that would be scalarized if
5617   // we find that the scalar version costs less.
5618   SmallVector<Instruction *, 8> Worklist;
5619 
5620   // Returns true if the given instruction can be scalarized.
5621   auto canBeScalarized = [&](Instruction *I) -> bool {
5622     // We only attempt to scalarize instructions forming a single-use chain
5623     // from the original predicated block that would otherwise be vectorized.
5624     // Although not strictly necessary, we give up on instructions we know will
5625     // already be scalar to avoid traversing chains that are unlikely to be
5626     // beneficial.
5627     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
5628         isScalarAfterVectorization(I, VF))
5629       return false;
5630 
5631     // If the instruction is scalar with predication, it will be analyzed
5632     // separately. We ignore it within the context of PredInst.
5633     if (isScalarWithPredication(I))
5634       return false;
5635 
5636     // If any of the instruction's operands are uniform after vectorization,
5637     // the instruction cannot be scalarized. This prevents, for example, a
5638     // masked load from being scalarized.
5639     //
5640     // We assume we will only emit a value for lane zero of an instruction
5641     // marked uniform after vectorization, rather than VF identical values.
5642     // Thus, if we scalarize an instruction that uses a uniform, we would
5643     // create uses of values corresponding to the lanes we aren't emitting code
5644     // for. This behavior can be changed by allowing getScalarValue to clone
5645     // the lane zero values for uniforms rather than asserting.
5646     for (Use &U : I->operands())
5647       if (auto *J = dyn_cast<Instruction>(U.get()))
5648         if (isUniformAfterVectorization(J, VF))
5649           return false;
5650 
5651     // Otherwise, we can scalarize the instruction.
5652     return true;
5653   };
5654 
5655   // Compute the expected cost discount from scalarizing the entire expression
5656   // feeding the predicated instruction. We currently only consider expressions
5657   // that are single-use instruction chains.
5658   Worklist.push_back(PredInst);
5659   while (!Worklist.empty()) {
5660     Instruction *I = Worklist.pop_back_val();
5661 
5662     // If we've already analyzed the instruction, there's nothing to do.
5663     if (ScalarCosts.find(I) != ScalarCosts.end())
5664       continue;
5665 
5666     // Compute the cost of the vector instruction. Note that this cost already
5667     // includes the scalarization overhead of the predicated instruction.
5668     unsigned VectorCost = getInstructionCost(I, VF).first;
5669 
5670     // Compute the cost of the scalarized instruction. This cost is the cost of
5671     // the instruction as if it wasn't if-converted and instead remained in the
5672     // predicated block. We will scale this cost by block probability after
5673     // computing the scalarization overhead.
5674     unsigned ScalarCost = VF * getInstructionCost(I, 1).first;
5675 
5676     // Compute the scalarization overhead of needed insertelement instructions
5677     // and phi nodes.
5678     if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
5679       ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF),
5680                                                  true, false);
5681       ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI);
5682     }
5683 
5684     // Compute the scalarization overhead of needed extractelement
5685     // instructions. For each of the instruction's operands, if the operand can
5686     // be scalarized, add it to the worklist; otherwise, account for the
5687     // overhead.
5688     for (Use &U : I->operands())
5689       if (auto *J = dyn_cast<Instruction>(U.get())) {
5690         assert(VectorType::isValidElementType(J->getType()) &&
5691                "Instruction has non-scalar type");
5692         if (canBeScalarized(J))
5693           Worklist.push_back(J);
5694         else if (needsExtract(J, VF))
5695           ScalarCost += TTI.getScalarizationOverhead(
5696                               ToVectorTy(J->getType(),VF), false, true);
5697       }
5698 
5699     // Scale the total scalar cost by block probability.
5700     ScalarCost /= getReciprocalPredBlockProb();
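    // This division is an approximation: the reciprocal block probability is
    // assumed to model a predicated block executing only on a fraction of the
    // iterations, so the accumulated scalar cost is reduced accordingly.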
5701 
5702     // Compute the discount. A non-negative discount means the vector version
5703     // of the instruction costs more, and scalarizing would be beneficial.
5704     Discount += VectorCost - ScalarCost;
5705     ScalarCosts[I] = ScalarCost;
5706   }
5707 
5708   return Discount;
5709 }
5710 
5711 LoopVectorizationCostModel::VectorizationCostTy
5712 LoopVectorizationCostModel::expectedCost(unsigned VF) {
5713   VectorizationCostTy Cost;
5714 
5715   // For each block.
5716   for (BasicBlock *BB : TheLoop->blocks()) {
5717     VectorizationCostTy BlockCost;
5718 
5719     // For each instruction in the old loop.
5720     for (Instruction &I : BB->instructionsWithoutDebug()) {
5721       // Skip ignored values.
5722       if (ValuesToIgnore.find(&I) != ValuesToIgnore.end() ||
5723           (VF > 1 && VecValuesToIgnore.find(&I) != VecValuesToIgnore.end()))
5724         continue;
5725 
5726       VectorizationCostTy C = getInstructionCost(&I, VF);
5727 
5728       // Check if we should override the cost.
5729       if (ForceTargetInstructionCost.getNumOccurrences() > 0)
5730         C.first = ForceTargetInstructionCost;
5731 
5732       BlockCost.first += C.first;
5733       BlockCost.second |= C.second;
5734       LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
5735                         << " for VF " << VF << " For instruction: " << I
5736                         << '\n');
5737     }
5738 
5739     // If we are vectorizing a predicated block, it will have been
5740     // if-converted. This means that the block's instructions (aside from
5741     // stores and instructions that may divide by zero) will now be
5742     // unconditionally executed. For the scalar case, we may not always execute
5743     // the predicated block. Thus, scale the block's cost by the probability of
5744     // executing it.
5745     if (VF == 1 && blockNeedsPredication(BB))
5746       BlockCost.first /= getReciprocalPredBlockProb();
5747 
5748     Cost.first += BlockCost.first;
5749     Cost.second |= BlockCost.second;
5750   }
5751 
5752   return Cost;
5753 }
5754 
5755 /// Gets Address Access SCEV after verifying that the access pattern
5756 /// is loop invariant except the induction variable dependence.
5757 ///
5758 /// This SCEV can be sent to the Target in order to estimate the address
5759 /// calculation cost.
5760 static const SCEV *getAddressAccessSCEV(
5761               Value *Ptr,
5762               LoopVectorizationLegality *Legal,
5763               PredicatedScalarEvolution &PSE,
5764               const Loop *TheLoop) {
5765 
5766   auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
5767   if (!Gep)
5768     return nullptr;
5769 
5770   // We are looking for a gep with all loop invariant indices except for one
5771   // which should be an induction variable.
5772   auto SE = PSE.getSE();
5773   unsigned NumOperands = Gep->getNumOperands();
5774   for (unsigned i = 1; i < NumOperands; ++i) {
5775     Value *Opd = Gep->getOperand(i);
5776     if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
5777         !Legal->isInductionVariable(Opd))
5778       return nullptr;
5779   }
5780 
5781   // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV.
5782   return PSE.getSCEV(Ptr);
5783 }
5784 
5785 static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
5786   return Legal->hasStride(I->getOperand(0)) ||
5787          Legal->hasStride(I->getOperand(1));
5788 }
5789 
5790 unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
5791                                                                  unsigned VF) {
5792   assert(VF > 1 && "Scalarization cost of instruction implies vectorization.");
5793   Type *ValTy = getMemInstValueType(I);
5794   auto SE = PSE.getSE();
5795 
5796   unsigned AS = getLoadStoreAddressSpace(I);
5797   Value *Ptr = getLoadStorePointerOperand(I);
5798   Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
5799 
5800   // Figure out whether the access is strided and get the stride value
5801   // if it's known at compile time.
5802   const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
5803 
5804   // Get the cost of the scalar memory instruction and address computation.
5805   unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
5806 
5807   // Don't pass *I here, since it is scalar but will actually be part of a
5808   // vectorized loop where the user of it is a vectorized instruction.
5809   const MaybeAlign Alignment = getLoadStoreAlignment(I);
5810   Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
5811                                    Alignment, AS);
5812 
5813   // Get the overhead of the extractelement and insertelement instructions
5814   // we might create due to scalarization.
5815   Cost += getScalarizationOverhead(I, VF);
5816 
5817   // If we have a predicated store, it may not be executed for each vector
5818   // lane. Scale the cost by the probability of executing the predicated
5819   // block.
5820   if (isPredicatedInst(I)) {
5821     Cost /= getReciprocalPredBlockProb();
5822 
5823     if (useEmulatedMaskMemRefHack(I))
5824       // Artificially setting to a high enough value to practically disable
5825       // vectorization with such operations.
5826       Cost = 3000000;
5827   }
5828 
5829   return Cost;
5830 }
5831 
5832 unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
5833                                                              unsigned VF) {
5834   Type *ValTy = getMemInstValueType(I);
5835   Type *VectorTy = ToVectorTy(ValTy, VF);
5836   Value *Ptr = getLoadStorePointerOperand(I);
5837   unsigned AS = getLoadStoreAddressSpace(I);
5838   int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
5839 
5840   assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5841          "Stride should be 1 or -1 for consecutive memory access");
5842   const MaybeAlign Alignment = getLoadStoreAlignment(I);
5843   unsigned Cost = 0;
5844   if (Legal->isMaskRequired(I))
5845     Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy,
5846                                       Alignment ? Alignment->value() : 0, AS);
5847   else
5848     Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I);
5849 
5850   bool Reverse = ConsecutiveStride < 0;
5851   if (Reverse)
5852     Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
5853   return Cost;
5854 }
5855 
5856 unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
5857                                                          unsigned VF) {
5858   Type *ValTy = getMemInstValueType(I);
5859   Type *VectorTy = ToVectorTy(ValTy, VF);
5860   const MaybeAlign Alignment = getLoadStoreAlignment(I);
5861   unsigned AS = getLoadStoreAddressSpace(I);
5862   if (isa<LoadInst>(I)) {
5863     return TTI.getAddressComputationCost(ValTy) +
5864            TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) +
5865            TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
5866   }
5867   StoreInst *SI = cast<StoreInst>(I);
5868 
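  // For a uniform store, the stored value must be extracted from the last
  // vector lane unless it is loop invariant, in which case no extract is
  // needed; the cost below reflects that distinction.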
5869   bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
5870   return TTI.getAddressComputationCost(ValTy) +
5871          TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS) +
5872          (isLoopInvariantStoreValue
5873               ? 0
5874               : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
5875                                        VF - 1));
5876 }
5877 
5878 unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
5879                                                           unsigned VF) {
5880   Type *ValTy = getMemInstValueType(I);
5881   Type *VectorTy = ToVectorTy(ValTy, VF);
5882   const MaybeAlign Alignment = getLoadStoreAlignment(I);
5883   Value *Ptr = getLoadStorePointerOperand(I);
5884 
5885   return TTI.getAddressComputationCost(VectorTy) +
5886          TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
5887                                     Legal->isMaskRequired(I),
5888                                     Alignment ? Alignment->value() : 0);
5889 }
5890 
5891 unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
5892                                                             unsigned VF) {
5893   Type *ValTy = getMemInstValueType(I);
5894   Type *VectorTy = ToVectorTy(ValTy, VF);
5895   unsigned AS = getLoadStoreAddressSpace(I);
5896 
5897   auto Group = getInterleavedAccessGroup(I);
5898   assert(Group && "Fail to get an interleaved access group.");
5899 
5900   unsigned InterleaveFactor = Group->getFactor();
5901   Type *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
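  // E.g. (illustrative): an i32 group with InterleaveFactor == 2 at VF == 4
  // is costed as a single <8 x i32> wide access covering both members.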
5902 
5903   // Holds the indices of existing members in an interleaved load group.
5904   // An interleaved store group doesn't need this as it doesn't allow gaps.
5905   SmallVector<unsigned, 4> Indices;
5906   if (isa<LoadInst>(I)) {
5907     for (unsigned i = 0; i < InterleaveFactor; i++)
5908       if (Group->getMember(i))
5909         Indices.push_back(i);
5910   }
5911 
5912   // Calculate the cost of the whole interleaved group.
5913   bool UseMaskForGaps =
5914       Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
5915   unsigned Cost = TTI.getInterleavedMemoryOpCost(
5916       I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
5917       Group->getAlignment(), AS, Legal->isMaskRequired(I), UseMaskForGaps);
5918 
5919   if (Group->isReverse()) {
5920     // TODO: Add support for reversed masked interleaved access.
5921     assert(!Legal->isMaskRequired(I) &&
5922            "Reverse masked interleaved access not supported.");
5923     Cost += Group->getNumMembers() *
5924             TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
5925   }
5926   return Cost;
5927 }
5928 
5929 unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
5930                                                               unsigned VF) {
5931   // Calculate scalar cost only. Vectorization cost should be ready at this
5932   // moment.
5933   if (VF == 1) {
5934     Type *ValTy = getMemInstValueType(I);
5935     const MaybeAlign Alignment = getLoadStoreAlignment(I);
5936     unsigned AS = getLoadStoreAddressSpace(I);
5937 
5938     return TTI.getAddressComputationCost(ValTy) +
5939            TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I);
5940   }
5941   return getWideningCost(I, VF);
5942 }
5943 
5944 LoopVectorizationCostModel::VectorizationCostTy
5945 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
5946   // If we know that this instruction will remain uniform, check the cost of
5947   // the scalar version.
5948   if (isUniformAfterVectorization(I, VF))
5949     VF = 1;
5950 
5951   if (VF > 1 && isProfitableToScalarize(I, VF))
5952     return VectorizationCostTy(InstsToScalarize[VF][I], false);
5953 
5954   // Forced scalars do not have any scalarization overhead.
5955   auto ForcedScalar = ForcedScalars.find(VF);
5956   if (VF > 1 && ForcedScalar != ForcedScalars.end()) {
5957     auto InstSet = ForcedScalar->second;
5958     if (InstSet.find(I) != InstSet.end())
5959       return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false);
5960   }
5961 
5962   Type *VectorTy;
5963   unsigned C = getInstructionCost(I, VF, VectorTy);
5964 
5965   bool TypeNotScalarized =
5966       VF > 1 && VectorTy->isVectorTy() && TTI.getNumberOfParts(VectorTy) < VF;
5967   return VectorizationCostTy(C, TypeNotScalarized);
5968 }
5969 
5970 unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
5971                                                               unsigned VF) {
5972 
5973   if (VF == 1)
5974     return 0;
5975 
5976   unsigned Cost = 0;
5977   Type *RetTy = ToVectorTy(I->getType(), VF);
5978   if (!RetTy->isVoidTy() &&
5979       (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
5980     Cost += TTI.getScalarizationOverhead(RetTy, true, false);
5981 
5982   // Some targets keep addresses scalar.
5983   if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
5984     return Cost;
5985 
5986   // Some targets support efficient element stores.
5987   if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
5988     return Cost;
5989 
5990   // Collect operands to consider.
5991   CallInst *CI = dyn_cast<CallInst>(I);
5992   Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();
5993 
5994   // Skip operands that do not require extraction/scalarization and do not incur
5995   // any overhead.
5996   return Cost + TTI.getOperandsScalarizationOverhead(
5997                     filterExtractingOperands(Ops, VF), VF);
5998 }
5999 
6000 void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
6001   if (VF == 1)
6002     return;
6003   NumPredStores = 0;
6004   for (BasicBlock *BB : TheLoop->blocks()) {
6005     // For each instruction in the old loop.
6006     for (Instruction &I : *BB) {
6007       Value *Ptr = getLoadStorePointerOperand(&I);
6008       if (!Ptr)
6009         continue;
6010 
6011       // TODO: We should generate better code and update the cost model for
6012       // predicated uniform stores. Today they are treated as any other
6013       // predicated store (see added test cases in
6014       // invariant-store-vectorization.ll).
6015       if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
6016         NumPredStores++;
6017 
6018       if (Legal->isUniform(Ptr) &&
6019           // Conditional loads and stores should be scalarized and predicated.
6020           // isScalarWithPredication cannot be used here since masked
6021           // gather/scatters are not considered scalar with predication.
6022           !Legal->blockNeedsPredication(I.getParent())) {
6023         // TODO: Avoid replicating loads and stores instead of
6024         // relying on instcombine to remove them.
6025         // Load: Scalar load + broadcast
6026         // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
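        // Conceptually (an illustrative sketch, not the exact IR emitted):
        //   load:  %s = load i32, i32* %p           ; single scalar load
        //          ...splat %s into a <VF x i32>    ; the SK_Broadcast shuffle
        //   store: %e = extractelement <VF x i32> %v, i32 VF-1
        //          store i32 %e, i32* %p    ; extract skipped if %v is invariant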
6027         unsigned Cost = getUniformMemOpCost(&I, VF);
6028         setWideningDecision(&I, VF, CM_Scalarize, Cost);
6029         continue;
6030       }
6031 
6032       // We assume that widening is the best solution when possible.
6033       if (memoryInstructionCanBeWidened(&I, VF)) {
6034         unsigned Cost = getConsecutiveMemOpCost(&I, VF);
6035         int ConsecutiveStride =
6036                Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
6037         assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
6038                "Expected consecutive stride.");
6039         InstWidening Decision =
6040             ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
6041         setWideningDecision(&I, VF, Decision, Cost);
6042         continue;
6043       }
6044 
6045       // Choose between Interleaving, Gather/Scatter or Scalarization.
6046       unsigned InterleaveCost = std::numeric_limits<unsigned>::max();
6047       unsigned NumAccesses = 1;
6048       if (isAccessInterleaved(&I)) {
6049         auto Group = getInterleavedAccessGroup(&I);
6050         assert(Group && "Fail to get an interleaved access group.");
6051 
6052         // Make one decision for the whole group.
6053         if (getWideningDecision(&I, VF) != CM_Unknown)
6054           continue;
6055 
6056         NumAccesses = Group->getNumMembers();
6057         if (interleavedAccessCanBeWidened(&I, VF))
6058           InterleaveCost = getInterleaveGroupCost(&I, VF);
6059       }
6060 
6061       unsigned GatherScatterCost =
6062           isLegalGatherOrScatter(&I)
6063               ? getGatherScatterCost(&I, VF) * NumAccesses
6064               : std::numeric_limits<unsigned>::max();
6065 
6066       unsigned ScalarizationCost =
6067           getMemInstScalarizationCost(&I, VF) * NumAccesses;
6068 
6069       // Choose better solution for the current VF,
6070       // write down this decision and use it during vectorization.
6071       unsigned Cost;
6072       InstWidening Decision;
6073       if (InterleaveCost <= GatherScatterCost &&
6074           InterleaveCost < ScalarizationCost) {
6075         Decision = CM_Interleave;
6076         Cost = InterleaveCost;
6077       } else if (GatherScatterCost < ScalarizationCost) {
6078         Decision = CM_GatherScatter;
6079         Cost = GatherScatterCost;
6080       } else {
6081         Decision = CM_Scalarize;
6082         Cost = ScalarizationCost;
6083       }
6084       // If the instruction belongs to an interleave group, the whole group
6085       // receives the same decision. The whole group receives the cost, but
6086       // the cost will actually be assigned to one instruction.
6087       if (auto Group = getInterleavedAccessGroup(&I))
6088         setWideningDecision(Group, VF, Decision, Cost);
6089       else
6090         setWideningDecision(&I, VF, Decision, Cost);
6091     }
6092   }
6093 
6094   // Make sure that any load of address and any other address computation
6095   // remains scalar unless there is gather/scatter support. This avoids
6096   // inevitable extracts into address registers, and also has the benefit of
6097   // activating LSR more, since that pass can't optimize vectorized
6098   // addresses.
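  // For instance (an illustrative case), when a loaded value only feeds the
  // address computation of another scalarized access, widening that load
  // would force per-lane extracts into scalar address registers, so the code
  // below re-marks such address-feeding loads as CM_Scalarize.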
6099   if (TTI.prefersVectorizedAddressing())
6100     return;
6101 
6102   // Start with all scalar pointer uses.
6103   SmallPtrSet<Instruction *, 8> AddrDefs;
6104   for (BasicBlock *BB : TheLoop->blocks())
6105     for (Instruction &I : *BB) {
6106       Instruction *PtrDef =
6107         dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
6108       if (PtrDef && TheLoop->contains(PtrDef) &&
6109           getWideningDecision(&I, VF) != CM_GatherScatter)
6110         AddrDefs.insert(PtrDef);
6111     }
6112 
6113   // Add all instructions used to generate the addresses.
6114   SmallVector<Instruction *, 4> Worklist;
6115   for (auto *I : AddrDefs)
6116     Worklist.push_back(I);
6117   while (!Worklist.empty()) {
6118     Instruction *I = Worklist.pop_back_val();
6119     for (auto &Op : I->operands())
6120       if (auto *InstOp = dyn_cast<Instruction>(Op))
6121         if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
6122             AddrDefs.insert(InstOp).second)
6123           Worklist.push_back(InstOp);
6124   }
6125 
6126   for (auto *I : AddrDefs) {
6127     if (isa<LoadInst>(I)) {
6128       // Setting the desired widening decision should ideally be handled by
6129       // the cost functions, but since this involves the task of finding out
6130       // if the loaded register is involved in an address computation, it is
6131       // instead changed here when we know this is the case.
6132       InstWidening Decision = getWideningDecision(I, VF);
6133       if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
6134         // Scalarize a widened load of address.
6135         setWideningDecision(I, VF, CM_Scalarize,
6136                             (VF * getMemoryInstructionCost(I, 1)));
6137       else if (auto Group = getInterleavedAccessGroup(I)) {
6138         // Scalarize an interleave group of address loads.
6139         for (unsigned I = 0; I < Group->getFactor(); ++I) {
6140           if (Instruction *Member = Group->getMember(I))
6141             setWideningDecision(Member, VF, CM_Scalarize,
6142                                 (VF * getMemoryInstructionCost(Member, 1)));
6143         }
6144       }
6145     } else
6146       // Make sure I gets scalarized and receives a cost estimate without
6147       // scalarization overhead.
6148       ForcedScalars[VF].insert(I);
6149   }
6150 }
6151 
6152 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
6153                                                         unsigned VF,
6154                                                         Type *&VectorTy) {
6155   Type *RetTy = I->getType();
6156   if (canTruncateToMinimalBitwidth(I, VF))
6157     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
6158   VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
6159   auto SE = PSE.getSE();
6160 
6161   // TODO: We need to estimate the cost of intrinsic calls.
6162   switch (I->getOpcode()) {
6163   case Instruction::GetElementPtr:
6164     // We mark this instruction as zero-cost because the cost of GEPs in
6165     // vectorized code depends on whether the corresponding memory instruction
6166     // is scalarized or not. Therefore, we handle GEPs with the memory
6167     // instruction cost.
6168     return 0;
6169   case Instruction::Br: {
6170     // In cases of scalarized and predicated instructions, there will be VF
6171     // predicated blocks in the vectorized loop. Each branch around these
6172     // blocks also requires an extract of its vector compare i1 element.
6173     bool ScalarPredicatedBB = false;
6174     BranchInst *BI = cast<BranchInst>(I);
6175     if (VF > 1 && BI->isConditional() &&
6176         (PredicatedBBsAfterVectorization.find(BI->getSuccessor(0)) !=
6177              PredicatedBBsAfterVectorization.end() ||
6178          PredicatedBBsAfterVectorization.find(BI->getSuccessor(1)) !=
6179              PredicatedBBsAfterVectorization.end()))
6180       ScalarPredicatedBB = true;
6181 
6182     if (ScalarPredicatedBB) {
6183       // Return cost for branches around scalarized and predicated blocks.
6184       Type *Vec_i1Ty =
6185           VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
6186       return (TTI.getScalarizationOverhead(Vec_i1Ty, false, true) +
6187               (TTI.getCFInstrCost(Instruction::Br) * VF));
6188     } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1)
6189       // The back-edge branch will remain, as will all scalar branches.
6190       return TTI.getCFInstrCost(Instruction::Br);
6191     else
6192       // This branch will be eliminated by if-conversion.
6193       return 0;
6194     // Note: We currently assume zero cost for an unconditional branch inside
6195     // a predicated block since it will become a fall-through, although we
6196     // may decide in the future to call TTI for all branches.
6197   }
6198   case Instruction::PHI: {
6199     auto *Phi = cast<PHINode>(I);
6200 
6201     // First-order recurrences are replaced by vector shuffles inside the loop.
6202     // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
6203     if (VF > 1 && Legal->isFirstOrderRecurrence(Phi))
6204       return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
6205                                 VectorTy, VF - 1, VectorType::get(RetTy, 1));
6206 
6207     // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
6208     // converted into select instructions. We require N - 1 selects per phi
6209     // node, where N is the number of incoming values.
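    // For example, an if-converted phi with three incoming values costs two
    // vector selects, each roughly of the form
    //   select <VF x i1> %mask, <VF x i32> %a, <VF x i32> %b
    // (an illustrative shape; the actual blend is emitted elsewhere).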
6210     if (VF > 1 && Phi->getParent() != TheLoop->getHeader())
6211       return (Phi->getNumIncomingValues() - 1) *
6212              TTI.getCmpSelInstrCost(
6213                  Instruction::Select, ToVectorTy(Phi->getType(), VF),
6214                  ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF));
6215 
6216     return TTI.getCFInstrCost(Instruction::PHI);
6217   }
6218   case Instruction::UDiv:
6219   case Instruction::SDiv:
6220   case Instruction::URem:
6221   case Instruction::SRem:
6222     // If we have a predicated instruction, it may not be executed for each
6223     // vector lane. Get the scalarization cost and scale this amount by the
6224     // probability of executing the predicated block. If the instruction is not
6225     // predicated, we fall through to the next case.
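    // Worked example, using the model's assumed reciprocal block probability
    // of 2 (i.e. each predicated block runs roughly half the time): for
    // VF = 4, the cost computed below is
    //   (4 * phi + 4 * div + insert/extract overhead) / 2.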
6226     if (VF > 1 && isScalarWithPredication(I)) {
6227       unsigned Cost = 0;
6228 
6229       // These instructions have a non-void type, so account for the phi nodes
6230       // that we will create. This cost is likely to be zero. The phi node
6231       // cost, if any, should be scaled by the block probability because it
6232       // models a copy at the end of each predicated block.
6233       Cost += VF * TTI.getCFInstrCost(Instruction::PHI);
6234 
6235       // The cost of the non-predicated instruction.
6236       Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy);
6237 
6238       // The cost of insertelement and extractelement instructions needed for
6239       // scalarization.
6240       Cost += getScalarizationOverhead(I, VF);
6241 
6242       // Scale the cost by the probability of executing the predicated blocks.
6243       // This assumes the predicated block for each vector lane is equally
6244       // likely.
6245       return Cost / getReciprocalPredBlockProb();
6246     }
6247     LLVM_FALLTHROUGH;
6248   case Instruction::Add:
6249   case Instruction::FAdd:
6250   case Instruction::Sub:
6251   case Instruction::FSub:
6252   case Instruction::Mul:
6253   case Instruction::FMul:
6254   case Instruction::FDiv:
6255   case Instruction::FRem:
6256   case Instruction::Shl:
6257   case Instruction::LShr:
6258   case Instruction::AShr:
6259   case Instruction::And:
6260   case Instruction::Or:
6261   case Instruction::Xor: {
6262     // Since we will replace the stride by 1 the multiplication should go away.
6263     if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
6264       return 0;
6265     // Certain instructions can be cheaper to vectorize if they have a constant
6266     // second vector operand. One example of this are shifts on x86.
6267     Value *Op2 = I->getOperand(1);
6268     TargetTransformInfo::OperandValueProperties Op2VP;
6269     TargetTransformInfo::OperandValueKind Op2VK =
6270         TTI.getOperandInfo(Op2, Op2VP);
6271     if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
6272       Op2VK = TargetTransformInfo::OK_UniformValue;
6273 
6274     SmallVector<const Value *, 4> Operands(I->operand_values());
6275     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6276     return N * TTI.getArithmeticInstrCost(
6277                    I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
6278                    Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
6279   }
6280   case Instruction::FNeg: {
6281     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6282     return N * TTI.getArithmeticInstrCost(
6283                    I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
6284                    TargetTransformInfo::OK_AnyValue,
6285                    TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
6286                    I->getOperand(0), I);
6287   }
6288   case Instruction::Select: {
6289     SelectInst *SI = cast<SelectInst>(I);
6290     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
6291     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
6292     Type *CondTy = SI->getCondition()->getType();
6293     if (!ScalarCond)
6294       CondTy = VectorType::get(CondTy, VF);
6295 
6296     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, I);
6297   }
6298   case Instruction::ICmp:
6299   case Instruction::FCmp: {
6300     Type *ValTy = I->getOperand(0)->getType();
6301     Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
6302     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
6303       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
6304     VectorTy = ToVectorTy(ValTy, VF);
6305     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, I);
6306   }
6307   case Instruction::Store:
6308   case Instruction::Load: {
6309     unsigned Width = VF;
6310     if (Width > 1) {
6311       InstWidening Decision = getWideningDecision(I, Width);
6312       assert(Decision != CM_Unknown &&
6313              "CM decision should be taken at this point");
6314       if (Decision == CM_Scalarize)
6315         Width = 1;
6316     }
6317     VectorTy = ToVectorTy(getMemInstValueType(I), Width);
6318     return getMemoryInstructionCost(I, VF);
6319   }
6320   case Instruction::ZExt:
6321   case Instruction::SExt:
6322   case Instruction::FPToUI:
6323   case Instruction::FPToSI:
6324   case Instruction::FPExt:
6325   case Instruction::PtrToInt:
6326   case Instruction::IntToPtr:
6327   case Instruction::SIToFP:
6328   case Instruction::UIToFP:
6329   case Instruction::Trunc:
6330   case Instruction::FPTrunc:
6331   case Instruction::BitCast: {
6332     // We optimize the truncation of induction variables having constant
6333     // integer steps. The cost of these truncations is the same as the scalar
6334     // operation.
6335     if (isOptimizableIVTruncate(I, VF)) {
6336       auto *Trunc = cast<TruncInst>(I);
6337       return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
6338                                   Trunc->getSrcTy(), Trunc);
6339     }
6340 
6341     Type *SrcScalarTy = I->getOperand(0)->getType();
6342     Type *SrcVecTy =
6343         VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
6344     if (canTruncateToMinimalBitwidth(I, VF)) {
6345       // This cast is going to be shrunk. This may remove the cast or it might
6346       // turn it into a slightly different cast. For example, if MinBW == 16,
6347       // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
6348       //
6349       // Calculate the modified src and dest types.
6350       Type *MinVecTy = VectorTy;
6351       if (I->getOpcode() == Instruction::Trunc) {
6352         SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
6353         VectorTy =
6354             largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6355       } else if (I->getOpcode() == Instruction::ZExt ||
6356                  I->getOpcode() == Instruction::SExt) {
6357         SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
6358         VectorTy =
6359             smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
6360       }
6361     }
6362 
6363     unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
6364     return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I);
6365   }
6366   case Instruction::Call: {
6367     bool NeedToScalarize;
6368     CallInst *CI = cast<CallInst>(I);
6369     unsigned CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
6370     if (getVectorIntrinsicIDForCall(CI, TLI))
6371       return std::min(CallCost, getVectorIntrinsicCost(CI, VF));
6372     return CallCost;
6373   }
6374   default:
6375     // The cost of executing VF copies of the scalar instruction. This opcode
6376     // is unknown. Assume that it is the same as 'mul'.
6377     return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) +
6378            getScalarizationOverhead(I, VF);
6379   } // end of switch.
6380 }
6381 
6382 char LoopVectorize::ID = 0;
6383 
6384 static const char lv_name[] = "Loop Vectorization";
6385 
6386 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
6387 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
6388 INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
6389 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
6390 INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
6391 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
6392 INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
6393 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
6394 INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
6395 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
6396 INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
6397 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
6398 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
6399 INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
6400 INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
6401 
6402 namespace llvm {
6403 
6404 Pass *createLoopVectorizePass() { return new LoopVectorize(); }
6405 
6406 Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
6407                               bool VectorizeOnlyWhenForced) {
6408   return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
6409 }
6410 
6411 } // end namespace llvm
6412 
6413 bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
6414   // Check if the pointer operand of a load or store instruction is
6415   // consecutive.
6416   if (auto *Ptr = getLoadStorePointerOperand(Inst))
6417     return Legal->isConsecutivePtr(Ptr);
6418   return false;
6419 }
6420 
6421 void LoopVectorizationCostModel::collectValuesToIgnore() {
6422   // Ignore ephemeral values.
6423   CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
6424 
6425   // Ignore type-promoting instructions we identified during reduction
6426   // detection.
6427   for (auto &Reduction : *Legal->getReductionVars()) {
6428     RecurrenceDescriptor &RedDes = Reduction.second;
6429     SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
6430     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6431   }
6432   // Ignore type-casting instructions we identified during induction
6433   // detection.
6434   for (auto &Induction : *Legal->getInductionVars()) {
6435     InductionDescriptor &IndDes = Induction.second;
6436     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6437     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
6438   }
6439 }
6440 
6441 // TODO: we could return a pair of values that specify the max VF and
6442 // min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
6443 // `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
6444 // doesn't have a cost model that can choose which plan to execute if
6445 // more than one is generated.
6446 static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
6447                                  LoopVectorizationCostModel &CM) {
6448   unsigned WidestType;
6449   std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
6450   return WidestVectorRegBits / WidestType;
6451 }
6452 
6453 VectorizationFactor
6454 LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) {
6455   unsigned VF = UserVF;
6456   // Outer loop handling: outer loops may require CFG and instruction-level
6457   // transformations before even evaluating whether vectorization is profitable.
6458   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
6459   // the vectorization pipeline.
6460   if (!OrigLoop->empty()) {
6461     // If the user doesn't provide a vectorization factor, determine a
6462     // reasonable one.
6463     if (!UserVF) {
6464       VF = determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM);
6465       LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
6466 
6467       // Make sure we have a VF > 1 for stress testing.
6468       if (VPlanBuildStressTest && VF < 2) {
6469         LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
6470                           << "overriding computed VF.\n");
6471         VF = 4;
6472       }
6473     }
6474     assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
6475     assert(isPowerOf2_32(VF) && "VF needs to be a power of two");
6476     LLVM_DEBUG(dbgs() << "LV: Using " << (UserVF ? "user " : "") << "VF " << VF
6477                       << " to build VPlans.\n");
6478     buildVPlans(VF, VF);
6479 
6480     // For VPlan build stress testing, we bail out after VPlan construction.
6481     if (VPlanBuildStressTest)
6482       return VectorizationFactor::Disabled();
6483 
6484     return {VF, 0};
6485   }
6486 
6487   LLVM_DEBUG(
6488       dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
6489                 "VPlan-native path.\n");
6490   return VectorizationFactor::Disabled();
6491 }
6492 
6493 Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF) {
6494   assert(OrigLoop->empty() && "Inner loop expected.");
6495   Optional<unsigned> MaybeMaxVF = CM.computeMaxVF();
6496   if (!MaybeMaxVF) // Cases that should not be vectorized or interleaved.
6497     return None;
6498 
6499   // Invalidate interleave groups if all blocks of loop will be predicated.
6500   if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
6501       !useMaskedInterleavedAccesses(*TTI)) {
6502     LLVM_DEBUG(
6503         dbgs()
6504         << "LV: Invalidate all interleaved groups due to fold-tail by masking "
6505            "which requires masked-interleaved support.\n");
6506     CM.InterleaveInfo.reset();
6507   }
6508 
6509   if (UserVF) {
6510     LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
6511     assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
6512     // Collect the instructions (and their associated costs) that will be more
6513     // profitable to scalarize.
6514     CM.selectUserVectorizationFactor(UserVF);
6515     buildVPlansWithVPRecipes(UserVF, UserVF);
6516     LLVM_DEBUG(printPlans(dbgs()));
6517     return {{UserVF, 0}};
6518   }
6519 
6520   unsigned MaxVF = MaybeMaxVF.getValue();
6521   assert(MaxVF != 0 && "MaxVF is zero.");
6522 
6523   for (unsigned VF = 1; VF <= MaxVF; VF *= 2) {
6524     // Collect Uniform and Scalar instructions after vectorization with VF.
6525     CM.collectUniformsAndScalars(VF);
6526 
6527     // Collect the instructions (and their associated costs) that will be more
6528     // profitable to scalarize.
6529     if (VF > 1)
6530       CM.collectInstsToScalarize(VF);
6531   }
6532 
6533   buildVPlansWithVPRecipes(1, MaxVF);
6534   LLVM_DEBUG(printPlans(dbgs()));
6535   if (MaxVF == 1)
6536     return VectorizationFactor::Disabled();
6537 
6538   // Select the optimal vectorization factor.
6539   return CM.selectVectorizationFactor(MaxVF);
6540 }
6541 
6542 void LoopVectorizationPlanner::setBestPlan(unsigned VF, unsigned UF) {
6543   LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
6544                     << '\n');
6545   BestVF = VF;
6546   BestUF = UF;
6547 
6548   erase_if(VPlans, [VF](const VPlanPtr &Plan) {
6549     return !Plan->hasVF(VF);
6550   });
6551   assert(VPlans.size() == 1 && "Best VF has not a single VPlan.");
6552 }
6553 
6554 void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
6555                                            DominatorTree *DT) {
6556   // Perform the actual loop transformation.
6557 
6558   // 1. Create a new empty loop. Unlink the old loop and connect the new one.
6559   VPCallbackILV CallbackILV(ILV);
6560 
6561   VPTransformState State{BestVF, BestUF,      LI,
6562                          DT,     ILV.Builder, ILV.VectorLoopValueMap,
6563                          &ILV,   CallbackILV};
6564   State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
6565   State.TripCount = ILV.getOrCreateTripCount(nullptr);
6566 
6567   //===------------------------------------------------===//
6568   //
6569   // Notice: any optimization or new instruction that goes
6570   // into the code below should also be implemented in
6571   // the cost-model.
6572   //
6573   //===------------------------------------------------===//
6574 
6575   // 2. Copy and widen instructions from the old loop into the new loop.
6576   assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
6577   VPlans.front()->execute(&State);
6578 
6579   // 3. Fix the vectorized code: take care of header phi's, live-outs,
6580   //    predication, updating analyses.
6581   ILV.fixVectorizedLoop();
6582 }
6583 
6584 void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
6585     SmallPtrSetImpl<Instruction *> &DeadInstructions) {
6586   BasicBlock *Latch = OrigLoop->getLoopLatch();
6587 
6588   // We create new control-flow for the vectorized loop, so the original
6589   // condition will be dead after vectorization if it's only used by the
6590   // branch.
6591   auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
6592   if (Cmp && Cmp->hasOneUse())
6593     DeadInstructions.insert(Cmp);
6594 
6595   // We create new "steps" for induction variable updates to which the original
6596   // induction variables map. An original update instruction will be dead if
6597   // all its users except the induction variable are dead.
6598   for (auto &Induction : *Legal->getInductionVars()) {
6599     PHINode *Ind = Induction.first;
6600     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
6601     if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
6602           return U == Ind || DeadInstructions.find(cast<Instruction>(U)) !=
6603                                  DeadInstructions.end();
6604         }))
6605       DeadInstructions.insert(IndUpdate);
6606 
6607     // We record as "Dead" also the type-casting instructions we had identified
6608     // during induction analysis. We don't need any handling for them in the
6609     // vectorized loop because we have proven that, under a proper runtime
6610     // test guarding the vectorized loop, the value of the phi, and the casted
6611     // value of the phi, are the same. The last instruction in this casting chain
6612     // will get its scalar/vector/widened def from the scalar/vector/widened def
6613     // of the respective phi node. Any other casts in the induction def-use chain
6614     // have no other uses outside the phi update chain, and will be ignored.
6615     InductionDescriptor &IndDes = Induction.second;
6616     const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
6617     DeadInstructions.insert(Casts.begin(), Casts.end());
6618   }
6619 }
6620 
6621 Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
6622 
6623 Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
6624 
6625 Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
6626                                         Instruction::BinaryOps BinOp) {
6627   // When unrolling and the VF is 1, we only need to add a simple scalar.
6628   Type *Ty = Val->getType();
6629   assert(!Ty->isVectorTy() && "Val must be a scalar");
6630 
6631   if (Ty->isFloatingPointTy()) {
6632     Constant *C = ConstantFP::get(Ty, (double)StartIdx);
6633 
6634     // Floating point operations had to be 'fast' to enable the unrolling.
6635     Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
6636     return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
6637   }
6638   Constant *C = ConstantInt::get(Ty, StartIdx);
6639   return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
6640 }
6641 
6642 static void AddRuntimeUnrollDisableMetaData(Loop *L) {
6643   SmallVector<Metadata *, 4> MDs;
6644   // Reserve first location for self reference to the LoopID metadata node.
6645   MDs.push_back(nullptr);
6646   bool IsUnrollMetadata = false;
6647   MDNode *LoopID = L->getLoopID();
6648   if (LoopID) {
6649     // First find existing loop unrolling disable metadata.
6650     for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
6651       auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
6652       if (MD) {
6653         const auto *S = dyn_cast<MDString>(MD->getOperand(0));
6654         IsUnrollMetadata =
6655             S && S->getString().startswith("llvm.loop.unroll.disable");
6656       }
6657       MDs.push_back(LoopID->getOperand(i));
6658     }
6659   }
6660 
6661   if (!IsUnrollMetadata) {
6662     // Add runtime unroll disable metadata.
6663     LLVMContext &Context = L->getHeader()->getContext();
6664     SmallVector<Metadata *, 1> DisableOperands;
6665     DisableOperands.push_back(
6666         MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
6667     MDNode *DisableNode = MDNode::get(Context, DisableOperands);
6668     MDs.push_back(DisableNode);
6669     MDNode *NewLoopID = MDNode::get(Context, MDs);
6670     // Set operand 0 to refer to the loop id itself.
6671     NewLoopID->replaceOperandWith(0, NewLoopID);
6672     L->setLoopID(NewLoopID);
6673   }
6674 }
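// After this runs on a loop without existing unroll metadata, the loop ends
// up with metadata roughly of the form (illustrative):
//   br i1 %cond, label %header, label %exit, !llvm.loop !0
//   !0 = distinct !{!0, !1}
//   !1 = !{!"llvm.loop.unroll.runtime.disable"}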
6675 
6676 bool LoopVectorizationPlanner::getDecisionAndClampRange(
6677     const std::function<bool(unsigned)> &Predicate, VFRange &Range) {
6678   assert(Range.End > Range.Start && "Trying to test an empty VF range.");
6679   bool PredicateAtRangeStart = Predicate(Range.Start);
6680 
6681   for (unsigned TmpVF = Range.Start * 2; TmpVF < Range.End; TmpVF *= 2)
6682     if (Predicate(TmpVF) != PredicateAtRangeStart) {
6683       Range.End = TmpVF;
6684       break;
6685     }
6686 
6687   return PredicateAtRangeStart;
6688 }
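// For example, with Range = {2, 16} and a predicate that holds for VF = 2 and
// VF = 4 but not for VF = 8, Range.End is clamped to 8 and the function
// returns true, i.e. the queried decision applies uniformly to VFs {2, 4}.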
6689 
6690 /// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
6691 /// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
6692 /// of VF's starting at a given VF and extending it as much as possible. Each
6693 /// vectorization decision can potentially shorten this sub-range during
6694 /// buildVPlan().
6695 void LoopVectorizationPlanner::buildVPlans(unsigned MinVF, unsigned MaxVF) {
6696   for (unsigned VF = MinVF; VF < MaxVF + 1;) {
6697     VFRange SubRange = {VF, MaxVF + 1};
6698     VPlans.push_back(buildVPlan(SubRange));
6699     VF = SubRange.End;
6700   }
6701 }
6702 
6703 VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
6704                                          VPlanPtr &Plan) {
6705   assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
6706 
6707   // Look for cached value.
6708   std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
6709   EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
6710   if (ECEntryIt != EdgeMaskCache.end())
6711     return ECEntryIt->second;
6712 
6713   VPValue *SrcMask = createBlockInMask(Src, Plan);
6714 
6715   // The terminator has to be a branch inst!
6716   BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
6717   assert(BI && "Unexpected terminator found");
6718 
6719   if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
6720     return EdgeMaskCache[Edge] = SrcMask;
6721 
6722   VPValue *EdgeMask = Plan->getVPValue(BI->getCondition());
6723   assert(EdgeMask && "No Edge Mask found for condition");
6724 
6725   if (BI->getSuccessor(0) != Dst)
6726     EdgeMask = Builder.createNot(EdgeMask);
6727 
6728   if (SrcMask) // Otherwise block in-mask is all-one, no need to AND.
6729     EdgeMask = Builder.createAnd(EdgeMask, SrcMask);
6730 
6731   return EdgeMaskCache[Edge] = EdgeMask;
6732 }
6733 
6734 VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
6735   assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
6736 
6737   // Look for cached value.
6738   BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
6739   if (BCEntryIt != BlockMaskCache.end())
6740     return BCEntryIt->second;
6741 
6742   // All-one mask is modelled as no-mask following the convention for masked
6743   // load/store/gather/scatter. Initialize BlockMask to no-mask.
6744   VPValue *BlockMask = nullptr;
6745 
6746   if (OrigLoop->getHeader() == BB) {
6747     if (!CM.blockNeedsPredication(BB))
6748       return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
6749 
6750     // Introduce the early-exit compare IV <= BTC to form header block mask.
6751     // This is used instead of IV < TC because TC may wrap, unlike BTC.
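    // For example, with an i8 induction and 256 iterations, TC wraps to 0 and
    // "IV < TC" would always be false, whereas BTC = 255 still yields the
    // correct header mask.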
6752     VPValue *IV = Plan->getVPValue(Legal->getPrimaryInduction());
6753     VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
6754     BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
6755     return BlockMaskCache[BB] = BlockMask;
6756   }
6757 
6758   // This is the block mask. We OR all incoming edges.
6759   for (auto *Predecessor : predecessors(BB)) {
6760     VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
6761     if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
6762       return BlockMaskCache[BB] = EdgeMask;
6763 
6764     if (!BlockMask) { // BlockMask still has its initial nullptr value.
6765       BlockMask = EdgeMask;
6766       continue;
6767     }
6768 
6769     BlockMask = Builder.createOr(BlockMask, EdgeMask);
6770   }
6771 
6772   return BlockMaskCache[BB] = BlockMask;
6773 }
6774 
6775 VPWidenMemoryInstructionRecipe *
6776 VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
6777                                   VPlanPtr &Plan) {
6778   if (!isa<LoadInst>(I) && !isa<StoreInst>(I))
6779     return nullptr;
6780 
6781   auto willWiden = [&](unsigned VF) -> bool {
6782     if (VF == 1)
6783       return false;
6784     LoopVectorizationCostModel::InstWidening Decision =
6785         CM.getWideningDecision(I, VF);
6786     assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
6787            "CM decision should be taken at this point.");
6788     if (Decision == LoopVectorizationCostModel::CM_Interleave)
6789       return true;
6790     if (CM.isScalarAfterVectorization(I, VF) ||
6791         CM.isProfitableToScalarize(I, VF))
6792       return false;
6793     return Decision != LoopVectorizationCostModel::CM_Scalarize;
6794   };
6795 
6796   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
6797     return nullptr;
6798 
6799   VPValue *Mask = nullptr;
6800   if (Legal->isMaskRequired(I))
6801     Mask = createBlockInMask(I->getParent(), Plan);
6802 
6803   VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I));
6804   return new VPWidenMemoryInstructionRecipe(*I, Addr, Mask);
6805 }
6806 
6807 VPWidenIntOrFpInductionRecipe *
6808 VPRecipeBuilder::tryToOptimizeInduction(Instruction *I, VFRange &Range) {
6809   if (PHINode *Phi = dyn_cast<PHINode>(I)) {
6810     // Check if this is an integer or fp induction. If so, build the recipe that
6811     // produces its scalar and vector values.
6812     InductionDescriptor II = Legal->getInductionVars()->lookup(Phi);
6813     if (II.getKind() == InductionDescriptor::IK_IntInduction ||
6814         II.getKind() == InductionDescriptor::IK_FpInduction)
6815       return new VPWidenIntOrFpInductionRecipe(Phi);
6816 
6817     return nullptr;
6818   }
6819 
6820   // Optimize the special case where the source is a constant integer
6821   // induction variable. Notice that we can only optimize the 'trunc' case
6822   // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
6823   // (c) other casts depend on pointer size.
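  // For example, for "%t = trunc i64 %iv to i32" where %iv is an induction
  // with a constant step, the recipe below widens the induction directly in
  // i32, so no vector truncate is needed.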
6824 
6825   // Determine whether \p K is a truncation based on an induction variable that
6826   // can be optimized.
6827   auto isOptimizableIVTruncate =
6828       [&](Instruction *K) -> std::function<bool(unsigned)> {
6829     return
6830         [=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); };
6831   };
6832 
6833   if (isa<TruncInst>(I) && LoopVectorizationPlanner::getDecisionAndClampRange(
6834                                isOptimizableIVTruncate(I), Range))
6835     return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
6836                                              cast<TruncInst>(I));
6837   return nullptr;
6838 }
6839 
6840 VPBlendRecipe *VPRecipeBuilder::tryToBlend(Instruction *I, VPlanPtr &Plan) {
6841   PHINode *Phi = dyn_cast<PHINode>(I);
6842   if (!Phi || Phi->getParent() == OrigLoop->getHeader())
6843     return nullptr;
6844 
6845   // We know that all PHIs in non-header blocks are converted into selects, so
6846   // we don't have to worry about the insertion order and we can just use the
6847   // builder. At this point we generate the predication tree. There may be
6848   // duplications since this is a simple recursive scan, but future
6849   // optimizations will clean it up.
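  // For example, a phi merging %v.then and %v.else from an if-then-else
  // diamond records one mask per incoming edge and is later lowered,
  // conceptually, to
  //   %blend = select <VF x i1> %mask.then, %v.then, %v.else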
6850 
6851   SmallVector<VPValue *, 2> Masks;
6852   unsigned NumIncoming = Phi->getNumIncomingValues();
6853   for (unsigned In = 0; In < NumIncoming; In++) {
6854     VPValue *EdgeMask =
6855       createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
6856     assert((EdgeMask || NumIncoming == 1) &&
6857            "Multiple predecessors with one having a full mask");
6858     if (EdgeMask)
6859       Masks.push_back(EdgeMask);
6860   }
6861   return new VPBlendRecipe(Phi, Masks);
6862 }
6863 
6864 bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB,
6865                                  VFRange &Range) {
6866 
6867   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
6868       [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
6869 
6870   if (IsPredicated)
6871     return false;
6872 
6873   auto IsVectorizableOpcode = [](unsigned Opcode) {
6874     switch (Opcode) {
6875     case Instruction::Add:
6876     case Instruction::And:
6877     case Instruction::AShr:
6878     case Instruction::BitCast:
6879     case Instruction::Br:
6880     case Instruction::Call:
6881     case Instruction::FAdd:
6882     case Instruction::FCmp:
6883     case Instruction::FDiv:
6884     case Instruction::FMul:
6885     case Instruction::FNeg:
6886     case Instruction::FPExt:
6887     case Instruction::FPToSI:
6888     case Instruction::FPToUI:
6889     case Instruction::FPTrunc:
6890     case Instruction::FRem:
6891     case Instruction::FSub:
6892     case Instruction::ICmp:
6893     case Instruction::IntToPtr:
6894     case Instruction::Load:
6895     case Instruction::LShr:
6896     case Instruction::Mul:
6897     case Instruction::Or:
6898     case Instruction::PHI:
6899     case Instruction::PtrToInt:
6900     case Instruction::SDiv:
6901     case Instruction::Select:
6902     case Instruction::SExt:
6903     case Instruction::Shl:
6904     case Instruction::SIToFP:
6905     case Instruction::SRem:
6906     case Instruction::Store:
6907     case Instruction::Sub:
6908     case Instruction::Trunc:
6909     case Instruction::UDiv:
6910     case Instruction::UIToFP:
6911     case Instruction::URem:
6912     case Instruction::Xor:
6913     case Instruction::ZExt:
6914       return true;
6915     }
6916     return false;
6917   };
6918 
6919   if (!IsVectorizableOpcode(I->getOpcode()))
6920     return false;
6921 
6922   if (CallInst *CI = dyn_cast<CallInst>(I)) {
6923     Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
6924     if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
6925                ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect))
6926       return false;
6927   }
6928 
6929   auto willWiden = [&](unsigned VF) -> bool {
6930     if (!isa<PHINode>(I) && (CM.isScalarAfterVectorization(I, VF) ||
6931                              CM.isProfitableToScalarize(I, VF)))
6932       return false;
6933     if (CallInst *CI = dyn_cast<CallInst>(I)) {
6934       Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
6935       // The following case may be scalarized depending on the VF.
6936       // The flag shows whether we use an intrinsic or a plain call for the
6937       // vectorized version of the instruction, i.e. whether it is more
6938       // beneficial to perform an intrinsic call than a library call.
6939       bool NeedToScalarize;
6940       unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
6941       bool UseVectorIntrinsic =
6942           ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
6943       return UseVectorIntrinsic || !NeedToScalarize;
6944     }
6945     if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
6946       assert(CM.getWideningDecision(I, VF) ==
6947                  LoopVectorizationCostModel::CM_Scalarize &&
6948              "Memory widening decisions should have been taken care by now");
6949       return false;
6950     }
6951     return true;
6952   };
6953 
6954   if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
6955     return false;
6956   // If this ingredient's recipe is to be recorded, keep its recipe a singleton
6957   // to avoid having to split recipes later.
6958   bool IsSingleton = Ingredient2Recipe.count(I);
6959 
6960   // Success: widen this instruction.
6961 
6962   // Use the default widening recipe. We optimize the common case where
6963   // consecutive instructions can be represented by a single recipe.
6964   if (!IsSingleton && !VPBB->empty() && LastExtensibleRecipe == &VPBB->back() &&
6965       LastExtensibleRecipe->appendInstruction(I))
6966     return true;
6967 
6968   VPWidenRecipe *WidenRecipe = new VPWidenRecipe(I);
6969   if (!IsSingleton)
6970     LastExtensibleRecipe = WidenRecipe;
6971   setRecipe(I, WidenRecipe);
6972   VPBB->appendRecipe(WidenRecipe);
6973   return true;
6974 }
6975 
6976 VPBasicBlock *VPRecipeBuilder::handleReplication(
6977     Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
6978     DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
6979     VPlanPtr &Plan) {
6980   bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
6981       [&](unsigned VF) { return CM.isUniformAfterVectorization(I, VF); },
6982       Range);
6983 
6984   bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
6985       [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
6986 
6987   auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated);
6988   setRecipe(I, Recipe);
6989 
6990   // Find if I uses a predicated instruction. If so, it will use its scalar
6991   // value. Avoid hoisting the insert-element which packs the scalar value into
6992   // a vector value, as that happens iff all users use the vector value.
6993   for (auto &Op : I->operands())
6994     if (auto *PredInst = dyn_cast<Instruction>(Op))
6995       if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
6996         PredInst2Recipe[PredInst]->setAlsoPack(false);
6997 
6998   // Finalize the recipe for Instr, first if it is not predicated.
6999   if (!IsPredicated) {
7000     LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
7001     VPBB->appendRecipe(Recipe);
7002     return VPBB;
7003   }
7004   LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
7005   assert(VPBB->getSuccessors().empty() &&
7006          "VPBB has successors when handling predicated replication.");
7007   // Record predicated instructions for above packing optimizations.
7008   PredInst2Recipe[I] = Recipe;
7009   VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
7010   VPBlockUtils::insertBlockAfter(Region, VPBB);
7011   auto *RegSucc = new VPBasicBlock();
7012   VPBlockUtils::insertBlockAfter(RegSucc, Region);
7013   return RegSucc;
7014 }
7015 
7016 VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
7017                                                       VPRecipeBase *PredRecipe,
7018                                                       VPlanPtr &Plan) {
7019   // Instructions marked for predication are replicated and placed under an
7020   // if-then construct to prevent side-effects.
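  // For a predicated store, the region built below has the shape (sketch):
  //   pred.store.entry:    BRANCH-ON-MASK (BlockInMask)
  //   pred.store.if:       the replicated scalar instruction (PredRecipe)
  //   pred.store.continue: PHI recipe merging the result (omitted for void)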
7021 
7022   // Generate recipes to compute the block mask for this region.
7023   VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
7024 
7025   // Build the triangular if-then region.
7026   std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
7027   assert(Instr->getParent() && "Predicated instruction not in any basic block");
7028   auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
7029   auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
7030   auto *PHIRecipe =
7031       Instr->getType()->isVoidTy() ? nullptr : new VPPredInstPHIRecipe(Instr);
7032   auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
7033   auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
7034   VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
7035 
7036   // Note: first set Entry as region entry and then connect successors starting
7037   // from it in order, to propagate the "parent" of each VPBasicBlock.
7038   VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
7039   VPBlockUtils::connectBlocks(Pred, Exit);
7040 
7041   return Region;
7042 }
7043 
7044 bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range,
7045                                         VPlanPtr &Plan, VPBasicBlock *VPBB) {
7046   VPRecipeBase *Recipe = nullptr;
7047 
7048   // First, check for specific widening recipes that deal with memory
7049   // operations, inductions and Phi nodes.
7050   if ((Recipe = tryToWidenMemory(Instr, Range, Plan)) ||
7051       (Recipe = tryToOptimizeInduction(Instr, Range)) ||
7052       (Recipe = tryToBlend(Instr, Plan)) ||
7053       (isa<PHINode>(Instr) &&
7054        (Recipe = new VPWidenPHIRecipe(cast<PHINode>(Instr))))) {
7055     setRecipe(Instr, Recipe);
7056     VPBB->appendRecipe(Recipe);
7057     return true;
7058   }
7059 
7060   // Handle GEP widening.
7061   if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Instr)) {
7062     auto Scalarize = [&](unsigned VF) {
7063       return CM.isScalarWithPredication(Instr, VF) ||
7064              CM.isScalarAfterVectorization(Instr, VF) ||
7065              CM.isProfitableToScalarize(Instr, VF);
7066     };
7067     if (LoopVectorizationPlanner::getDecisionAndClampRange(Scalarize, Range))
7068       return false;
7069     VPWidenGEPRecipe *Recipe = new VPWidenGEPRecipe(GEP, OrigLoop);
7070     setRecipe(Instr, Recipe);
7071     VPBB->appendRecipe(Recipe);
7072     return true;
7073   }
7074 
7075   // Check if Instr is to be widened by a general VPWidenRecipe, after
7076   // having first checked for specific widening recipes.
7077   if (tryToWiden(Instr, VPBB, Range))
7078     return true;
7079 
7080   return false;
7081 }
7082 
7083 void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
7084                                                         unsigned MaxVF) {
7085   assert(OrigLoop->empty() && "Inner loop expected.");
7086 
7087   // Collect conditions feeding internal conditional branches; they need to be
7088   // represented in VPlan for it to model masking.
7089   SmallPtrSet<Value *, 1> NeedDef;
7090 
7091   auto *Latch = OrigLoop->getLoopLatch();
7092   for (BasicBlock *BB : OrigLoop->blocks()) {
7093     if (BB == Latch)
7094       continue;
7095     BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
7096     if (Branch && Branch->isConditional())
7097       NeedDef.insert(Branch->getCondition());
7098   }
7099 
7100   // If the tail is to be folded by masking, the primary induction variable
7101   // needs to be represented in VPlan for it to model early-exit masking.
7102   // Also, both the Phi and the live-out instruction of each reduction are
7103   // required in order to introduce a select between them in VPlan.
7104   if (CM.foldTailByMasking()) {
7105     NeedDef.insert(Legal->getPrimaryInduction());
7106     for (auto &Reduction : *Legal->getReductionVars()) {
7107       NeedDef.insert(Reduction.first);
7108       NeedDef.insert(Reduction.second.getLoopExitInstr());
7109     }
7110   }
7111 
7112   // Collect instructions from the original loop that will become trivially dead
7113   // in the vectorized loop. We don't need to vectorize these instructions. For
7114   // example, original induction update instructions can become dead because we
7115   // separately emit induction "steps" when generating code for the new loop.
7116   // Similarly, we create a new latch condition when setting up the structure
7117   // of the new loop, so the old one can become dead.
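  // For example, in a canonical loop the original "%iv.next = add %iv, 1" and
  // the latch compare that feeds the backedge branch are both collected here,
  // since the vectorized loop emits its own induction steps and latch
  // condition.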
7118   SmallPtrSet<Instruction *, 4> DeadInstructions;
7119   collectTriviallyDeadInstructions(DeadInstructions);
7120 
7121   DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
7122   // Dead instructions do not need sinking. Remove them from SinkAfter.
7123   for (Instruction *I : DeadInstructions)
7124     SinkAfter.erase(I);
7125 
7126   for (unsigned VF = MinVF; VF < MaxVF + 1;) {
7127     VFRange SubRange = {VF, MaxVF + 1};
7128     VPlans.push_back(buildVPlanWithVPRecipes(SubRange, NeedDef,
7129                                              DeadInstructions, SinkAfter));
7130     VF = SubRange.End;
7131   }
7132 }
7133 
7134 VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
7135     VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
7136     SmallPtrSetImpl<Instruction *> &DeadInstructions,
7137     const DenseMap<Instruction *, Instruction *> &SinkAfter) {
7138 
7139   // Hold a mapping from predicated instructions to their recipes, in order to
7140   // fix their AlsoPack behavior if a user is determined to replicate and use a
7141   // scalar instead of a vector value.
7142   DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
7143 
7144   SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
7145 
7146   VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder);
7147 
7148   // ---------------------------------------------------------------------------
7149   // Pre-construction: record ingredients whose recipes we'll need to further
7150   // process after constructing the initial VPlan.
7151   // ---------------------------------------------------------------------------
7152 
7153   // Mark instructions we'll need to sink later and their targets as
7154   // ingredients whose recipe we'll need to record.
7155   for (auto &Entry : SinkAfter) {
7156     RecipeBuilder.recordRecipeOf(Entry.first);
7157     RecipeBuilder.recordRecipeOf(Entry.second);
7158   }
7159 
7160   // For each interleave group which is relevant for this (possibly trimmed)
7161   // Range, add it to the set of groups to be later applied to the VPlan and add
7162   // placeholders for its members' Recipes which we'll be replacing with a
7163   // single VPInterleaveRecipe.
7164   for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
7165     auto applyIG = [IG, this](unsigned VF) -> bool {
7166       return (VF >= 2 && // Query is illegal for VF == 1
7167               CM.getWideningDecision(IG->getInsertPos(), VF) ==
7168                   LoopVectorizationCostModel::CM_Interleave);
7169     };
7170     if (!getDecisionAndClampRange(applyIG, Range))
7171       continue;
7172     InterleaveGroups.insert(IG);
7173     for (unsigned i = 0; i < IG->getFactor(); i++)
7174       if (Instruction *Member = IG->getMember(i))
7175         RecipeBuilder.recordRecipeOf(Member);
7176   }
7177 
7178   // ---------------------------------------------------------------------------
7179   // Build initial VPlan: Scan the body of the loop in a topological order to
7180   // visit each basic block after having visited its predecessor basic blocks.
7181   // ---------------------------------------------------------------------------
7182 
7183   // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
7184   VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
7185   auto Plan = std::make_unique<VPlan>(VPBB);
7186 
7187   // Represent values that will have defs inside VPlan.
7188   for (Value *V : NeedDef)
7189     Plan->addVPValue(V);
7190 
7191   // Scan the body of the loop in a topological order to visit each basic block
7192   // after having visited its predecessor basic blocks.
7193   LoopBlocksDFS DFS(OrigLoop);
7194   DFS.perform(LI);
7195 
7196   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
7197     // Relevant instructions from basic block BB will be grouped into VPRecipe
7198     // ingredients and fill a new VPBasicBlock.
7199     unsigned VPBBsForBB = 0;
7200     auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
7201     VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
7202     VPBB = FirstVPBBForBB;
7203     Builder.setInsertPoint(VPBB);
7204 
7205     // Introduce each ingredient into VPlan.
7206     for (Instruction &I : BB->instructionsWithoutDebug()) {
7207       Instruction *Instr = &I;
7208 
7209       // First filter out irrelevant instructions, to ensure no recipes are
7210       // built for them.
7211       if (isa<BranchInst>(Instr) ||
7212           DeadInstructions.find(Instr) != DeadInstructions.end())
7213         continue;
7214 
7215       if (RecipeBuilder.tryToCreateRecipe(Instr, Range, Plan, VPBB))
7216         continue;
7217 
7218       // Otherwise, if all widening options failed, the instruction is to be
7219       // replicated. This may create a successor for VPBB.
7220       VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
7221           Instr, Range, VPBB, PredInst2Recipe, Plan);
7222       if (NextVPBB != VPBB) {
7223         VPBB = NextVPBB;
7224         VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
7225                                     : "");
7226       }
7227     }
7228   }
7229 
7230   // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
7231   // may also be empty, such as the last one (VPBB), reflecting original
7232   // basic blocks with no recipes.
7233   VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
7234   assert(PreEntry->empty() && "Expecting empty pre-entry block.");
7235   VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
7236   VPBlockUtils::disconnectBlocks(PreEntry, Entry);
7237   delete PreEntry;
7238 
7239   // ---------------------------------------------------------------------------
7240   // Transform initial VPlan: Apply previously taken decisions, in order, to
7241   // bring the VPlan to its final state.
7242   // ---------------------------------------------------------------------------
7243 
7244   // Apply Sink-After legal constraints.
7245   for (auto &Entry : SinkAfter) {
7246     VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
7247     VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
7248     Sink->moveAfter(Target);
7249   }
7250 
7251   // Interleave memory: for each Interleave Group we marked earlier as relevant
7252   // for this VPlan, replace the Recipes widening its memory instructions with a
7253   // single VPInterleaveRecipe at its insertion point.
7254   for (auto IG : InterleaveGroups) {
7255     auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
7256         RecipeBuilder.getRecipe(IG->getInsertPos()));
7257     (new VPInterleaveRecipe(IG, Recipe->getAddr(), Recipe->getMask()))
7258         ->insertBefore(Recipe);
7259 
7260     for (unsigned i = 0; i < IG->getFactor(); ++i)
7261       if (Instruction *Member = IG->getMember(i)) {
7262         RecipeBuilder.getRecipe(Member)->eraseFromParent();
7263       }
7264   }
7265 
7266   // Finally, if tail is folded by masking, introduce selects between the phi
7267   // and the live-out instruction of each reduction, at the end of the latch.
7268   if (CM.foldTailByMasking()) {
7269     Builder.setInsertPoint(VPBB);
7270     auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
7271     for (auto &Reduction : *Legal->getReductionVars()) {
7272       VPValue *Phi = Plan->getVPValue(Reduction.first);
7273       VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr());
7274       Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
7275     }
7276   }
7277 
7278   std::string PlanName;
7279   raw_string_ostream RSO(PlanName);
7280   unsigned VF = Range.Start;
7281   Plan->addVF(VF);
7282   RSO << "Initial VPlan for VF={" << VF;
7283   for (VF *= 2; VF < Range.End; VF *= 2) {
7284     Plan->addVF(VF);
7285     RSO << "," << VF;
7286   }
7287   RSO << "},UF>=1";
7288   RSO.flush();
7289   Plan->setName(PlanName);
7290 
7291   return Plan;
7292 }
7293 
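// Build a bare VPlan for the VPlan-native (outer loop) path: construct the
// hierarchical CFG and, unless VPlan predication is enabled, lower the
// VPInstructions to recipes.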
7294 VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
7295   // Outer loop handling: They may require CFG and instruction level
7296   // transformations before even evaluating whether vectorization is profitable.
7297   // Since we cannot modify the incoming IR, we need to build VPlan upfront in
7298   // the vectorization pipeline.
7299   assert(!OrigLoop->empty());
7300   assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
7301 
7302   // Create new empty VPlan
7303   auto Plan = std::make_unique<VPlan>();
7304 
7305   // Build hierarchical CFG
7306   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
7307   HCFGBuilder.buildHierarchicalCFG();
7308 
7309   for (unsigned VF = Range.Start; VF < Range.End; VF *= 2)
7310     Plan->addVF(VF);
7311 
7312   if (EnableVPlanPredication) {
7313     VPlanPredicator VPP(*Plan);
7314     VPP.predicate();
7315 
7316     // Avoid running transformation to recipes until masked code generation in
7317     // VPlan-native path is in place.
7318     return Plan;
7319   }
7320 
7321   SmallPtrSet<Instruction *, 1> DeadInstructions;
7322   VPlanTransforms::VPInstructionsToVPRecipes(
7323       OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions);
7324   return Plan;
7325 }
7326 
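// VPCallbackILV forwards value-creation requests made during VPlan execution
// to the InnerLoopVectorizer's per-part and per-lane value maps.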
7327 Value* LoopVectorizationPlanner::VPCallbackILV::
7328 getOrCreateVectorValues(Value *V, unsigned Part) {
7329   return ILV.getOrCreateVectorValue(V, Part);
7330 }
7331 
7332 Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue(
7333     Value *V, const VPIteration &Instance) {
7334   return ILV.getOrCreateScalarValue(V, Instance);
7335 }
7336 
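// Print the interleave group handled by this recipe: its factor, insert
// position, address, optional mask, and each of its members, one per line.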
7337 void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const {
7338   O << " +\n"
7339     << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
7340   IG->getInsertPos()->printAsOperand(O, false);
7341   O << ", ";
7342   getAddr()->printAsOperand(O);
7343   VPValue *Mask = getMask();
7344   if (Mask) {
7345     O << ", ";
7346     Mask->printAsOperand(O);
7347   }
7348   O << "\\l\"";
7349   for (unsigned i = 0; i < IG->getFactor(); ++i)
7350     if (Instruction *I = IG->getMember(i))
7351       O << " +\n"
7352         << Indent << "\"  " << VPlanIngredient(I) << " " << i << "\\l\"";
7353 }
7354 
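// Widen each ingredient instruction in the range covered by this recipe.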
7355 void VPWidenRecipe::execute(VPTransformState &State) {
7356   for (auto &Instr : make_range(Begin, End))
7357     State.ILV->widenInstruction(Instr);
7358 }
7359 
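// Widen the GEP across UF and VF, using the recorded loop-invariance of its
// pointer operand and indices.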
7360 void VPWidenGEPRecipe::execute(VPTransformState &State) {
7361   State.ILV->widenGEP(GEP, State.UF, State.VF, IsPtrLoopInvariant,
7362                       IsIndexLoopInvariant);
7363 }
7364 
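// Widen an integer or floating-point induction, applying the optional
// truncation Trunc.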
7365 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
7366   assert(!State.Instance && "Int or FP induction being replicated.");
7367   State.ILV->widenIntOrFpInduction(IV, Trunc);
7368 }
7369 
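// Widen the header phi across all unrolled parts and vector lanes.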
7370 void VPWidenPHIRecipe::execute(VPTransformState &State) {
7371   State.ILV->widenPHIInstruction(Phi, State.UF, State.VF);
7372 }
7373 
7374 void VPBlendRecipe::execute(VPTransformState &State) {
7375   State.ILV->setDebugLocFromInst(State.Builder, Phi);
7376   // We know that all PHIs in non-header blocks are converted into
7377   // selects, so we don't have to worry about the insertion order and we
7378   // can just use the builder.
7379   // At this point we generate the predication tree. There may be
7380   // duplications since this is a simple recursive scan, but future
7381   // optimizations will clean it up.
7382 
7383   unsigned NumIncoming = Phi->getNumIncomingValues();
7384 
7385   assert((User || NumIncoming == 1) &&
7386          "Multiple predecessors with predecessors having a full mask");
7387   // Generate a sequence of selects of the form:
7388   // SELECT(Mask3, In3,
7389   //      SELECT(Mask2, In2,
7390   //                   ( ...)))
7391   InnerLoopVectorizer::VectorParts Entry(State.UF);
7392   for (unsigned In = 0; In < NumIncoming; ++In) {
7393     for (unsigned Part = 0; Part < State.UF; ++Part) {
7394       // We might have single edge PHIs (blocks) - use an identity
7395       // 'select' for the first PHI operand.
7396       Value *In0 =
7397           State.ILV->getOrCreateVectorValue(Phi->getIncomingValue(In), Part);
7398       if (In == 0)
7399         Entry[Part] = In0; // Initialize with the first incoming value.
7400       else {
7401         // Select between the current value and the previous incoming edge
7402         // based on the incoming mask.
7403         Value *Cond = State.get(User->getOperand(In), Part);
7404         Entry[Part] =
7405             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
7406       }
7407     }
7408   }
7409   for (unsigned Part = 0; Part < State.UF; ++Part)
7410     State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
7411 }
7412 
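// Emit the wide loads/stores for the whole interleave group at its insert
// position, using the recipe's address and optional mask operands.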
7413 void VPInterleaveRecipe::execute(VPTransformState &State) {
7414   assert(!State.Instance && "Interleave group being replicated.");
7415   State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), State, getAddr(),
7416                                       getMask());
7417 }
7418 
7419 void VPReplicateRecipe::execute(VPTransformState &State) {
7420   if (State.Instance) { // Generate a single instance.
7421     State.ILV->scalarizeInstruction(Ingredient, *State.Instance, IsPredicated);
7422     // Insert scalar instance packing it into a vector.
7423     if (AlsoPack && State.VF > 1) {
7424       // If we're constructing lane 0, initialize to start from undef.
7425       if (State.Instance->Lane == 0) {
7426         Value *Undef =
7427             UndefValue::get(VectorType::get(Ingredient->getType(), State.VF));
7428         State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
7429       }
7430       State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
7431     }
7432     return;
7433   }
7434 
7435   // Generate scalar instances for all VF lanes of all UF parts, unless the
7436   // instruction is uniform, in which case generate only the first lane for
7437   // each of the UF parts.
7438   unsigned EndLane = IsUniform ? 1 : State.VF;
7439   for (unsigned Part = 0; Part < State.UF; ++Part)
7440     for (unsigned Lane = 0; Lane < EndLane; ++Lane)
7441       State.ILV->scalarizeInstruction(Ingredient, {Part, Lane}, IsPredicated);
7442 }
7443 
7444 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
7445   assert(State.Instance && "Branch on Mask works only on single instance.");
7446 
7447   unsigned Part = State.Instance->Part;
7448   unsigned Lane = State.Instance->Lane;
7449 
7450   Value *ConditionBit = nullptr;
7451   if (!User) // Block in mask is all-one.
7452     ConditionBit = State.Builder.getTrue();
7453   else {
7454     VPValue *BlockInMask = User->getOperand(0);
7455     ConditionBit = State.get(BlockInMask, Part);
7456     if (ConditionBit->getType()->isVectorTy())
7457       ConditionBit = State.Builder.CreateExtractElement(
7458           ConditionBit, State.Builder.getInt32(Lane));
7459   }
7460 
7461   // Replace the temporary unreachable terminator with a new conditional branch,
7462   // whose two destinations will be set later when they are created.
7463   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
7464   assert(isa<UnreachableInst>(CurrentTerminator) &&
7465          "Expected to replace unreachable terminator with conditional branch.");
7466   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
7467   CondBr->setSuccessor(0, nullptr);
7468   ReplaceInstWithInst(CurrentTerminator, CondBr);
7469 }
7470 
7471 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
7472   assert(State.Instance && "Predicated instruction PHI works per instance.");
7473   Instruction *ScalarPredInst = cast<Instruction>(
7474       State.ValueMap.getScalarValue(PredInst, *State.Instance));
7475   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
7476   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
7477   assert(PredicatingBB && "Predicated block has no single predecessor.");
7478 
7479   // By current pack/unpack logic we need to generate only a single phi node: if
7480   // a vector value for the predicated instruction exists at this point it means
7481   // the instruction has vector users only, and a phi for the vector value is
7482   // needed. In this case the recipe of the predicated instruction is marked to
7483   // also do that packing, thereby "hoisting" the insert-element sequence.
7484   // Otherwise, a phi node for the scalar value is needed.
7485   unsigned Part = State.Instance->Part;
7486   if (State.ValueMap.hasVectorValue(PredInst, Part)) {
7487     Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
7488     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
7489     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
7490     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
7491     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
7492     State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
7493   } else {
7494     Type *PredInstType = PredInst->getType();
7495     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
7496     Phi->addIncoming(UndefValue::get(ScalarPredInst->getType()), PredicatingBB);
7497     Phi->addIncoming(ScalarPredInst, PredicatedBB);
7498     State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
7499   }
7500 }
7501 
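// Widen the memory instruction, passing the VPlan-defined address and optional
// mask operands along to the InnerLoopVectorizer.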
7502 void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
7503   State.ILV->vectorizeMemoryInstruction(&Instr, State, getAddr(), getMask());
7504 }
7505 
7506 // Determine how to lower the scalar epilogue, which depends on 1) optimising
7507 // for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
7508 // predication, and 4) a TTI hook that analyses whether the loop is suitable
7509 // for predication.
7510 static ScalarEpilogueLowering getScalarEpilogueLowering(
7511     Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
7512     BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
7513     AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
7514     LoopVectorizationLegality &LVL) {
7515   bool OptSize =
7516       F->hasOptSize() || llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
7517                                                      PGSOQueryType::IRPass);
7518   // 1) OptSize takes precedence over all other options, i.e. if this is set,
7519   // don't look at hints or options, and don't request a scalar epilogue.
7520   if (OptSize && Hints.getForce() != LoopVectorizeHints::FK_Enabled)
7521     return CM_ScalarEpilogueNotAllowedOptSize;
7522 
7523   bool PredicateOptDisabled = PreferPredicateOverEpilog.getNumOccurrences() &&
7524                               !PreferPredicateOverEpilog;
7525 
7526   // 2) Next, if disabling predication is requested on the command line, honour
7527   // this and request a scalar epilogue. Also do this if we don't have a
7528   // primary induction variable, which is required for predication.
7529   if (PredicateOptDisabled || !LVL.getPrimaryInduction())
7530     return CM_ScalarEpilogueAllowed;
7531 
7532   // 3) and 4): if enabling predication is requested on the command line or
7533   // with a loop hint, or if the TTI hook indicates it is profitable, request
7534   // predication.
7535   if (PreferPredicateOverEpilog ||
7536       Hints.getPredicate() == LoopVectorizeHints::FK_Enabled ||
7537       (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
7538                                         LVL.getLAI()) &&
7539        Hints.getPredicate() != LoopVectorizeHints::FK_Disabled))
7540     return CM_ScalarEpilogueNotNeededUsePredicate;
7541 
7542   return CM_ScalarEpilogueAllowed;
7543 }
7544 
7545 // Process the loop in the VPlan-native vectorization path. This path builds
7546 // VPlan upfront in the vectorization pipeline, which allows applying
7547 // VPlan-to-VPlan transformations from the very beginning without modifying the
7548 // input LLVM IR.
7549 static bool processLoopInVPlanNativePath(
7550     Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
7551     LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
7552     TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
7553     OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
7554     ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {
7555 
7556   assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
7557   Function *F = L->getHeader()->getParent();
7558   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
7559 
7560   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
7561       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
7562 
7563   LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
7564                                 &Hints, IAI);
7565   // Use the planner for outer loop vectorization.
7566   // TODO: CM is not used at this point inside the planner. Turn CM into an
7567   // optional argument if we don't need it in the future.
7568   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI);
7569 
7570   // Get user vectorization factor.
7571   const unsigned UserVF = Hints.getWidth();
7572 
7573   // Plan how to best vectorize, return the best VF and its cost.
7574   const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
7575 
7576   // If we are stress testing VPlan builds, do not attempt to generate vector
7577   // code. Masked vector code generation support will follow soon.
7578   // Also, do not attempt to vectorize if no vector code will be produced.
7579   if (VPlanBuildStressTest || EnableVPlanPredication ||
7580       VectorizationFactor::Disabled() == VF)
7581     return false;
7582 
7583   LVP.setBestPlan(VF.Width, 1);
7584 
7585   InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
7586                          &CM);
7587   LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
7588                     << L->getHeader()->getParent()->getName() << "\"\n");
7589   LVP.executePlan(LB, DT);
7590 
7591   // Mark the loop as already vectorized to avoid vectorizing again.
7592   Hints.setAlreadyVectorized();
7593 
7594   LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
7595   return true;
7596 }
7597 
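// Main per-loop driver: check hints and legality, pick the scalar-epilogue
// lowering, dispatch outer loops to the VPlan-native path, run the cost model
// and planner, and finally emit a vectorized and/or interleaved loop together
// with the corresponding optimization remarks. Returns true if the IR changed.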
7598 bool LoopVectorizePass::processLoop(Loop *L) {
7599   assert((EnableVPlanNativePath || L->empty()) &&
7600          "VPlan-native path is not enabled. Only process inner loops.");
7601 
7602 #ifndef NDEBUG
7603   const std::string DebugLocStr = getDebugLocString(L);
7604 #endif /* NDEBUG */
7605 
7606   LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
7607                     << L->getHeader()->getParent()->getName() << "\" from "
7608                     << DebugLocStr << "\n");
7609 
7610   LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);
7611 
7612   LLVM_DEBUG(
7613       dbgs() << "LV: Loop hints:"
7614              << " force="
7615              << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
7616                      ? "disabled"
7617                      : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
7618                             ? "enabled"
7619                             : "?"))
7620              << " width=" << Hints.getWidth()
7621              << " unroll=" << Hints.getInterleave() << "\n");
7622 
7623   // Function containing loop
7624   Function *F = L->getHeader()->getParent();
7625 
7626   // Looking at the diagnostic output is the only way to determine if a loop
7627   // was vectorized (other than looking at the IR or machine code), so it
7628   // is important to generate an optimization remark for each loop. Most of
7629   // these messages are generated as OptimizationRemarkAnalysis. Remarks
7630   // generated as OptimizationRemark and OptimizationRemarkMissed are
7631   // less verbose, reporting vectorized loops and unvectorized loops that may
7632   // benefit from vectorization, respectively.
7633 
7634   if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
7635     LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
7636     return false;
7637   }
7638 
7639   PredicatedScalarEvolution PSE(*SE, *L);
7640 
7641   // Check if it is legal to vectorize the loop.
7642   LoopVectorizationRequirements Requirements(*ORE);
7643   LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
7644                                 &Requirements, &Hints, DB, AC);
7645   if (!LVL.canVectorize(EnableVPlanNativePath)) {
7646     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
7647     Hints.emitRemarkWithHints();
7648     return false;
7649   }
7650 
7651   // Check the function attributes and profiles to find out if this function
7652   // should be optimized for size.
7653   ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
7654       F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);
7655 
7656   // Entrance to the VPlan-native vectorization path. Outer loops are processed
7657   // here. They may require CFG and instruction level transformations before
7658   // even evaluating whether vectorization is profitable. Since we cannot modify
7659   // the incoming IR, we need to build VPlan upfront in the vectorization
7660   // pipeline.
7661   if (!L->empty())
7662     return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
7663                                         ORE, BFI, PSI, Hints);
7664 
7665   assert(L->empty() && "Inner loop expected.");
7666 
7667   // Check the loop for a trip count threshold: vectorize loops with a tiny trip
7668   // count by optimizing for size, to minimize overheads.
7669   auto ExpectedTC = getSmallBestKnownTC(*SE, L);
7670   if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
7671     LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
7672                       << "This loop is worth vectorizing only if no scalar "
7673                       << "iteration overheads are incurred.");
7674     if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
7675       LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
7676     else {
7677       LLVM_DEBUG(dbgs() << "\n");
7678       SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
7679     }
7680   }
7681 
7682   // Check the function attributes to see if implicit floats are allowed.
7683   // FIXME: This check doesn't seem possibly correct -- what if the loop is
7684   // an integer loop and the vector instructions selected are purely integer
7685   // vector instructions?
7686   if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
7687     reportVectorizationFailure(
7688         "Can't vectorize when the NoImplicitFloat attribute is used",
7689         "loop not vectorized due to NoImplicitFloat attribute",
7690         "NoImplicitFloat", ORE, L);
7691     Hints.emitRemarkWithHints();
7692     return false;
7693   }
7694 
7695   // Check if the target supports potentially unsafe FP vectorization.
7696   // FIXME: Add a check for the type of safety issue (denormal, signaling)
7697   // for the target we're vectorizing for, to make sure none of the
7698   // additional fp-math flags can help.
7699   if (Hints.isPotentiallyUnsafe() &&
7700       TTI->isFPVectorizationPotentiallyUnsafe()) {
7701     reportVectorizationFailure(
7702         "Potentially unsafe FP op prevents vectorization",
7703         "loop not vectorized due to unsafe FP support.",
7704         "UnsafeFP", ORE, L);
7705     Hints.emitRemarkWithHints();
7706     return false;
7707   }
7708 
7709   bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
7710   InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
7711 
7712   // If an override option has been passed in for interleaved accesses, use it.
7713   if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
7714     UseInterleaved = EnableInterleavedMemAccesses;
7715 
7716   // Analyze interleaved memory accesses.
7717   if (UseInterleaved) {
7718     IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
7719   }
7720 
7721   // Use the cost model.
7722   LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
7723                                 F, &Hints, IAI);
7724   CM.collectValuesToIgnore();
7725 
7726   // Use the planner for vectorization.
7727   LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI);
7728 
7729   // Get user vectorization factor.
7730   unsigned UserVF = Hints.getWidth();
7731 
7732   // Plan how to best vectorize, return the best VF and its cost.
7733   Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF);
7734 
7735   VectorizationFactor VF = VectorizationFactor::Disabled();
7736   unsigned IC = 1;
7737   unsigned UserIC = Hints.getInterleave();
7738 
7739   if (MaybeVF) {
7740     VF = *MaybeVF;
7741     // Select the interleave count.
7742     IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
7743   }
7744 
7745   // Identify the diagnostic messages that should be produced.
7746   std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
7747   bool VectorizeLoop = true, InterleaveLoop = true;
7748   if (Requirements.doesNotMeet(F, L, Hints)) {
7749     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
7750                          "requirements.\n");
7751     Hints.emitRemarkWithHints();
7752     return false;
7753   }
7754 
7755   if (VF.Width == 1) {
7756     LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
7757     VecDiagMsg = std::make_pair(
7758         "VectorizationNotBeneficial",
7759         "the cost-model indicates that vectorization is not beneficial");
7760     VectorizeLoop = false;
7761   }
7762 
7763   if (!MaybeVF && UserIC > 1) {
7764     // Tell the user interleaving was avoided up-front, despite being explicitly
7765     // requested.
7766     LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
7767                          "interleaving should be avoided up front\n");
7768     IntDiagMsg = std::make_pair(
7769         "InterleavingAvoided",
7770         "Ignoring UserIC, because interleaving was avoided up front");
7771     InterleaveLoop = false;
7772   } else if (IC == 1 && UserIC <= 1) {
7773     // Tell the user interleaving is not beneficial.
7774     LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
7775     IntDiagMsg = std::make_pair(
7776         "InterleavingNotBeneficial",
7777         "the cost-model indicates that interleaving is not beneficial");
7778     InterleaveLoop = false;
7779     if (UserIC == 1) {
7780       IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
7781       IntDiagMsg.second +=
7782           " and is explicitly disabled or interleave count is set to 1";
7783     }
7784   } else if (IC > 1 && UserIC == 1) {
7785     // Tell the user interleaving is beneficial, but it is explicitly disabled.
7786     LLVM_DEBUG(
7787         dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
7788     IntDiagMsg = std::make_pair(
7789         "InterleavingBeneficialButDisabled",
7790         "the cost-model indicates that interleaving is beneficial "
7791         "but is explicitly disabled or interleave count is set to 1");
7792     InterleaveLoop = false;
7793   }
7794 
7795   // Override IC if user provided an interleave count.
7796   IC = UserIC > 0 ? UserIC : IC;
7797 
7798   // Emit diagnostic messages, if any.
7799   const char *VAPassName = Hints.vectorizeAnalysisPassName();
7800   if (!VectorizeLoop && !InterleaveLoop) {
7801     // Do not vectorize or interleave the loop.
7802     ORE->emit([&]() {
7803       return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
7804                                       L->getStartLoc(), L->getHeader())
7805              << VecDiagMsg.second;
7806     });
7807     ORE->emit([&]() {
7808       return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
7809                                       L->getStartLoc(), L->getHeader())
7810              << IntDiagMsg.second;
7811     });
7812     return false;
7813   } else if (!VectorizeLoop && InterleaveLoop) {
7814     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
7815     ORE->emit([&]() {
7816       return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
7817                                         L->getStartLoc(), L->getHeader())
7818              << VecDiagMsg.second;
7819     });
7820   } else if (VectorizeLoop && !InterleaveLoop) {
7821     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
7822                       << ") in " << DebugLocStr << '\n');
7823     ORE->emit([&]() {
7824       return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
7825                                         L->getStartLoc(), L->getHeader())
7826              << IntDiagMsg.second;
7827     });
7828   } else if (VectorizeLoop && InterleaveLoop) {
7829     LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
7830                       << ") in " << DebugLocStr << '\n');
7831     LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
7832   }
7833 
7834   LVP.setBestPlan(VF.Width, IC);
7835 
7836   using namespace ore;
7837   bool DisableRuntimeUnroll = false;
7838   MDNode *OrigLoopID = L->getLoopID();
7839 
7840   if (!VectorizeLoop) {
7841     assert(IC > 1 && "interleave count should not be 1 or 0");
7842     // If we decided that it is not legal to vectorize the loop, then
7843     // interleave it.
7844     InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
7845                                &CM);
7846     LVP.executePlan(Unroller, DT);
7847 
7848     ORE->emit([&]() {
7849       return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
7850                                 L->getHeader())
7851              << "interleaved loop (interleaved count: "
7852              << NV("InterleaveCount", IC) << ")";
7853     });
7854   } else {
7855     // If we decided that it is *legal* to vectorize the loop, then do it.
7856     InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC,
7857                            &LVL, &CM);
7858     LVP.executePlan(LB, DT);
7859     ++LoopsVectorized;
7860 
7861     // Add metadata to disable runtime unrolling a scalar loop when there are
7862     // no runtime checks about strides and memory. A scalar loop that is
7863     // rarely used is not worth unrolling.
7864     if (!LB.areSafetyChecksAdded())
7865       DisableRuntimeUnroll = true;
7866 
7867     // Report the vectorization decision.
7868     ORE->emit([&]() {
7869       return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
7870                                 L->getHeader())
7871              << "vectorized loop (vectorization width: "
7872              << NV("VectorizationFactor", VF.Width)
7873              << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
7874     });
7875   }
7876 
7877   Optional<MDNode *> RemainderLoopID =
7878       makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
7879                                       LLVMLoopVectorizeFollowupEpilogue});
7880   if (RemainderLoopID.hasValue()) {
7881     L->setLoopID(RemainderLoopID.getValue());
7882   } else {
7883     if (DisableRuntimeUnroll)
7884       AddRuntimeUnrollDisableMetaData(L);
7885 
7886     // Mark the loop as already vectorized to avoid vectorizing again.
7887     Hints.setAlreadyVectorized();
7888   }
7889 
7890   LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
7891   return true;
7892 }
7893 
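// Shared implementation behind the pass entry points: cache the per-function
// analyses, simplify the loops in the function, collect the supported inner
// loops into a worklist, then form LCSSA for and process each of them.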
7894 bool LoopVectorizePass::runImpl(
7895     Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
7896     DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
7897     DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_,
7898     std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
7899     OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
7900   SE = &SE_;
7901   LI = &LI_;
7902   TTI = &TTI_;
7903   DT = &DT_;
7904   BFI = &BFI_;
7905   TLI = TLI_;
7906   AA = &AA_;
7907   AC = &AC_;
7908   GetLAA = &GetLAA_;
7909   DB = &DB_;
7910   ORE = &ORE_;
7911   PSI = PSI_;
7912 
7913   // Don't attempt if
7914   // 1. the target claims to have no vector registers, and
7915   // 2. interleaving won't help ILP.
7916   //
7917   // The second condition is necessary because, even if the target has no
7918   // vector registers, loop vectorization may still enable scalar
7919   // interleaving.
7920   if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
7921       TTI->getMaxInterleaveFactor(1) < 2)
7922     return false;
7923 
7924   bool Changed = false;
7925 
7926   // The vectorizer requires loops to be in simplified form.
7927   // Since simplification may add new inner loops, it has to run before the
7928   // legality and profitability checks. This means running the loop vectorizer
7929   // will simplify all loops, regardless of whether anything ends up being
7930   // vectorized.
7931   for (auto &L : *LI)
7932     Changed |=
7933         simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
7934 
7935   // Build up a worklist of inner-loops to vectorize. This is necessary as
7936   // the act of vectorizing or partially unrolling a loop creates new loops
7937   // and can invalidate iterators across the loops.
7938   SmallVector<Loop *, 8> Worklist;
7939 
7940   for (Loop *L : *LI)
7941     collectSupportedLoops(*L, LI, ORE, Worklist);
7942 
7943   LoopsAnalyzed += Worklist.size();
7944 
7945   // Now walk the identified inner loops.
7946   while (!Worklist.empty()) {
7947     Loop *L = Worklist.pop_back_val();
7948 
7949     // For the inner loops we actually process, form LCSSA to simplify the
7950     // transform.
7951     Changed |= formLCSSARecursively(*L, *DT, LI, SE);
7952 
7953     Changed |= processLoop(L);
7954   }
7955 
7956   // Process each loop nest in the function.
7957   return Changed;
7958 }
7959 
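// New pass manager entry point: gather the required analyses, run the
// vectorizer via runImpl, and report which analyses are preserved (loop and
// dominator info only when not on the VPlan-native path).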
7960 PreservedAnalyses LoopVectorizePass::run(Function &F,
7961                                          FunctionAnalysisManager &AM) {
7962     auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
7963     auto &LI = AM.getResult<LoopAnalysis>(F);
7964     auto &TTI = AM.getResult<TargetIRAnalysis>(F);
7965     auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
7966     auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
7967     auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
7968     auto &AA = AM.getResult<AAManager>(F);
7969     auto &AC = AM.getResult<AssumptionAnalysis>(F);
7970     auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
7971     auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
7972     MemorySSA *MSSA = EnableMSSALoopDependency
7973                           ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
7974                           : nullptr;
7975 
7976     auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
7977     std::function<const LoopAccessInfo &(Loop &)> GetLAA =
7978         [&](Loop &L) -> const LoopAccessInfo & {
7979       LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA};
7980       return LAM.getResult<LoopAccessAnalysis>(L, AR);
7981     };
7982     const ModuleAnalysisManager &MAM =
7983         AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager();
7984     ProfileSummaryInfo *PSI =
7985         MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
7986     bool Changed =
7987         runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
7988     if (!Changed)
7989       return PreservedAnalyses::all();
7990     PreservedAnalyses PA;
7991 
7992     // We currently do not preserve loopinfo/dominator analyses with outer loop
7993     // vectorization. Until this is addressed, mark these analyses as preserved
7994     // only for non-VPlan-native path.
7995     // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
7996     if (!EnableVPlanNativePath) {
7997       PA.preserve<LoopAnalysis>();
7998       PA.preserve<DominatorTreeAnalysis>();
7999     }
8000     PA.preserve<BasicAA>();
8001     PA.preserve<GlobalsAA>();
8002     return PA;
8003 }
8004