1 //===- LoopVectorizationLegality.cpp --------------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file provides loop vectorization legality analysis. Original code
10 // resided in LoopVectorize.cpp for a long time.
11 //
12 // At this point, it is implemented as a utility class, not as an analysis
13 // pass. It should be easy to create an analysis pass around it if there
14 // is a need (but D45420 needs to happen first).
15 //
16 #include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
17 #include "llvm/Analysis/Loads.h"
18 #include "llvm/Analysis/LoopInfo.h"
19 #include "llvm/Analysis/ValueTracking.h"
20 #include "llvm/Analysis/VectorUtils.h"
21 #include "llvm/IR/IntrinsicInst.h"
22 #include "llvm/IR/PatternMatch.h"
23 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
24 
25 using namespace llvm;
26 using namespace PatternMatch;
27 
28 #define LV_NAME "loop-vectorize"
29 #define DEBUG_TYPE LV_NAME
30 
31 extern cl::opt<bool> EnableVPlanPredication;
32 
33 static cl::opt<bool>
34     EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
35                        cl::desc("Enable if-conversion during vectorization."));
36 
37 static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
38     "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
39     cl::desc("The maximum allowed number of runtime memory checks with a "
40              "vectorize(enable) pragma."));
41 
42 static cl::opt<unsigned> VectorizeSCEVCheckThreshold(
43     "vectorize-scev-check-threshold", cl::init(16), cl::Hidden,
44     cl::desc("The maximum number of SCEV checks allowed."));
45 
46 static cl::opt<unsigned> PragmaVectorizeSCEVCheckThreshold(
47     "pragma-vectorize-scev-check-threshold", cl::init(128), cl::Hidden,
48     cl::desc("The maximum number of SCEV checks allowed with a "
49              "vectorize(enable) pragma"));
50 
51 /// Maximum vectorization interleave count.
52 static const unsigned MaxInterleaveFactor = 16;
53 
54 namespace llvm {
55 
validate(unsigned Val)56 bool LoopVectorizeHints::Hint::validate(unsigned Val) {
57   switch (Kind) {
58   case HK_WIDTH:
59     return isPowerOf2_32(Val) && Val <= VectorizerParams::MaxVectorWidth;
60   case HK_UNROLL:
61     return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor;
62   case HK_FORCE:
63     return (Val <= 1);
64   case HK_ISVECTORIZED:
65   case HK_PREDICATE:
66     return (Val == 0 || Val == 1);
67   }
68   return false;
69 }
70 
LoopVectorizeHints(const Loop * L,bool InterleaveOnlyWhenForced,OptimizationRemarkEmitter & ORE)71 LoopVectorizeHints::LoopVectorizeHints(const Loop *L,
72                                        bool InterleaveOnlyWhenForced,
73                                        OptimizationRemarkEmitter &ORE)
74     : Width("vectorize.width", VectorizerParams::VectorizationFactor, HK_WIDTH),
75       Interleave("interleave.count", InterleaveOnlyWhenForced, HK_UNROLL),
76       Force("vectorize.enable", FK_Undefined, HK_FORCE),
77       IsVectorized("isvectorized", 0, HK_ISVECTORIZED),
78       Predicate("vectorize.predicate.enable", FK_Undefined, HK_PREDICATE), TheLoop(L),
79       ORE(ORE) {
80   // Populate values with existing loop metadata.
81   getHintsFromMetadata();
82 
83   // force-vector-interleave overrides DisableInterleaving.
84   if (VectorizerParams::isInterleaveForced())
85     Interleave.Value = VectorizerParams::VectorizationInterleave;
86 
87   if (IsVectorized.Value != 1)
88     // If the vectorization width and interleaving count are both 1 then
89     // consider the loop to have been already vectorized because there's
90     // nothing more that we can do.
91     IsVectorized.Value = Width.Value == 1 && Interleave.Value == 1;
92   LLVM_DEBUG(if (InterleaveOnlyWhenForced && Interleave.Value == 1) dbgs()
93              << "LV: Interleaving disabled by the pass manager\n");
94 }
95 
setAlreadyVectorized()96 void LoopVectorizeHints::setAlreadyVectorized() {
97   LLVMContext &Context = TheLoop->getHeader()->getContext();
98 
99   MDNode *IsVectorizedMD = MDNode::get(
100       Context,
101       {MDString::get(Context, "llvm.loop.isvectorized"),
102        ConstantAsMetadata::get(ConstantInt::get(Context, APInt(32, 1)))});
103   MDNode *LoopID = TheLoop->getLoopID();
104   MDNode *NewLoopID =
105       makePostTransformationMetadata(Context, LoopID,
106                                      {Twine(Prefix(), "vectorize.").str(),
107                                       Twine(Prefix(), "interleave.").str()},
108                                      {IsVectorizedMD});
109   TheLoop->setLoopID(NewLoopID);
110 
111   // Update internal cache.
112   IsVectorized.Value = 1;
113 }
114 
allowVectorization(Function * F,Loop * L,bool VectorizeOnlyWhenForced) const115 bool LoopVectorizeHints::allowVectorization(
116     Function *F, Loop *L, bool VectorizeOnlyWhenForced) const {
117   if (getForce() == LoopVectorizeHints::FK_Disabled) {
118     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n");
119     emitRemarkWithHints();
120     return false;
121   }
122 
123   if (VectorizeOnlyWhenForced && getForce() != LoopVectorizeHints::FK_Enabled) {
124     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n");
125     emitRemarkWithHints();
126     return false;
127   }
128 
129   if (getIsVectorized() == 1) {
130     LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n");
131     // FIXME: Add interleave.disable metadata. This will allow
132     // vectorize.disable to be used without disabling the pass and errors
133     // to differentiate between disabled vectorization and a width of 1.
134     ORE.emit([&]() {
135       return OptimizationRemarkAnalysis(vectorizeAnalysisPassName(),
136                                         "AllDisabled", L->getStartLoc(),
137                                         L->getHeader())
138              << "loop not vectorized: vectorization and interleaving are "
139                 "explicitly disabled, or the loop has already been "
140                 "vectorized";
141     });
142     return false;
143   }
144 
145   return true;
146 }
147 
emitRemarkWithHints() const148 void LoopVectorizeHints::emitRemarkWithHints() const {
149   using namespace ore;
150 
151   ORE.emit([&]() {
152     if (Force.Value == LoopVectorizeHints::FK_Disabled)
153       return OptimizationRemarkMissed(LV_NAME, "MissedExplicitlyDisabled",
154                                       TheLoop->getStartLoc(),
155                                       TheLoop->getHeader())
156              << "loop not vectorized: vectorization is explicitly disabled";
157     else {
158       OptimizationRemarkMissed R(LV_NAME, "MissedDetails",
159                                  TheLoop->getStartLoc(), TheLoop->getHeader());
160       R << "loop not vectorized";
161       if (Force.Value == LoopVectorizeHints::FK_Enabled) {
162         R << " (Force=" << NV("Force", true);
163         if (Width.Value != 0)
164           R << ", Vector Width=" << NV("VectorWidth", Width.Value);
165         if (Interleave.Value != 0)
166           R << ", Interleave Count=" << NV("InterleaveCount", Interleave.Value);
167         R << ")";
168       }
169       return R;
170     }
171   });
172 }
173 
vectorizeAnalysisPassName() const174 const char *LoopVectorizeHints::vectorizeAnalysisPassName() const {
175   if (getWidth() == 1)
176     return LV_NAME;
177   if (getForce() == LoopVectorizeHints::FK_Disabled)
178     return LV_NAME;
179   if (getForce() == LoopVectorizeHints::FK_Undefined && getWidth() == 0)
180     return LV_NAME;
181   return OptimizationRemarkAnalysis::AlwaysPrint;
182 }
183 
getHintsFromMetadata()184 void LoopVectorizeHints::getHintsFromMetadata() {
185   MDNode *LoopID = TheLoop->getLoopID();
186   if (!LoopID)
187     return;
188 
189   // First operand should refer to the loop id itself.
190   assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
191   assert(LoopID->getOperand(0) == LoopID && "invalid loop id");
192 
193   for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
194     const MDString *S = nullptr;
195     SmallVector<Metadata *, 4> Args;
196 
197     // The expected hint is either a MDString or a MDNode with the first
198     // operand a MDString.
199     if (const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i))) {
200       if (!MD || MD->getNumOperands() == 0)
201         continue;
202       S = dyn_cast<MDString>(MD->getOperand(0));
203       for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i)
204         Args.push_back(MD->getOperand(i));
205     } else {
206       S = dyn_cast<MDString>(LoopID->getOperand(i));
207       assert(Args.size() == 0 && "too many arguments for MDString");
208     }
209 
210     if (!S)
211       continue;
212 
213     // Check if the hint starts with the loop metadata prefix.
214     StringRef Name = S->getString();
215     if (Args.size() == 1)
216       setHint(Name, Args[0]);
217   }
218 }
219 
setHint(StringRef Name,Metadata * Arg)220 void LoopVectorizeHints::setHint(StringRef Name, Metadata *Arg) {
221   if (!Name.startswith(Prefix()))
222     return;
223   Name = Name.substr(Prefix().size(), StringRef::npos);
224 
225   const ConstantInt *C = mdconst::dyn_extract<ConstantInt>(Arg);
226   if (!C)
227     return;
228   unsigned Val = C->getZExtValue();
229 
230   Hint *Hints[] = {&Width, &Interleave, &Force, &IsVectorized, &Predicate};
231   for (auto H : Hints) {
232     if (Name == H->Name) {
233       if (H->validate(Val))
234         H->Value = Val;
235       else
236         LLVM_DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n");
237       break;
238     }
239   }
240 }
241 
doesNotMeet(Function * F,Loop * L,const LoopVectorizeHints & Hints)242 bool LoopVectorizationRequirements::doesNotMeet(
243     Function *F, Loop *L, const LoopVectorizeHints &Hints) {
244   const char *PassName = Hints.vectorizeAnalysisPassName();
245   bool Failed = false;
246   if (UnsafeAlgebraInst && !Hints.allowReordering()) {
247     ORE.emit([&]() {
248       return OptimizationRemarkAnalysisFPCommute(
249                  PassName, "CantReorderFPOps", UnsafeAlgebraInst->getDebugLoc(),
250                  UnsafeAlgebraInst->getParent())
251              << "loop not vectorized: cannot prove it is safe to reorder "
252                 "floating-point operations";
253     });
254     Failed = true;
255   }
256 
257   // Test if runtime memcheck thresholds are exceeded.
258   bool PragmaThresholdReached =
259       NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
260   bool ThresholdReached =
261       NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold;
262   if ((ThresholdReached && !Hints.allowReordering()) ||
263       PragmaThresholdReached) {
264     ORE.emit([&]() {
265       return OptimizationRemarkAnalysisAliasing(PassName, "CantReorderMemOps",
266                                                 L->getStartLoc(),
267                                                 L->getHeader())
268              << "loop not vectorized: cannot prove it is safe to reorder "
269                 "memory operations";
270     });
271     LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
272     Failed = true;
273   }
274 
275   return Failed;
276 }
277 
278 // Return true if the inner loop \p Lp is uniform with regard to the outer loop
279 // \p OuterLp (i.e., if the outer loop is vectorized, all the vector lanes
280 // executing the inner loop will execute the same iterations). This check is
281 // very constrained for now but it will be relaxed in the future. \p Lp is
282 // considered uniform if it meets all the following conditions:
283 //   1) it has a canonical IV (starting from 0 and with stride 1),
284 //   2) its latch terminator is a conditional branch and,
285 //   3) its latch condition is a compare instruction whose operands are the
286 //      canonical IV and an OuterLp invariant.
287 // This check doesn't take into account the uniformity of other conditions not
288 // related to the loop latch because they don't affect the loop uniformity.
289 //
290 // NOTE: We decided to keep all these checks and its associated documentation
291 // together so that we can easily have a picture of the current supported loop
292 // nests. However, some of the current checks don't depend on \p OuterLp and
293 // would be redundantly executed for each \p Lp if we invoked this function for
294 // different candidate outer loops. This is not the case for now because we
295 // don't currently have the infrastructure to evaluate multiple candidate outer
296 // loops and \p OuterLp will be a fixed parameter while we only support explicit
297 // outer loop vectorization. It's also very likely that these checks go away
298 // before introducing the aforementioned infrastructure. However, if this is not
299 // the case, we should move the \p OuterLp independent checks to a separate
300 // function that is only executed once for each \p Lp.
isUniformLoop(Loop * Lp,Loop * OuterLp)301 static bool isUniformLoop(Loop *Lp, Loop *OuterLp) {
302   assert(Lp->getLoopLatch() && "Expected loop with a single latch.");
303 
304   // If Lp is the outer loop, it's uniform by definition.
305   if (Lp == OuterLp)
306     return true;
307   assert(OuterLp->contains(Lp) && "OuterLp must contain Lp.");
308 
309   // 1.
310   PHINode *IV = Lp->getCanonicalInductionVariable();
311   if (!IV) {
312     LLVM_DEBUG(dbgs() << "LV: Canonical IV not found.\n");
313     return false;
314   }
315 
316   // 2.
317   BasicBlock *Latch = Lp->getLoopLatch();
318   auto *LatchBr = dyn_cast<BranchInst>(Latch->getTerminator());
319   if (!LatchBr || LatchBr->isUnconditional()) {
320     LLVM_DEBUG(dbgs() << "LV: Unsupported loop latch branch.\n");
321     return false;
322   }
323 
324   // 3.
325   auto *LatchCmp = dyn_cast<CmpInst>(LatchBr->getCondition());
326   if (!LatchCmp) {
327     LLVM_DEBUG(
328         dbgs() << "LV: Loop latch condition is not a compare instruction.\n");
329     return false;
330   }
331 
332   Value *CondOp0 = LatchCmp->getOperand(0);
333   Value *CondOp1 = LatchCmp->getOperand(1);
334   Value *IVUpdate = IV->getIncomingValueForBlock(Latch);
335   if (!(CondOp0 == IVUpdate && OuterLp->isLoopInvariant(CondOp1)) &&
336       !(CondOp1 == IVUpdate && OuterLp->isLoopInvariant(CondOp0))) {
337     LLVM_DEBUG(dbgs() << "LV: Loop latch condition is not uniform.\n");
338     return false;
339   }
340 
341   return true;
342 }
343 
344 // Return true if \p Lp and all its nested loops are uniform with regard to \p
345 // OuterLp.
isUniformLoopNest(Loop * Lp,Loop * OuterLp)346 static bool isUniformLoopNest(Loop *Lp, Loop *OuterLp) {
347   if (!isUniformLoop(Lp, OuterLp))
348     return false;
349 
350   // Check if nested loops are uniform.
351   for (Loop *SubLp : *Lp)
352     if (!isUniformLoopNest(SubLp, OuterLp))
353       return false;
354 
355   return true;
356 }
357 
358 /// Check whether it is safe to if-convert this phi node.
359 ///
360 /// Phi nodes with constant expressions that can trap are not safe to if
361 /// convert.
canIfConvertPHINodes(BasicBlock * BB)362 static bool canIfConvertPHINodes(BasicBlock *BB) {
363   for (PHINode &Phi : BB->phis()) {
364     for (Value *V : Phi.incoming_values())
365       if (auto *C = dyn_cast<Constant>(V))
366         if (C->canTrap())
367           return false;
368   }
369   return true;
370 }
371 
convertPointerToIntegerType(const DataLayout & DL,Type * Ty)372 static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) {
373   if (Ty->isPointerTy())
374     return DL.getIntPtrType(Ty);
375 
376   // It is possible that char's or short's overflow when we ask for the loop's
377   // trip count, work around this by changing the type size.
378   if (Ty->getScalarSizeInBits() < 32)
379     return Type::getInt32Ty(Ty->getContext());
380 
381   return Ty;
382 }
383 
getWiderType(const DataLayout & DL,Type * Ty0,Type * Ty1)384 static Type *getWiderType(const DataLayout &DL, Type *Ty0, Type *Ty1) {
385   Ty0 = convertPointerToIntegerType(DL, Ty0);
386   Ty1 = convertPointerToIntegerType(DL, Ty1);
387   if (Ty0->getScalarSizeInBits() > Ty1->getScalarSizeInBits())
388     return Ty0;
389   return Ty1;
390 }
391 
392 /// Check that the instruction has outside loop users and is not an
393 /// identified reduction variable.
hasOutsideLoopUser(const Loop * TheLoop,Instruction * Inst,SmallPtrSetImpl<Value * > & AllowedExit)394 static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst,
395                                SmallPtrSetImpl<Value *> &AllowedExit) {
396   // Reductions, Inductions and non-header phis are allowed to have exit users. All
397   // other instructions must not have external users.
398   if (!AllowedExit.count(Inst))
399     // Check that all of the users of the loop are inside the BB.
400     for (User *U : Inst->users()) {
401       Instruction *UI = cast<Instruction>(U);
402       // This user may be a reduction exit value.
403       if (!TheLoop->contains(UI)) {
404         LLVM_DEBUG(dbgs() << "LV: Found an outside user for : " << *UI << '\n');
405         return true;
406       }
407     }
408   return false;
409 }
410 
isConsecutivePtr(Value * Ptr)411 int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
412   const ValueToValueMap &Strides =
413       getSymbolicStrides() ? *getSymbolicStrides() : ValueToValueMap();
414 
415   bool CanAddPredicate = !TheLoop->getHeader()->getParent()->hasOptSize();
416   int Stride = getPtrStride(PSE, Ptr, TheLoop, Strides, CanAddPredicate, false);
417   if (Stride == 1 || Stride == -1)
418     return Stride;
419   return 0;
420 }
421 
isUniform(Value * V)422 bool LoopVectorizationLegality::isUniform(Value *V) {
423   return LAI->isUniform(V);
424 }
425 
canVectorizeOuterLoop()426 bool LoopVectorizationLegality::canVectorizeOuterLoop() {
427   assert(!TheLoop->empty() && "We are not vectorizing an outer loop.");
428   // Store the result and return it at the end instead of exiting early, in case
429   // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
430   bool Result = true;
431   bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
432 
433   for (BasicBlock *BB : TheLoop->blocks()) {
434     // Check whether the BB terminator is a BranchInst. Any other terminator is
435     // not supported yet.
436     auto *Br = dyn_cast<BranchInst>(BB->getTerminator());
437     if (!Br) {
438       reportVectorizationFailure("Unsupported basic block terminator",
439           "loop control flow is not understood by vectorizer",
440           "CFGNotUnderstood", ORE, TheLoop);
441       if (DoExtraAnalysis)
442         Result = false;
443       else
444         return false;
445     }
446 
447     // Check whether the BranchInst is a supported one. Only unconditional
448     // branches, conditional branches with an outer loop invariant condition or
449     // backedges are supported.
450     // FIXME: We skip these checks when VPlan predication is enabled as we
451     // want to allow divergent branches. This whole check will be removed
452     // once VPlan predication is on by default.
453     if (!EnableVPlanPredication && Br && Br->isConditional() &&
454         !TheLoop->isLoopInvariant(Br->getCondition()) &&
455         !LI->isLoopHeader(Br->getSuccessor(0)) &&
456         !LI->isLoopHeader(Br->getSuccessor(1))) {
457       reportVectorizationFailure("Unsupported conditional branch",
458           "loop control flow is not understood by vectorizer",
459           "CFGNotUnderstood", ORE, TheLoop);
460       if (DoExtraAnalysis)
461         Result = false;
462       else
463         return false;
464     }
465   }
466 
467   // Check whether inner loops are uniform. At this point, we only support
468   // simple outer loops scenarios with uniform nested loops.
469   if (!isUniformLoopNest(TheLoop /*loop nest*/,
470                          TheLoop /*context outer loop*/)) {
471     reportVectorizationFailure("Outer loop contains divergent loops",
472         "loop control flow is not understood by vectorizer",
473         "CFGNotUnderstood", ORE, TheLoop);
474     if (DoExtraAnalysis)
475       Result = false;
476     else
477       return false;
478   }
479 
480   // Check whether we are able to set up outer loop induction.
481   if (!setupOuterLoopInductions()) {
482     reportVectorizationFailure("Unsupported outer loop Phi(s)",
483                                "Unsupported outer loop Phi(s)",
484                                "UnsupportedPhi", ORE, TheLoop);
485     if (DoExtraAnalysis)
486       Result = false;
487     else
488       return false;
489   }
490 
491   return Result;
492 }
493 
addInductionPhi(PHINode * Phi,const InductionDescriptor & ID,SmallPtrSetImpl<Value * > & AllowedExit)494 void LoopVectorizationLegality::addInductionPhi(
495     PHINode *Phi, const InductionDescriptor &ID,
496     SmallPtrSetImpl<Value *> &AllowedExit) {
497   Inductions[Phi] = ID;
498 
499   // In case this induction also comes with casts that we know we can ignore
500   // in the vectorized loop body, record them here. All casts could be recorded
501   // here for ignoring, but suffices to record only the first (as it is the
502   // only one that may bw used outside the cast sequence).
503   const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
504   if (!Casts.empty())
505     InductionCastsToIgnore.insert(*Casts.begin());
506 
507   Type *PhiTy = Phi->getType();
508   const DataLayout &DL = Phi->getModule()->getDataLayout();
509 
510   // Get the widest type.
511   if (!PhiTy->isFloatingPointTy()) {
512     if (!WidestIndTy)
513       WidestIndTy = convertPointerToIntegerType(DL, PhiTy);
514     else
515       WidestIndTy = getWiderType(DL, PhiTy, WidestIndTy);
516   }
517 
518   // Int inductions are special because we only allow one IV.
519   if (ID.getKind() == InductionDescriptor::IK_IntInduction &&
520       ID.getConstIntStepValue() && ID.getConstIntStepValue()->isOne() &&
521       isa<Constant>(ID.getStartValue()) &&
522       cast<Constant>(ID.getStartValue())->isNullValue()) {
523 
524     // Use the phi node with the widest type as induction. Use the last
525     // one if there are multiple (no good reason for doing this other
526     // than it is expedient). We've checked that it begins at zero and
527     // steps by one, so this is a canonical induction variable.
528     if (!PrimaryInduction || PhiTy == WidestIndTy)
529       PrimaryInduction = Phi;
530   }
531 
532   // Both the PHI node itself, and the "post-increment" value feeding
533   // back into the PHI node may have external users.
534   // We can allow those uses, except if the SCEVs we have for them rely
535   // on predicates that only hold within the loop, since allowing the exit
536   // currently means re-using this SCEV outside the loop (see PR33706 for more
537   // details).
538   if (PSE.getUnionPredicate().isAlwaysTrue()) {
539     AllowedExit.insert(Phi);
540     AllowedExit.insert(Phi->getIncomingValueForBlock(TheLoop->getLoopLatch()));
541   }
542 
543   LLVM_DEBUG(dbgs() << "LV: Found an induction variable.\n");
544 }
545 
setupOuterLoopInductions()546 bool LoopVectorizationLegality::setupOuterLoopInductions() {
547   BasicBlock *Header = TheLoop->getHeader();
548 
549   // Returns true if a given Phi is a supported induction.
550   auto isSupportedPhi = [&](PHINode &Phi) -> bool {
551     InductionDescriptor ID;
552     if (InductionDescriptor::isInductionPHI(&Phi, TheLoop, PSE, ID) &&
553         ID.getKind() == InductionDescriptor::IK_IntInduction) {
554       addInductionPhi(&Phi, ID, AllowedExit);
555       return true;
556     } else {
557       // Bail out for any Phi in the outer loop header that is not a supported
558       // induction.
559       LLVM_DEBUG(
560           dbgs()
561           << "LV: Found unsupported PHI for outer loop vectorization.\n");
562       return false;
563     }
564   };
565 
566   if (llvm::all_of(Header->phis(), isSupportedPhi))
567     return true;
568   else
569     return false;
570 }
571 
572 /// Checks if a function is scalarizable according to the TLI, in
573 /// the sense that it should be vectorized and then expanded in
574 /// multiple scalarcalls. This is represented in the
575 /// TLI via mappings that do not specify a vector name, as in the
576 /// following example:
577 ///
578 ///    const VecDesc VecIntrinsics[] = {
579 ///      {"llvm.phx.abs.i32", "", 4}
580 ///    };
isTLIScalarize(const TargetLibraryInfo & TLI,const CallInst & CI)581 static bool isTLIScalarize(const TargetLibraryInfo &TLI, const CallInst &CI) {
582   const StringRef ScalarName = CI.getCalledFunction()->getName();
583   bool Scalarize = TLI.isFunctionVectorizable(ScalarName);
584   // Check that all known VFs are not associated to a vector
585   // function, i.e. the vector name is emty.
586   if (Scalarize)
587     for (unsigned VF = 2, WidestVF = TLI.getWidestVF(ScalarName);
588          VF <= WidestVF; VF *= 2) {
589       Scalarize &= !TLI.isFunctionVectorizable(ScalarName, VF);
590     }
591   return Scalarize;
592 }
593 
canVectorizeInstrs()594 bool LoopVectorizationLegality::canVectorizeInstrs() {
595   BasicBlock *Header = TheLoop->getHeader();
596 
597   // Look for the attribute signaling the absence of NaNs.
598   Function &F = *Header->getParent();
599   HasFunNoNaNAttr =
600       F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true";
601 
602   // For each block in the loop.
603   for (BasicBlock *BB : TheLoop->blocks()) {
604     // Scan the instructions in the block and look for hazards.
605     for (Instruction &I : *BB) {
606       if (auto *Phi = dyn_cast<PHINode>(&I)) {
607         Type *PhiTy = Phi->getType();
608         // Check that this PHI type is allowed.
609         if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() &&
610             !PhiTy->isPointerTy()) {
611           reportVectorizationFailure("Found a non-int non-pointer PHI",
612                                      "loop control flow is not understood by vectorizer",
613                                      "CFGNotUnderstood", ORE, TheLoop);
614           return false;
615         }
616 
617         // If this PHINode is not in the header block, then we know that we
618         // can convert it to select during if-conversion. No need to check if
619         // the PHIs in this block are induction or reduction variables.
620         if (BB != Header) {
621           // Non-header phi nodes that have outside uses can be vectorized. Add
622           // them to the list of allowed exits.
623           // Unsafe cyclic dependencies with header phis are identified during
624           // legalization for reduction, induction and first order
625           // recurrences.
626           AllowedExit.insert(&I);
627           continue;
628         }
629 
630         // We only allow if-converted PHIs with exactly two incoming values.
631         if (Phi->getNumIncomingValues() != 2) {
632           reportVectorizationFailure("Found an invalid PHI",
633               "loop control flow is not understood by vectorizer",
634               "CFGNotUnderstood", ORE, TheLoop, Phi);
635           return false;
636         }
637 
638         RecurrenceDescriptor RedDes;
639         if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, RedDes, DB, AC,
640                                                  DT)) {
641           if (RedDes.hasUnsafeAlgebra())
642             Requirements->addUnsafeAlgebraInst(RedDes.getUnsafeAlgebraInst());
643           AllowedExit.insert(RedDes.getLoopExitInstr());
644           Reductions[Phi] = RedDes;
645           continue;
646         }
647 
648         // TODO: Instead of recording the AllowedExit, it would be good to record the
649         // complementary set: NotAllowedExit. These include (but may not be
650         // limited to):
651         // 1. Reduction phis as they represent the one-before-last value, which
652         // is not available when vectorized
653         // 2. Induction phis and increment when SCEV predicates cannot be used
654         // outside the loop - see addInductionPhi
655         // 3. Non-Phis with outside uses when SCEV predicates cannot be used
656         // outside the loop - see call to hasOutsideLoopUser in the non-phi
657         // handling below
658         // 4. FirstOrderRecurrence phis that can possibly be handled by
659         // extraction.
660         // By recording these, we can then reason about ways to vectorize each
661         // of these NotAllowedExit.
662         InductionDescriptor ID;
663         if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID)) {
664           addInductionPhi(Phi, ID, AllowedExit);
665           if (ID.hasUnsafeAlgebra() && !HasFunNoNaNAttr)
666             Requirements->addUnsafeAlgebraInst(ID.getUnsafeAlgebraInst());
667           continue;
668         }
669 
670         if (RecurrenceDescriptor::isFirstOrderRecurrence(Phi, TheLoop,
671                                                          SinkAfter, DT)) {
672           AllowedExit.insert(Phi);
673           FirstOrderRecurrences.insert(Phi);
674           continue;
675         }
676 
677         // As a last resort, coerce the PHI to a AddRec expression
678         // and re-try classifying it a an induction PHI.
679         if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID, true)) {
680           addInductionPhi(Phi, ID, AllowedExit);
681           continue;
682         }
683 
684         reportVectorizationFailure("Found an unidentified PHI",
685             "value that could not be identified as "
686             "reduction is used outside the loop",
687             "NonReductionValueUsedOutsideLoop", ORE, TheLoop, Phi);
688         return false;
689       } // end of PHI handling
690 
691       // We handle calls that:
692       //   * Are debug info intrinsics.
693       //   * Have a mapping to an IR intrinsic.
694       //   * Have a vector version available.
695       auto *CI = dyn_cast<CallInst>(&I);
696 
697       if (CI && !getVectorIntrinsicIDForCall(CI, TLI) &&
698           !isa<DbgInfoIntrinsic>(CI) &&
699           !(CI->getCalledFunction() && TLI &&
700             (!VFDatabase::getMappings(*CI).empty() ||
701              isTLIScalarize(*TLI, *CI)))) {
702         // If the call is a recognized math libary call, it is likely that
703         // we can vectorize it given loosened floating-point constraints.
704         LibFunc Func;
705         bool IsMathLibCall =
706             TLI && CI->getCalledFunction() &&
707             CI->getType()->isFloatingPointTy() &&
708             TLI->getLibFunc(CI->getCalledFunction()->getName(), Func) &&
709             TLI->hasOptimizedCodeGen(Func);
710 
711         if (IsMathLibCall) {
712           // TODO: Ideally, we should not use clang-specific language here,
713           // but it's hard to provide meaningful yet generic advice.
714           // Also, should this be guarded by allowExtraAnalysis() and/or be part
715           // of the returned info from isFunctionVectorizable()?
716           reportVectorizationFailure(
717               "Found a non-intrinsic callsite",
718               "library call cannot be vectorized. "
719               "Try compiling with -fno-math-errno, -ffast-math, "
720               "or similar flags",
721               "CantVectorizeLibcall", ORE, TheLoop, CI);
722         } else {
723           reportVectorizationFailure("Found a non-intrinsic callsite",
724                                      "call instruction cannot be vectorized",
725                                      "CantVectorizeLibcall", ORE, TheLoop, CI);
726         }
727         return false;
728       }
729 
730       // Some intrinsics have scalar arguments and should be same in order for
731       // them to be vectorized (i.e. loop invariant).
732       if (CI) {
733         auto *SE = PSE.getSE();
734         Intrinsic::ID IntrinID = getVectorIntrinsicIDForCall(CI, TLI);
735         for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i)
736           if (hasVectorInstrinsicScalarOpd(IntrinID, i)) {
737             if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(i)), TheLoop)) {
738               reportVectorizationFailure("Found unvectorizable intrinsic",
739                   "intrinsic instruction cannot be vectorized",
740                   "CantVectorizeIntrinsic", ORE, TheLoop, CI);
741               return false;
742             }
743           }
744       }
745 
746       // Check that the instruction return type is vectorizable.
747       // Also, we can't vectorize extractelement instructions.
748       if ((!VectorType::isValidElementType(I.getType()) &&
749            !I.getType()->isVoidTy()) ||
750           isa<ExtractElementInst>(I)) {
751         reportVectorizationFailure("Found unvectorizable type",
752             "instruction return type cannot be vectorized",
753             "CantVectorizeInstructionReturnType", ORE, TheLoop, &I);
754         return false;
755       }
756 
757       // Check that the stored type is vectorizable.
758       if (auto *ST = dyn_cast<StoreInst>(&I)) {
759         Type *T = ST->getValueOperand()->getType();
760         if (!VectorType::isValidElementType(T)) {
761           reportVectorizationFailure("Store instruction cannot be vectorized",
762                                      "store instruction cannot be vectorized",
763                                      "CantVectorizeStore", ORE, TheLoop, ST);
764           return false;
765         }
766 
767         // For nontemporal stores, check that a nontemporal vector version is
768         // supported on the target.
769         if (ST->getMetadata(LLVMContext::MD_nontemporal)) {
770           // Arbitrarily try a vector of 2 elements.
771           auto *VecTy = FixedVectorType::get(T, /*NumElements=*/2);
772           assert(VecTy && "did not find vectorized version of stored type");
773           if (!TTI->isLegalNTStore(VecTy, ST->getAlign())) {
774             reportVectorizationFailure(
775                 "nontemporal store instruction cannot be vectorized",
776                 "nontemporal store instruction cannot be vectorized",
777                 "CantVectorizeNontemporalStore", ORE, TheLoop, ST);
778             return false;
779           }
780         }
781 
782       } else if (auto *LD = dyn_cast<LoadInst>(&I)) {
783         if (LD->getMetadata(LLVMContext::MD_nontemporal)) {
784           // For nontemporal loads, check that a nontemporal vector version is
785           // supported on the target (arbitrarily try a vector of 2 elements).
786           auto *VecTy = FixedVectorType::get(I.getType(), /*NumElements=*/2);
787           assert(VecTy && "did not find vectorized version of load type");
788           if (!TTI->isLegalNTLoad(VecTy, LD->getAlign())) {
789             reportVectorizationFailure(
790                 "nontemporal load instruction cannot be vectorized",
791                 "nontemporal load instruction cannot be vectorized",
792                 "CantVectorizeNontemporalLoad", ORE, TheLoop, LD);
793             return false;
794           }
795         }
796 
797         // FP instructions can allow unsafe algebra, thus vectorizable by
798         // non-IEEE-754 compliant SIMD units.
799         // This applies to floating-point math operations and calls, not memory
800         // operations, shuffles, or casts, as they don't change precision or
801         // semantics.
802       } else if (I.getType()->isFloatingPointTy() && (CI || I.isBinaryOp()) &&
803                  !I.isFast()) {
804         LLVM_DEBUG(dbgs() << "LV: Found FP op with unsafe algebra.\n");
805         Hints->setPotentiallyUnsafe();
806       }
807 
808       // Reduction instructions are allowed to have exit users.
809       // All other instructions must not have external users.
810       if (hasOutsideLoopUser(TheLoop, &I, AllowedExit)) {
811         // We can safely vectorize loops where instructions within the loop are
812         // used outside the loop only if the SCEV predicates within the loop is
813         // same as outside the loop. Allowing the exit means reusing the SCEV
814         // outside the loop.
815         if (PSE.getUnionPredicate().isAlwaysTrue()) {
816           AllowedExit.insert(&I);
817           continue;
818         }
819         reportVectorizationFailure("Value cannot be used outside the loop",
820                                    "value cannot be used outside the loop",
821                                    "ValueUsedOutsideLoop", ORE, TheLoop, &I);
822         return false;
823       }
824     } // next instr.
825   }
826 
827   if (!PrimaryInduction) {
828     if (Inductions.empty()) {
829       reportVectorizationFailure("Did not find one integer induction var",
830           "loop induction variable could not be identified",
831           "NoInductionVariable", ORE, TheLoop);
832       return false;
833     } else if (!WidestIndTy) {
834       reportVectorizationFailure("Did not find one integer induction var",
835           "integer loop induction variable could not be identified",
836           "NoIntegerInductionVariable", ORE, TheLoop);
837       return false;
838     } else {
839       LLVM_DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
840     }
841   }
842 
843   // For first order recurrences, we use the previous value (incoming value from
844   // the latch) to check if it dominates all users of the recurrence. Bail out
845   // if we have to sink such an instruction for another recurrence, as the
846   // dominance requirement may not hold after sinking.
847   BasicBlock *LoopLatch = TheLoop->getLoopLatch();
848   if (any_of(FirstOrderRecurrences, [LoopLatch, this](const PHINode *Phi) {
849         Instruction *V =
850             cast<Instruction>(Phi->getIncomingValueForBlock(LoopLatch));
851         return SinkAfter.find(V) != SinkAfter.end();
852       }))
853     return false;
854 
855   // Now we know the widest induction type, check if our found induction
856   // is the same size. If it's not, unset it here and InnerLoopVectorizer
857   // will create another.
858   if (PrimaryInduction && WidestIndTy != PrimaryInduction->getType())
859     PrimaryInduction = nullptr;
860 
861   return true;
862 }
863 
canVectorizeMemory()864 bool LoopVectorizationLegality::canVectorizeMemory() {
865   LAI = &(*GetLAA)(*TheLoop);
866   const OptimizationRemarkAnalysis *LAR = LAI->getReport();
867   if (LAR) {
868     ORE->emit([&]() {
869       return OptimizationRemarkAnalysis(Hints->vectorizeAnalysisPassName(),
870                                         "loop not vectorized: ", *LAR);
871     });
872   }
873   if (!LAI->canVectorizeMemory())
874     return false;
875 
876   if (LAI->hasDependenceInvolvingLoopInvariantAddress()) {
877     reportVectorizationFailure("Stores to a uniform address",
878         "write to a loop invariant address could not be vectorized",
879         "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop);
880     return false;
881   }
882   Requirements->addRuntimePointerChecks(LAI->getNumRuntimePointerChecks());
883   PSE.addPredicate(LAI->getPSE().getUnionPredicate());
884 
885   return true;
886 }
887 
isInductionPhi(const Value * V)888 bool LoopVectorizationLegality::isInductionPhi(const Value *V) {
889   Value *In0 = const_cast<Value *>(V);
890   PHINode *PN = dyn_cast_or_null<PHINode>(In0);
891   if (!PN)
892     return false;
893 
894   return Inductions.count(PN);
895 }
896 
isCastedInductionVariable(const Value * V)897 bool LoopVectorizationLegality::isCastedInductionVariable(const Value *V) {
898   auto *Inst = dyn_cast<Instruction>(V);
899   return (Inst && InductionCastsToIgnore.count(Inst));
900 }
901 
isInductionVariable(const Value * V)902 bool LoopVectorizationLegality::isInductionVariable(const Value *V) {
903   return isInductionPhi(V) || isCastedInductionVariable(V);
904 }
905 
isFirstOrderRecurrence(const PHINode * Phi)906 bool LoopVectorizationLegality::isFirstOrderRecurrence(const PHINode *Phi) {
907   return FirstOrderRecurrences.count(Phi);
908 }
909 
blockNeedsPredication(BasicBlock * BB)910 bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) {
911   return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
912 }
913 
blockCanBePredicated(BasicBlock * BB,SmallPtrSetImpl<Value * > & SafePtrs,bool PreserveGuards)914 bool LoopVectorizationLegality::blockCanBePredicated(
915     BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs, bool PreserveGuards) {
916   const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel();
917 
918   for (Instruction &I : *BB) {
919     // Check that we don't have a constant expression that can trap as operand.
920     for (Value *Operand : I.operands()) {
921       if (auto *C = dyn_cast<Constant>(Operand))
922         if (C->canTrap())
923           return false;
924     }
925 
926     // We can predicate blocks with calls to assume, as long as we drop them in
927     // case we flatten the CFG via predication.
928     if (match(&I, m_Intrinsic<Intrinsic::assume>())) {
929       ConditionalAssumes.insert(&I);
930       continue;
931     }
932 
933     // We might be able to hoist the load.
934     if (I.mayReadFromMemory()) {
935       auto *LI = dyn_cast<LoadInst>(&I);
936       if (!LI)
937         return false;
938       if (!SafePtrs.count(LI->getPointerOperand())) {
939         // !llvm.mem.parallel_loop_access implies if-conversion safety.
940         // Otherwise, record that the load needs (real or emulated) masking
941         // and let the cost model decide.
942         if (!IsAnnotatedParallel || PreserveGuards)
943           MaskedOp.insert(LI);
944         continue;
945       }
946     }
947 
948     if (I.mayWriteToMemory()) {
949       auto *SI = dyn_cast<StoreInst>(&I);
950       if (!SI)
951         return false;
952       // Predicated store requires some form of masking:
953       // 1) masked store HW instruction,
954       // 2) emulation via load-blend-store (only if safe and legal to do so,
955       //    be aware on the race conditions), or
956       // 3) element-by-element predicate check and scalar store.
957       MaskedOp.insert(SI);
958       continue;
959     }
960     if (I.mayThrow())
961       return false;
962   }
963 
964   return true;
965 }
966 
canVectorizeWithIfConvert()967 bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
968   if (!EnableIfConversion) {
969     reportVectorizationFailure("If-conversion is disabled",
970                                "if-conversion is disabled",
971                                "IfConversionDisabled",
972                                ORE, TheLoop);
973     return false;
974   }
975 
976   assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable");
977 
978   // A list of pointers which are known to be dereferenceable within scope of
979   // the loop body for each iteration of the loop which executes.  That is,
980   // the memory pointed to can be dereferenced (with the access size implied by
981   // the value's type) unconditionally within the loop header without
982   // introducing a new fault.
983   SmallPtrSet<Value *, 8> SafePointers;
984 
985   // Collect safe addresses.
986   for (BasicBlock *BB : TheLoop->blocks()) {
987     if (!blockNeedsPredication(BB)) {
988       for (Instruction &I : *BB)
989         if (auto *Ptr = getLoadStorePointerOperand(&I))
990           SafePointers.insert(Ptr);
991       continue;
992     }
993 
994     // For a block which requires predication, a address may be safe to access
995     // in the loop w/o predication if we can prove dereferenceability facts
996     // sufficient to ensure it'll never fault within the loop. For the moment,
997     // we restrict this to loads; stores are more complicated due to
998     // concurrency restrictions.
999     ScalarEvolution &SE = *PSE.getSE();
1000     for (Instruction &I : *BB) {
1001       LoadInst *LI = dyn_cast<LoadInst>(&I);
1002       if (LI && !mustSuppressSpeculation(*LI) &&
1003           isDereferenceableAndAlignedInLoop(LI, TheLoop, SE, *DT))
1004         SafePointers.insert(LI->getPointerOperand());
1005     }
1006   }
1007 
1008   // Collect the blocks that need predication.
1009   BasicBlock *Header = TheLoop->getHeader();
1010   for (BasicBlock *BB : TheLoop->blocks()) {
1011     // We don't support switch statements inside loops.
1012     if (!isa<BranchInst>(BB->getTerminator())) {
1013       reportVectorizationFailure("Loop contains a switch statement",
1014                                  "loop contains a switch statement",
1015                                  "LoopContainsSwitch", ORE, TheLoop,
1016                                  BB->getTerminator());
1017       return false;
1018     }
1019 
1020     // We must be able to predicate all blocks that need to be predicated.
1021     if (blockNeedsPredication(BB)) {
1022       if (!blockCanBePredicated(BB, SafePointers)) {
1023         reportVectorizationFailure(
1024             "Control flow cannot be substituted for a select",
1025             "control flow cannot be substituted for a select",
1026             "NoCFGForSelect", ORE, TheLoop,
1027             BB->getTerminator());
1028         return false;
1029       }
1030     } else if (BB != Header && !canIfConvertPHINodes(BB)) {
1031       reportVectorizationFailure(
1032           "Control flow cannot be substituted for a select",
1033           "control flow cannot be substituted for a select",
1034           "NoCFGForSelect", ORE, TheLoop,
1035           BB->getTerminator());
1036       return false;
1037     }
1038   }
1039 
1040   // We can if-convert this loop.
1041   return true;
1042 }
1043 
1044 // Helper function to canVectorizeLoopNestCFG.
canVectorizeLoopCFG(Loop * Lp,bool UseVPlanNativePath)1045 bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp,
1046                                                     bool UseVPlanNativePath) {
1047   assert((UseVPlanNativePath || Lp->empty()) &&
1048          "VPlan-native path is not enabled.");
1049 
1050   // TODO: ORE should be improved to show more accurate information when an
1051   // outer loop can't be vectorized because a nested loop is not understood or
1052   // legal. Something like: "outer_loop_location: loop not vectorized:
1053   // (inner_loop_location) loop control flow is not understood by vectorizer".
1054 
1055   // Store the result and return it at the end instead of exiting early, in case
1056   // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
1057   bool Result = true;
1058   bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
1059 
1060   // We must have a loop in canonical form. Loops with indirectbr in them cannot
1061   // be canonicalized.
1062   if (!Lp->getLoopPreheader()) {
1063     reportVectorizationFailure("Loop doesn't have a legal pre-header",
1064         "loop control flow is not understood by vectorizer",
1065         "CFGNotUnderstood", ORE, TheLoop);
1066     if (DoExtraAnalysis)
1067       Result = false;
1068     else
1069       return false;
1070   }
1071 
1072   // We must have a single backedge.
1073   if (Lp->getNumBackEdges() != 1) {
1074     reportVectorizationFailure("The loop must have a single backedge",
1075         "loop control flow is not understood by vectorizer",
1076         "CFGNotUnderstood", ORE, TheLoop);
1077     if (DoExtraAnalysis)
1078       Result = false;
1079     else
1080       return false;
1081   }
1082 
1083   // We must have a single exiting block.
1084   if (!Lp->getExitingBlock()) {
1085     reportVectorizationFailure("The loop must have an exiting block",
1086         "loop control flow is not understood by vectorizer",
1087         "CFGNotUnderstood", ORE, TheLoop);
1088     if (DoExtraAnalysis)
1089       Result = false;
1090     else
1091       return false;
1092   }
1093 
1094   // We only handle bottom-tested loops, i.e. loop in which the condition is
1095   // checked at the end of each iteration. With that we can assume that all
1096   // instructions in the loop are executed the same number of times.
1097   if (Lp->getExitingBlock() != Lp->getLoopLatch()) {
1098     reportVectorizationFailure("The exiting block is not the loop latch",
1099         "loop control flow is not understood by vectorizer",
1100         "CFGNotUnderstood", ORE, TheLoop);
1101     if (DoExtraAnalysis)
1102       Result = false;
1103     else
1104       return false;
1105   }
1106 
1107   return Result;
1108 }
1109 
canVectorizeLoopNestCFG(Loop * Lp,bool UseVPlanNativePath)1110 bool LoopVectorizationLegality::canVectorizeLoopNestCFG(
1111     Loop *Lp, bool UseVPlanNativePath) {
1112   // Store the result and return it at the end instead of exiting early, in case
1113   // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
1114   bool Result = true;
1115   bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
1116   if (!canVectorizeLoopCFG(Lp, UseVPlanNativePath)) {
1117     if (DoExtraAnalysis)
1118       Result = false;
1119     else
1120       return false;
1121   }
1122 
1123   // Recursively check whether the loop control flow of nested loops is
1124   // understood.
1125   for (Loop *SubLp : *Lp)
1126     if (!canVectorizeLoopNestCFG(SubLp, UseVPlanNativePath)) {
1127       if (DoExtraAnalysis)
1128         Result = false;
1129       else
1130         return false;
1131     }
1132 
1133   return Result;
1134 }
1135 
canVectorize(bool UseVPlanNativePath)1136 bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
1137   // Store the result and return it at the end instead of exiting early, in case
1138   // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
1139   bool Result = true;
1140 
1141   bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
1142   // Check whether the loop-related control flow in the loop nest is expected by
1143   // vectorizer.
1144   if (!canVectorizeLoopNestCFG(TheLoop, UseVPlanNativePath)) {
1145     if (DoExtraAnalysis)
1146       Result = false;
1147     else
1148       return false;
1149   }
1150 
1151   // We need to have a loop header.
1152   LLVM_DEBUG(dbgs() << "LV: Found a loop: " << TheLoop->getHeader()->getName()
1153                     << '\n');
1154 
1155   // Specific checks for outer loops. We skip the remaining legal checks at this
1156   // point because they don't support outer loops.
1157   if (!TheLoop->empty()) {
1158     assert(UseVPlanNativePath && "VPlan-native path is not enabled.");
1159 
1160     if (!canVectorizeOuterLoop()) {
1161       reportVectorizationFailure("Unsupported outer loop",
1162                                  "unsupported outer loop",
1163                                  "UnsupportedOuterLoop",
1164                                  ORE, TheLoop);
1165       // TODO: Implement DoExtraAnalysis when subsequent legal checks support
1166       // outer loops.
1167       return false;
1168     }
1169 
1170     LLVM_DEBUG(dbgs() << "LV: We can vectorize this outer loop!\n");
1171     return Result;
1172   }
1173 
1174   assert(TheLoop->empty() && "Inner loop expected.");
1175   // Check if we can if-convert non-single-bb loops.
1176   unsigned NumBlocks = TheLoop->getNumBlocks();
1177   if (NumBlocks != 1 && !canVectorizeWithIfConvert()) {
1178     LLVM_DEBUG(dbgs() << "LV: Can't if-convert the loop.\n");
1179     if (DoExtraAnalysis)
1180       Result = false;
1181     else
1182       return false;
1183   }
1184 
1185   // Check if we can vectorize the instructions and CFG in this loop.
1186   if (!canVectorizeInstrs()) {
1187     LLVM_DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n");
1188     if (DoExtraAnalysis)
1189       Result = false;
1190     else
1191       return false;
1192   }
1193 
1194   // Go over each instruction and look at memory deps.
1195   if (!canVectorizeMemory()) {
1196     LLVM_DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n");
1197     if (DoExtraAnalysis)
1198       Result = false;
1199     else
1200       return false;
1201   }
1202 
1203   LLVM_DEBUG(dbgs() << "LV: We can vectorize this loop"
1204                     << (LAI->getRuntimePointerChecking()->Need
1205                             ? " (with a runtime bound check)"
1206                             : "")
1207                     << "!\n");
1208 
1209   unsigned SCEVThreshold = VectorizeSCEVCheckThreshold;
1210   if (Hints->getForce() == LoopVectorizeHints::FK_Enabled)
1211     SCEVThreshold = PragmaVectorizeSCEVCheckThreshold;
1212 
1213   if (PSE.getUnionPredicate().getComplexity() > SCEVThreshold) {
1214     reportVectorizationFailure("Too many SCEV checks needed",
1215         "Too many SCEV assumptions need to be made and checked at runtime",
1216         "TooManySCEVRunTimeChecks", ORE, TheLoop);
1217     if (DoExtraAnalysis)
1218       Result = false;
1219     else
1220       return false;
1221   }
1222 
1223   // Okay! We've done all the tests. If any have failed, return false. Otherwise
1224   // we can vectorize, and at this point we don't have any other mem analysis
1225   // which may limit our maximum vectorization factor, so just return true with
1226   // no restrictions.
1227   return Result;
1228 }
1229 
prepareToFoldTailByMasking()1230 bool LoopVectorizationLegality::prepareToFoldTailByMasking() {
1231 
1232   LLVM_DEBUG(dbgs() << "LV: checking if tail can be folded by masking.\n");
1233 
1234   SmallPtrSet<const Value *, 8> ReductionLiveOuts;
1235 
1236   for (auto &Reduction : getReductionVars())
1237     ReductionLiveOuts.insert(Reduction.second.getLoopExitInstr());
1238 
1239   // TODO: handle non-reduction outside users when tail is folded by masking.
1240   for (auto *AE : AllowedExit) {
1241     // Check that all users of allowed exit values are inside the loop or
1242     // are the live-out of a reduction.
1243     if (ReductionLiveOuts.count(AE))
1244       continue;
1245     for (User *U : AE->users()) {
1246       Instruction *UI = cast<Instruction>(U);
1247       if (TheLoop->contains(UI))
1248         continue;
1249       reportVectorizationFailure(
1250           "Cannot fold tail by masking, loop has an outside user for",
1251           "Cannot fold tail by masking in the presence of live outs.",
1252           "LiveOutFoldingTailByMasking", ORE, TheLoop, UI);
1253       return false;
1254     }
1255   }
1256 
1257   // The list of pointers that we can safely read and write to remains empty.
1258   SmallPtrSet<Value *, 8> SafePointers;
1259 
1260   // Check and mark all blocks for predication, including those that ordinarily
1261   // do not need predication such as the header block.
1262   for (BasicBlock *BB : TheLoop->blocks()) {
1263     if (!blockCanBePredicated(BB, SafePointers, /* MaskAllLoads= */ true)) {
1264       reportVectorizationFailure(
1265           "Cannot fold tail by masking as required",
1266           "control flow cannot be substituted for a select",
1267           "NoCFGForSelect", ORE, TheLoop,
1268           BB->getTerminator());
1269       return false;
1270     }
1271   }
1272 
1273   LLVM_DEBUG(dbgs() << "LV: can fold tail by masking.\n");
1274   return true;
1275 }
1276 
1277 } // namespace llvm
1278