1 //===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 ///
9 /// \file
10 /// This file contains implementations for different VPlan recipes.
11 ///
12 //===----------------------------------------------------------------------===//
13 
14 #include "VPlan.h"
15 #include "VPlanAnalysis.h"
16 #include "llvm/ADT/STLExtras.h"
17 #include "llvm/ADT/SmallVector.h"
18 #include "llvm/ADT/Twine.h"
19 #include "llvm/Analysis/IVDescriptors.h"
20 #include "llvm/IR/BasicBlock.h"
21 #include "llvm/IR/IRBuilder.h"
22 #include "llvm/IR/Instruction.h"
23 #include "llvm/IR/Instructions.h"
24 #include "llvm/IR/Type.h"
25 #include "llvm/IR/Value.h"
26 #include "llvm/Support/Casting.h"
27 #include "llvm/Support/CommandLine.h"
28 #include "llvm/Support/Debug.h"
29 #include "llvm/Support/raw_ostream.h"
30 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
31 #include "llvm/Transforms/Utils/LoopUtils.h"
32 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
33 #include <cassert>
34 
35 using namespace llvm;
36 
37 using VectorParts = SmallVector<Value *, 2>;
38 
39 namespace llvm {
40 extern cl::opt<bool> EnableVPlanNativePath;
41 }
42 
43 #define LV_NAME "loop-vectorize"
44 #define DEBUG_TYPE LV_NAME
45 
46 bool VPRecipeBase::mayWriteToMemory() const {
47   switch (getVPDefID()) {
48   case VPInterleaveSC:
49     return cast<VPInterleaveRecipe>(this)->getNumStoreOperands() > 0;
50   case VPWidenMemoryInstructionSC: {
51     return cast<VPWidenMemoryInstructionRecipe>(this)->isStore();
52   }
53   case VPReplicateSC:
54   case VPWidenCallSC:
55     return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
56         ->mayWriteToMemory();
57   case VPBranchOnMaskSC:
58   case VPScalarIVStepsSC:
59   case VPPredInstPHISC:
60     return false;
61   case VPBlendSC:
62   case VPReductionSC:
63   case VPWidenCanonicalIVSC:
64   case VPWidenCastSC:
65   case VPWidenGEPSC:
66   case VPWidenIntOrFpInductionSC:
67   case VPWidenPHISC:
68   case VPWidenSC:
69   case VPWidenSelectSC: {
70     const Instruction *I =
71         dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
72     (void)I;
73     assert((!I || !I->mayWriteToMemory()) &&
74            "underlying instruction may write to memory");
75     return false;
76   }
77   default:
78     return true;
79   }
80 }
81 
82 bool VPRecipeBase::mayReadFromMemory() const {
83   switch (getVPDefID()) {
84   case VPWidenMemoryInstructionSC: {
85     return !cast<VPWidenMemoryInstructionRecipe>(this)->isStore();
86   }
87   case VPReplicateSC:
88   case VPWidenCallSC:
89     return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
90         ->mayReadFromMemory();
91   case VPBranchOnMaskSC:
92   case VPScalarIVStepsSC:
93   case VPPredInstPHISC:
94     return false;
95   case VPBlendSC:
96   case VPReductionSC:
97   case VPWidenCanonicalIVSC:
98   case VPWidenCastSC:
99   case VPWidenGEPSC:
100   case VPWidenIntOrFpInductionSC:
101   case VPWidenPHISC:
102   case VPWidenSC:
103   case VPWidenSelectSC: {
104     const Instruction *I =
105         dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
106     (void)I;
107     assert((!I || !I->mayReadFromMemory()) &&
108            "underlying instruction may read from memory");
109     return false;
110   }
111   default:
112     return true;
113   }
114 }
115 
116 bool VPRecipeBase::mayHaveSideEffects() const {
117   switch (getVPDefID()) {
118   case VPDerivedIVSC:
119   case VPPredInstPHISC:
120     return false;
121   case VPInstructionSC:
122     switch (cast<VPInstruction>(this)->getOpcode()) {
123     case Instruction::Or:
124     case Instruction::ICmp:
125     case Instruction::Select:
126     case VPInstruction::Not:
127     case VPInstruction::CalculateTripCountMinusVF:
128     case VPInstruction::CanonicalIVIncrementForPart:
129       return false;
130     default:
131       return true;
132     }
133   case VPWidenCallSC:
134     return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
135         ->mayHaveSideEffects();
136   case VPBlendSC:
137   case VPReductionSC:
138   case VPScalarIVStepsSC:
139   case VPWidenCanonicalIVSC:
140   case VPWidenCastSC:
141   case VPWidenGEPSC:
142   case VPWidenIntOrFpInductionSC:
143   case VPWidenPHISC:
144   case VPWidenPointerInductionSC:
145   case VPWidenSC:
146   case VPWidenSelectSC: {
147     const Instruction *I =
148         dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
149     (void)I;
150     assert((!I || !I->mayHaveSideEffects()) &&
151            "underlying instruction has side-effects");
152     return false;
153   }
154   case VPInterleaveSC:
155     return mayWriteToMemory();
156   case VPWidenMemoryInstructionSC:
157     assert(cast<VPWidenMemoryInstructionRecipe>(this)
158                    ->getIngredient()
159                    .mayHaveSideEffects() == mayWriteToMemory() &&
160            "mayHaveSideffects result for ingredient differs from this "
161            "implementation");
162     return mayWriteToMemory();
163   case VPReplicateSC: {
164     auto *R = cast<VPReplicateRecipe>(this);
165     return R->getUnderlyingInstr()->mayHaveSideEffects();
166   }
167   default:
168     return true;
169   }
170 }
171 
172 void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) {
173   auto Lane = VPLane::getLastLaneForVF(State.VF);
174   VPValue *ExitValue = getOperand(0);
175   if (vputils::isUniformAfterVectorization(ExitValue))
176     Lane = VPLane::getFirstLane();
177   VPBasicBlock *MiddleVPBB =
178       cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSingleSuccessor());
179   assert(MiddleVPBB->getNumSuccessors() == 0 &&
180          "the middle block must not have any successors");
181   BasicBlock *MiddleBB = State.CFG.VPBB2IRBB[MiddleVPBB];
182   Phi->addIncoming(State.get(ExitValue, VPIteration(State.UF - 1, Lane)),
183                    MiddleBB);
184 }
185 
186 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
187 void VPLiveOut::print(raw_ostream &O, VPSlotTracker &SlotTracker) const {
188   O << "Live-out ";
189   getPhi()->printAsOperand(O);
190   O << " = ";
191   getOperand(0)->printAsOperand(O, SlotTracker);
192   O << "\n";
193 }
194 #endif
195 
196 void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
197   assert(!Parent && "Recipe already in some VPBasicBlock");
198   assert(InsertPos->getParent() &&
199          "Insertion position not in any VPBasicBlock");
200   Parent = InsertPos->getParent();
201   Parent->getRecipeList().insert(InsertPos->getIterator(), this);
202 }
203 
204 void VPRecipeBase::insertBefore(VPBasicBlock &BB,
205                                 iplist<VPRecipeBase>::iterator I) {
206   assert(!Parent && "Recipe already in some VPBasicBlock");
207   assert(I == BB.end() || I->getParent() == &BB);
208   Parent = &BB;
209   BB.getRecipeList().insert(I, this);
210 }
211 
212 void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) {
213   assert(!Parent && "Recipe already in some VPBasicBlock");
214   assert(InsertPos->getParent() &&
215          "Insertion position not in any VPBasicBlock");
216   Parent = InsertPos->getParent();
217   Parent->getRecipeList().insertAfter(InsertPos->getIterator(), this);
218 }
219 
220 void VPRecipeBase::removeFromParent() {
221   assert(getParent() && "Recipe not in any VPBasicBlock");
222   getParent()->getRecipeList().remove(getIterator());
223   Parent = nullptr;
224 }
225 
226 iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() {
227   assert(getParent() && "Recipe not in any VPBasicBlock");
228   return getParent()->getRecipeList().erase(getIterator());
229 }
230 
231 void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) {
232   removeFromParent();
233   insertAfter(InsertPos);
234 }
235 
236 void VPRecipeBase::moveBefore(VPBasicBlock &BB,
237                               iplist<VPRecipeBase>::iterator I) {
238   removeFromParent();
239   insertBefore(BB, I);
240 }
241 
242 FastMathFlags VPRecipeWithIRFlags::getFastMathFlags() const {
243   assert(OpType == OperationType::FPMathOp &&
244          "recipe doesn't have fast math flags");
245   FastMathFlags Res;
246   Res.setAllowReassoc(FMFs.AllowReassoc);
247   Res.setNoNaNs(FMFs.NoNaNs);
248   Res.setNoInfs(FMFs.NoInfs);
249   Res.setNoSignedZeros(FMFs.NoSignedZeros);
250   Res.setAllowReciprocal(FMFs.AllowReciprocal);
251   Res.setAllowContract(FMFs.AllowContract);
252   Res.setApproxFunc(FMFs.ApproxFunc);
253   return Res;
254 }
255 
256 VPInstruction::VPInstruction(unsigned Opcode, CmpInst::Predicate Pred,
257                              VPValue *A, VPValue *B, DebugLoc DL,
258                              const Twine &Name)
259     : VPRecipeWithIRFlags(VPDef::VPInstructionSC, ArrayRef<VPValue *>({A, B}),
260                           Pred, DL),
261       Opcode(Opcode), Name(Name.str()) {
262   assert(Opcode == Instruction::ICmp &&
263          "only ICmp predicates supported at the moment");
264 }
265 
266 VPInstruction::VPInstruction(unsigned Opcode,
267                              std::initializer_list<VPValue *> Operands,
268                              FastMathFlags FMFs, DebugLoc DL, const Twine &Name)
269     : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, FMFs, DL),
270       Opcode(Opcode), Name(Name.str()) {
271   // Make sure the VPInstruction is a floating-point operation.
272   assert(isFPMathOp() && "this op can't take fast-math flags");
273 }
274 
275 Value *VPInstruction::generateInstruction(VPTransformState &State,
276                                           unsigned Part) {
277   IRBuilderBase &Builder = State.Builder;
278   Builder.SetCurrentDebugLocation(getDebugLoc());
279 
280   if (Instruction::isBinaryOp(getOpcode())) {
281     if (Part != 0 && vputils::onlyFirstPartUsed(this))
282       return State.get(this, 0);
283 
284     Value *A = State.get(getOperand(0), Part);
285     Value *B = State.get(getOperand(1), Part);
286     auto *Res =
287         Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name);
288     if (auto *I = dyn_cast<Instruction>(Res))
289       setFlags(I);
290     return Res;
291   }
292 
293   switch (getOpcode()) {
294   case VPInstruction::Not: {
295     Value *A = State.get(getOperand(0), Part);
296     return Builder.CreateNot(A, Name);
297   }
298   case Instruction::ICmp: {
299     Value *A = State.get(getOperand(0), Part);
300     Value *B = State.get(getOperand(1), Part);
301     return Builder.CreateCmp(getPredicate(), A, B, Name);
302   }
303   case Instruction::Select: {
304     Value *Cond = State.get(getOperand(0), Part);
305     Value *Op1 = State.get(getOperand(1), Part);
306     Value *Op2 = State.get(getOperand(2), Part);
307     return Builder.CreateSelect(Cond, Op1, Op2, Name);
308   }
309   case VPInstruction::ActiveLaneMask: {
310     // Get first lane of vector induction variable.
311     Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0));
312     // Get the original loop tripcount.
313     Value *ScalarTC = State.get(getOperand(1), VPIteration(Part, 0));
314 
315     auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
316     auto *PredTy = VectorType::get(Int1Ty, State.VF);
317     return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
318                                    {PredTy, ScalarTC->getType()},
319                                    {VIVElem0, ScalarTC}, nullptr, Name);
320   }
321   case VPInstruction::FirstOrderRecurrenceSplice: {
322     // Generate code to combine the previous and current values in vector v3.
323     //
324     //   vector.ph:
325     //     v_init = vector(..., ..., ..., a[-1])
326     //     br vector.body
327     //
328     //   vector.body
329     //     i = phi [0, vector.ph], [i+4, vector.body]
330     //     v1 = phi [v_init, vector.ph], [v2, vector.body]
331     //     v2 = a[i, i+1, i+2, i+3];
332     //     v3 = vector(v1(3), v2(0, 1, 2))
333 
334     // For the first part, use the recurrence phi (v1), otherwise v2.
335     auto *V1 = State.get(getOperand(0), 0);
336     Value *PartMinus1 = Part == 0 ? V1 : State.get(getOperand(1), Part - 1);
337     if (!PartMinus1->getType()->isVectorTy())
338       return PartMinus1;
339     Value *V2 = State.get(getOperand(1), Part);
340     return Builder.CreateVectorSplice(PartMinus1, V2, -1, Name);
341   }
342   case VPInstruction::CalculateTripCountMinusVF: {
343     Value *ScalarTC = State.get(getOperand(0), {0, 0});
344     Value *Step =
345         createStepForVF(Builder, ScalarTC->getType(), State.VF, State.UF);
346     Value *Sub = Builder.CreateSub(ScalarTC, Step);
347     Value *Cmp = Builder.CreateICmp(CmpInst::Predicate::ICMP_UGT, ScalarTC, Step);
348     Value *Zero = ConstantInt::get(ScalarTC->getType(), 0);
349     return Builder.CreateSelect(Cmp, Sub, Zero);
350   }
351   case VPInstruction::CanonicalIVIncrementForPart: {
352     auto *IV = State.get(getOperand(0), VPIteration(0, 0));
353     if (Part == 0)
354       return IV;
355 
356     // The canonical IV is incremented by the vectorization factor (num of SIMD
357     // elements) times the unroll part.
358     Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part);
359     return Builder.CreateAdd(IV, Step, Name, hasNoUnsignedWrap(),
360                              hasNoSignedWrap());
361   }
362   case VPInstruction::BranchOnCond: {
363     if (Part != 0)
364       return nullptr;
365 
366     Value *Cond = State.get(getOperand(0), VPIteration(Part, 0));
367     VPRegionBlock *ParentRegion = getParent()->getParent();
368     VPBasicBlock *Header = ParentRegion->getEntryBasicBlock();
369 
370     // Replace the temporary unreachable terminator with a new conditional
371     // branch, hooking it up to backward destination for exiting blocks now and
372     // to forward destination(s) later when they are created.
373     BranchInst *CondBr =
374         Builder.CreateCondBr(Cond, Builder.GetInsertBlock(), nullptr);
375 
376     if (getParent()->isExiting())
377       CondBr->setSuccessor(1, State.CFG.VPBB2IRBB[Header]);
378 
379     CondBr->setSuccessor(0, nullptr);
380     Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
381     return CondBr;
382   }
383   case VPInstruction::BranchOnCount: {
384     if (Part != 0)
385       return nullptr;
386     // First create the compare.
387     Value *IV = State.get(getOperand(0), Part);
388     Value *TC = State.get(getOperand(1), Part);
389     Value *Cond = Builder.CreateICmpEQ(IV, TC);
390 
391     // Now create the branch.
392     auto *Plan = getParent()->getPlan();
393     VPRegionBlock *TopRegion = Plan->getVectorLoopRegion();
394     VPBasicBlock *Header = TopRegion->getEntry()->getEntryBasicBlock();
395 
396     // Replace the temporary unreachable terminator with a new conditional
397     // branch, hooking it up to backward destination (the header) now and to the
398     // forward destination (the exit/middle block) later when it is created.
399     // Note that CreateCondBr expects a valid BB as first argument, so we need
400     // to set it to nullptr later.
401     BranchInst *CondBr = Builder.CreateCondBr(Cond, Builder.GetInsertBlock(),
402                                               State.CFG.VPBB2IRBB[Header]);
403     CondBr->setSuccessor(0, nullptr);
404     Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
405     return CondBr;
406   }
407   case VPInstruction::ComputeReductionResult: {
408     if (Part != 0)
409       return State.get(this, 0);
410 
411     // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary
412     // and will be removed by breaking up the recipe further.
413     auto *PhiR = cast<VPReductionPHIRecipe>(getOperand(0));
414     auto *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue());
415     // Get its reduction variable descriptor.
416     const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();
417 
418     RecurKind RK = RdxDesc.getRecurrenceKind();
419 
420     State.setDebugLocFrom(getDebugLoc());
421 
422     VPValue *LoopExitingDef = getOperand(1);
423     Type *PhiTy = OrigPhi->getType();
424     VectorParts RdxParts(State.UF);
425     for (unsigned Part = 0; Part < State.UF; ++Part)
426       RdxParts[Part] = State.get(LoopExitingDef, Part);
427 
428     // If the vector reduction can be performed in a smaller type, we truncate
429     // then extend the loop exit value to enable InstCombine to evaluate the
430     // entire expression in the smaller type.
431     // TODO: Handle this in truncateToMinBW.
432     if (State.VF.isVector() && PhiTy != RdxDesc.getRecurrenceType()) {
433       Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), State.VF);
434       for (unsigned Part = 0; Part < State.UF; ++Part)
435         RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
436     }
437     // Reduce all of the unrolled parts into a single vector.
438     Value *ReducedPartRdx = RdxParts[0];
439     unsigned Op = RecurrenceDescriptor::getOpcode(RK);
440 
441     if (PhiR->isOrdered()) {
442       ReducedPartRdx = RdxParts[State.UF - 1];
443     } else {
444       // Floating-point operations should have some FMF to enable the reduction.
445       IRBuilderBase::FastMathFlagGuard FMFG(Builder);
446       Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
447       for (unsigned Part = 1; Part < State.UF; ++Part) {
448         Value *RdxPart = RdxParts[Part];
449         if (Op != Instruction::ICmp && Op != Instruction::FCmp)
450           ReducedPartRdx = Builder.CreateBinOp(
451               (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
452         else if (RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
453           TrackingVH<Value> ReductionStartValue =
454               RdxDesc.getRecurrenceStartValue();
455           ReducedPartRdx = createAnyOfOp(Builder, ReductionStartValue, RK,
456                                          ReducedPartRdx, RdxPart);
457         } else
458           ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
459       }
460     }
461 
462     // Create the reduction after the loop. Note that inloop reductions create
463     // the target reduction in the loop using a Reduction recipe.
464     if (State.VF.isVector() && !PhiR->isInLoop()) {
465       ReducedPartRdx =
466           createTargetReduction(Builder, RdxDesc, ReducedPartRdx, OrigPhi);
467       // If the reduction can be performed in a smaller type, we need to extend
468       // the reduction to the wider type before we branch to the original loop.
469       if (PhiTy != RdxDesc.getRecurrenceType())
470         ReducedPartRdx = RdxDesc.isSigned()
471                              ? Builder.CreateSExt(ReducedPartRdx, PhiTy)
472                              : Builder.CreateZExt(ReducedPartRdx, PhiTy);
473     }
474 
475     // If there were stores of the reduction value to a uniform memory address
476     // inside the loop, create the final store here.
477     if (StoreInst *SI = RdxDesc.IntermediateStore) {
478       auto *NewSI = Builder.CreateAlignedStore(
479           ReducedPartRdx, SI->getPointerOperand(), SI->getAlign());
480       propagateMetadata(NewSI, SI);
481     }
482 
483     return ReducedPartRdx;
484   }
485   default:
486     llvm_unreachable("Unsupported opcode for instruction");
487   }
488 }
489 
490 #if !defined(NDEBUG)
491 bool VPInstruction::isFPMathOp() const {
492   // Inspired by FPMathOperator::classof. Notable differences are that we don't
493   // support Call, PHI and Select opcodes here yet.
494   return Opcode == Instruction::FAdd || Opcode == Instruction::FMul ||
495          Opcode == Instruction::FNeg || Opcode == Instruction::FSub ||
496          Opcode == Instruction::FDiv || Opcode == Instruction::FRem ||
497          Opcode == Instruction::FCmp || Opcode == Instruction::Select;
498 }
499 #endif
500 
501 void VPInstruction::execute(VPTransformState &State) {
502   assert(!State.Instance && "VPInstruction executing an Instance");
503   IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
504   assert((hasFastMathFlags() == isFPMathOp() ||
505           getOpcode() == Instruction::Select) &&
506          "Recipe not a FPMathOp but has fast-math flags?");
507   if (hasFastMathFlags())
508     State.Builder.setFastMathFlags(getFastMathFlags());
509   for (unsigned Part = 0; Part < State.UF; ++Part) {
510     Value *GeneratedValue = generateInstruction(State, Part);
511     if (!hasResult())
512       continue;
513     assert(GeneratedValue && "generateInstruction must produce a value");
514     State.set(this, GeneratedValue, Part);
515   }
516 }
517 
518 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
519 void VPInstruction::dump() const {
520   VPSlotTracker SlotTracker(getParent()->getPlan());
521   print(dbgs(), "", SlotTracker);
522 }
523 
524 void VPInstruction::print(raw_ostream &O, const Twine &Indent,
525                           VPSlotTracker &SlotTracker) const {
526   O << Indent << "EMIT ";
527 
528   if (hasResult()) {
529     printAsOperand(O, SlotTracker);
530     O << " = ";
531   }
532 
533   switch (getOpcode()) {
534   case VPInstruction::Not:
535     O << "not";
536     break;
537   case VPInstruction::SLPLoad:
538     O << "combined load";
539     break;
540   case VPInstruction::SLPStore:
541     O << "combined store";
542     break;
543   case VPInstruction::ActiveLaneMask:
544     O << "active lane mask";
545     break;
546   case VPInstruction::FirstOrderRecurrenceSplice:
547     O << "first-order splice";
548     break;
549   case VPInstruction::BranchOnCond:
550     O << "branch-on-cond";
551     break;
552   case VPInstruction::CalculateTripCountMinusVF:
553     O << "TC > VF ? TC - VF : 0";
554     break;
555   case VPInstruction::CanonicalIVIncrementForPart:
556     O << "VF * Part +";
557     break;
558   case VPInstruction::BranchOnCount:
559     O << "branch-on-count";
560     break;
561   case VPInstruction::ComputeReductionResult:
562     O << "compute-reduction-result";
563     break;
564   default:
565     O << Instruction::getOpcodeName(getOpcode());
566   }
567 
568   printFlags(O);
569   printOperands(O, SlotTracker);
570 
571   if (auto DL = getDebugLoc()) {
572     O << ", !dbg ";
573     DL.print(O);
574   }
575 }
576 #endif
577 
578 void VPWidenCallRecipe::execute(VPTransformState &State) {
579   assert(State.VF.isVector() && "not widening");
580   auto &CI = *cast<CallInst>(getUnderlyingInstr());
581   assert(!isa<DbgInfoIntrinsic>(CI) &&
582          "DbgInfoIntrinsic should have been dropped during VPlan construction");
583   State.setDebugLocFrom(getDebugLoc());
584 
585   bool UseIntrinsic = VectorIntrinsicID != Intrinsic::not_intrinsic;
586   FunctionType *VFTy = nullptr;
587   if (Variant)
588     VFTy = Variant->getFunctionType();
589   for (unsigned Part = 0; Part < State.UF; ++Part) {
590     SmallVector<Type *, 2> TysForDecl;
591     // Add return type if intrinsic is overloaded on it.
592     if (UseIntrinsic &&
593         isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, -1))
594       TysForDecl.push_back(
595           VectorType::get(CI.getType()->getScalarType(), State.VF));
596     SmallVector<Value *, 4> Args;
597     for (const auto &I : enumerate(operands())) {
598       // Some intrinsics have a scalar argument - don't replace it with a
599       // vector.
600       Value *Arg;
601       if (UseIntrinsic &&
602           isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index()))
603         Arg = State.get(I.value(), VPIteration(0, 0));
604       // Some vectorized function variants may also take a scalar argument,
605       // e.g. linear parameters for pointers. This needs to be the scalar value
606       // from the start of the respective part when interleaving.
607       else if (VFTy && !VFTy->getParamType(I.index())->isVectorTy())
608         Arg = State.get(I.value(), VPIteration(Part, 0));
609       else
610         Arg = State.get(I.value(), Part);
611       if (UseIntrinsic &&
612           isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index()))
613         TysForDecl.push_back(Arg->getType());
614       Args.push_back(Arg);
615     }
616 
617     Function *VectorF;
618     if (UseIntrinsic) {
619       // Use vector version of the intrinsic.
620       Module *M = State.Builder.GetInsertBlock()->getModule();
621       VectorF = Intrinsic::getDeclaration(M, VectorIntrinsicID, TysForDecl);
622       assert(VectorF && "Can't retrieve vector intrinsic.");
623     } else {
624 #ifndef NDEBUG
625       assert(Variant != nullptr && "Can't create vector function.");
626 #endif
627       VectorF = Variant;
628     }
629 
630     SmallVector<OperandBundleDef, 1> OpBundles;
631     CI.getOperandBundlesAsDefs(OpBundles);
632     CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles);
633 
634     if (isa<FPMathOperator>(V))
635       V->copyFastMathFlags(&CI);
636 
637     State.set(this, V, Part);
638     State.addMetadata(V, &CI);
639   }
640 }
641 
642 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
643 void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent,
644                               VPSlotTracker &SlotTracker) const {
645   O << Indent << "WIDEN-CALL ";
646 
647   auto *CI = cast<CallInst>(getUnderlyingInstr());
648   if (CI->getType()->isVoidTy())
649     O << "void ";
650   else {
651     printAsOperand(O, SlotTracker);
652     O << " = ";
653   }
654 
655   O << "call @" << CI->getCalledFunction()->getName() << "(";
656   printOperands(O, SlotTracker);
657   O << ")";
658 
659   if (VectorIntrinsicID)
660     O << " (using vector intrinsic)";
661   else {
662     O << " (using library function";
663     if (Variant->hasName())
664       O << ": " << Variant->getName();
665     O << ")";
666   }
667 }
668 
669 void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
670                                 VPSlotTracker &SlotTracker) const {
671   O << Indent << "WIDEN-SELECT ";
672   printAsOperand(O, SlotTracker);
673   O << " = select ";
674   getOperand(0)->printAsOperand(O, SlotTracker);
675   O << ", ";
676   getOperand(1)->printAsOperand(O, SlotTracker);
677   O << ", ";
678   getOperand(2)->printAsOperand(O, SlotTracker);
679   O << (isInvariantCond() ? " (condition is loop invariant)" : "");
680 }
681 #endif
682 
683 void VPWidenSelectRecipe::execute(VPTransformState &State) {
684   State.setDebugLocFrom(getDebugLoc());
685 
686   // The condition can be loop invariant but still defined inside the
687   // loop. This means that we can't just use the original 'cond' value.
688   // We have to take the 'vectorized' value and pick the first lane.
689   // Instcombine will make this a no-op.
690   auto *InvarCond =
691       isInvariantCond() ? State.get(getCond(), VPIteration(0, 0)) : nullptr;
692 
693   for (unsigned Part = 0; Part < State.UF; ++Part) {
694     Value *Cond = InvarCond ? InvarCond : State.get(getCond(), Part);
695     Value *Op0 = State.get(getOperand(1), Part);
696     Value *Op1 = State.get(getOperand(2), Part);
697     Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
698     State.set(this, Sel, Part);
699     State.addMetadata(Sel, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
700   }
701 }
702 
703 VPRecipeWithIRFlags::FastMathFlagsTy::FastMathFlagsTy(
704     const FastMathFlags &FMF) {
705   AllowReassoc = FMF.allowReassoc();
706   NoNaNs = FMF.noNaNs();
707   NoInfs = FMF.noInfs();
708   NoSignedZeros = FMF.noSignedZeros();
709   AllowReciprocal = FMF.allowReciprocal();
710   AllowContract = FMF.allowContract();
711   ApproxFunc = FMF.approxFunc();
712 }
713 
714 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
715 void VPRecipeWithIRFlags::printFlags(raw_ostream &O) const {
716   switch (OpType) {
717   case OperationType::Cmp:
718     O << " " << CmpInst::getPredicateName(getPredicate());
719     break;
720   case OperationType::DisjointOp:
721     if (DisjointFlags.IsDisjoint)
722       O << " disjoint";
723     break;
724   case OperationType::PossiblyExactOp:
725     if (ExactFlags.IsExact)
726       O << " exact";
727     break;
728   case OperationType::OverflowingBinOp:
729     if (WrapFlags.HasNUW)
730       O << " nuw";
731     if (WrapFlags.HasNSW)
732       O << " nsw";
733     break;
734   case OperationType::FPMathOp:
735     getFastMathFlags().print(O);
736     break;
737   case OperationType::GEPOp:
738     if (GEPFlags.IsInBounds)
739       O << " inbounds";
740     break;
741   case OperationType::NonNegOp:
742     if (NonNegFlags.NonNeg)
743       O << " nneg";
744     break;
745   case OperationType::Other:
746     break;
747   }
748   if (getNumOperands() > 0)
749     O << " ";
750 }
751 #endif
752 
753 void VPWidenRecipe::execute(VPTransformState &State) {
754   State.setDebugLocFrom(getDebugLoc());
755   auto &Builder = State.Builder;
756   switch (Opcode) {
757   case Instruction::Call:
758   case Instruction::Br:
759   case Instruction::PHI:
760   case Instruction::GetElementPtr:
761   case Instruction::Select:
762     llvm_unreachable("This instruction is handled by a different recipe.");
763   case Instruction::UDiv:
764   case Instruction::SDiv:
765   case Instruction::SRem:
766   case Instruction::URem:
767   case Instruction::Add:
768   case Instruction::FAdd:
769   case Instruction::Sub:
770   case Instruction::FSub:
771   case Instruction::FNeg:
772   case Instruction::Mul:
773   case Instruction::FMul:
774   case Instruction::FDiv:
775   case Instruction::FRem:
776   case Instruction::Shl:
777   case Instruction::LShr:
778   case Instruction::AShr:
779   case Instruction::And:
780   case Instruction::Or:
781   case Instruction::Xor: {
782     // Just widen unops and binops.
783     for (unsigned Part = 0; Part < State.UF; ++Part) {
784       SmallVector<Value *, 2> Ops;
785       for (VPValue *VPOp : operands())
786         Ops.push_back(State.get(VPOp, Part));
787 
788       Value *V = Builder.CreateNAryOp(Opcode, Ops);
789 
790       if (auto *VecOp = dyn_cast<Instruction>(V))
791         setFlags(VecOp);
792 
793       // Use this vector value for all users of the original instruction.
794       State.set(this, V, Part);
795       State.addMetadata(V, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
796     }
797 
798     break;
799   }
800   case Instruction::Freeze: {
801     for (unsigned Part = 0; Part < State.UF; ++Part) {
802       Value *Op = State.get(getOperand(0), Part);
803 
804       Value *Freeze = Builder.CreateFreeze(Op);
805       State.set(this, Freeze, Part);
806     }
807     break;
808   }
809   case Instruction::ICmp:
810   case Instruction::FCmp: {
811     // Widen compares. Generate vector compares.
812     bool FCmp = Opcode == Instruction::FCmp;
813     for (unsigned Part = 0; Part < State.UF; ++Part) {
814       Value *A = State.get(getOperand(0), Part);
815       Value *B = State.get(getOperand(1), Part);
816       Value *C = nullptr;
817       if (FCmp) {
818         // Propagate fast math flags.
819         IRBuilder<>::FastMathFlagGuard FMFG(Builder);
820         if (auto *I = dyn_cast_or_null<Instruction>(getUnderlyingValue()))
821           Builder.setFastMathFlags(I->getFastMathFlags());
822         C = Builder.CreateFCmp(getPredicate(), A, B);
823       } else {
824         C = Builder.CreateICmp(getPredicate(), A, B);
825       }
826       State.set(this, C, Part);
827       State.addMetadata(C, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
828     }
829 
830     break;
831   }
832   default:
833     // This instruction is not vectorized by simple widening.
834     LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : "
835                       << Instruction::getOpcodeName(Opcode));
836     llvm_unreachable("Unhandled instruction!");
837   } // end of switch.
838 
839 #if !defined(NDEBUG)
840   // Verify that VPlan type inference results agree with the type of the
841   // generated values.
842   for (unsigned Part = 0; Part < State.UF; ++Part) {
843     assert(VectorType::get(State.TypeAnalysis.inferScalarType(this),
844                            State.VF) == State.get(this, Part)->getType() &&
845            "inferred type and type from generated instructions do not match");
846   }
847 #endif
848 }
849 
850 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
851 void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
852                           VPSlotTracker &SlotTracker) const {
853   O << Indent << "WIDEN ";
854   printAsOperand(O, SlotTracker);
855   O << " = " << Instruction::getOpcodeName(Opcode);
856   printFlags(O);
857   printOperands(O, SlotTracker);
858 }
859 #endif
860 
861 void VPWidenCastRecipe::execute(VPTransformState &State) {
862   State.setDebugLocFrom(getDebugLoc());
863   auto &Builder = State.Builder;
864   /// Vectorize casts.
865   assert(State.VF.isVector() && "Not vectorizing?");
866   Type *DestTy = VectorType::get(getResultType(), State.VF);
867   VPValue *Op = getOperand(0);
868   for (unsigned Part = 0; Part < State.UF; ++Part) {
869     if (Part > 0 && Op->isLiveIn()) {
870       // FIXME: Remove once explicit unrolling is implemented using VPlan.
871       State.set(this, State.get(this, 0), Part);
872       continue;
873     }
874     Value *A = State.get(Op, Part);
875     Value *Cast = Builder.CreateCast(Instruction::CastOps(Opcode), A, DestTy);
876     State.set(this, Cast, Part);
877     State.addMetadata(Cast, cast_or_null<Instruction>(getUnderlyingValue()));
878   }
879 }
880 
881 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
882 void VPWidenCastRecipe::print(raw_ostream &O, const Twine &Indent,
883                               VPSlotTracker &SlotTracker) const {
884   O << Indent << "WIDEN-CAST ";
885   printAsOperand(O, SlotTracker);
886   O << " = " << Instruction::getOpcodeName(Opcode) << " ";
887   printFlags(O);
888   printOperands(O, SlotTracker);
889   O << " to " << *getResultType();
890 }
891 #endif
892 
893 /// This function adds
894 /// (StartIdx * Step, (StartIdx + 1) * Step, (StartIdx + 2) * Step, ...)
895 /// to each vector element of Val. The sequence starts at StartIndex.
896 /// \p Opcode is relevant for FP induction variable.
897 static Value *getStepVector(Value *Val, Value *StartIdx, Value *Step,
898                             Instruction::BinaryOps BinOp, ElementCount VF,
899                             IRBuilderBase &Builder) {
900   assert(VF.isVector() && "only vector VFs are supported");
901 
902   // Create and check the types.
903   auto *ValVTy = cast<VectorType>(Val->getType());
904   ElementCount VLen = ValVTy->getElementCount();
905 
906   Type *STy = Val->getType()->getScalarType();
907   assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
908          "Induction Step must be an integer or FP");
909   assert(Step->getType() == STy && "Step has wrong type");
910 
911   SmallVector<Constant *, 8> Indices;
912 
913   // Create a vector of consecutive numbers from zero to VF.
914   VectorType *InitVecValVTy = ValVTy;
915   if (STy->isFloatingPointTy()) {
916     Type *InitVecValSTy =
917         IntegerType::get(STy->getContext(), STy->getScalarSizeInBits());
918     InitVecValVTy = VectorType::get(InitVecValSTy, VLen);
919   }
920   Value *InitVec = Builder.CreateStepVector(InitVecValVTy);
921 
922   // Splat the StartIdx
923   Value *StartIdxSplat = Builder.CreateVectorSplat(VLen, StartIdx);
924 
925   if (STy->isIntegerTy()) {
926     InitVec = Builder.CreateAdd(InitVec, StartIdxSplat);
927     Step = Builder.CreateVectorSplat(VLen, Step);
928     assert(Step->getType() == Val->getType() && "Invalid step vec");
929     // FIXME: The newly created binary instructions should contain nsw/nuw
930     // flags, which can be found from the original scalar operations.
931     Step = Builder.CreateMul(InitVec, Step);
932     return Builder.CreateAdd(Val, Step, "induction");
933   }
934 
935   // Floating point induction.
936   assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
937          "Binary Opcode should be specified for FP induction");
938   InitVec = Builder.CreateUIToFP(InitVec, ValVTy);
939   InitVec = Builder.CreateFAdd(InitVec, StartIdxSplat);
940 
941   Step = Builder.CreateVectorSplat(VLen, Step);
942   Value *MulOp = Builder.CreateFMul(InitVec, Step);
943   return Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
944 }
945 
946 /// A helper function that returns an integer or floating-point constant with
947 /// value C.
948 static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
949   return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
950                            : ConstantFP::get(Ty, C);
951 }
952 
953 static Value *getRuntimeVFAsFloat(IRBuilderBase &B, Type *FTy,
954                                   ElementCount VF) {
955   assert(FTy->isFloatingPointTy() && "Expected floating point type!");
956   Type *IntTy = IntegerType::get(FTy->getContext(), FTy->getScalarSizeInBits());
957   Value *RuntimeVF = getRuntimeVF(B, IntTy, VF);
958   return B.CreateUIToFP(RuntimeVF, FTy);
959 }
960 
961 void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
962   assert(!State.Instance && "Int or FP induction being replicated.");
963 
964   Value *Start = getStartValue()->getLiveInIRValue();
965   const InductionDescriptor &ID = getInductionDescriptor();
966   TruncInst *Trunc = getTruncInst();
967   IRBuilderBase &Builder = State.Builder;
968   assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
969   assert(State.VF.isVector() && "must have vector VF");
970 
971   // The value from the original loop to which we are mapping the new induction
972   // variable.
973   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
974 
975   // Fast-math-flags propagate from the original induction instruction.
976   IRBuilder<>::FastMathFlagGuard FMFG(Builder);
977   if (ID.getInductionBinOp() && isa<FPMathOperator>(ID.getInductionBinOp()))
978     Builder.setFastMathFlags(ID.getInductionBinOp()->getFastMathFlags());
979 
980   // Now do the actual transformations, and start with fetching the step value.
981   Value *Step = State.get(getStepValue(), VPIteration(0, 0));
982 
983   assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
984          "Expected either an induction phi-node or a truncate of it!");
985 
986   // Construct the initial value of the vector IV in the vector loop preheader
987   auto CurrIP = Builder.saveIP();
988   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
989   Builder.SetInsertPoint(VectorPH->getTerminator());
990   if (isa<TruncInst>(EntryVal)) {
991     assert(Start->getType()->isIntegerTy() &&
992            "Truncation requires an integer type");
993     auto *TruncType = cast<IntegerType>(EntryVal->getType());
994     Step = Builder.CreateTrunc(Step, TruncType);
995     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
996   }
997 
998   Value *Zero = getSignedIntOrFpConstant(Start->getType(), 0);
999   Value *SplatStart = Builder.CreateVectorSplat(State.VF, Start);
1000   Value *SteppedStart = getStepVector(
1001       SplatStart, Zero, Step, ID.getInductionOpcode(), State.VF, State.Builder);
1002 
1003   // We create vector phi nodes for both integer and floating-point induction
1004   // variables. Here, we determine the kind of arithmetic we will perform.
1005   Instruction::BinaryOps AddOp;
1006   Instruction::BinaryOps MulOp;
1007   if (Step->getType()->isIntegerTy()) {
1008     AddOp = Instruction::Add;
1009     MulOp = Instruction::Mul;
1010   } else {
1011     AddOp = ID.getInductionOpcode();
1012     MulOp = Instruction::FMul;
1013   }
1014 
1015   // Multiply the vectorization factor by the step using integer or
1016   // floating-point arithmetic as appropriate.
1017   Type *StepType = Step->getType();
1018   Value *RuntimeVF;
1019   if (Step->getType()->isFloatingPointTy())
1020     RuntimeVF = getRuntimeVFAsFloat(Builder, StepType, State.VF);
1021   else
1022     RuntimeVF = getRuntimeVF(Builder, StepType, State.VF);
1023   Value *Mul = Builder.CreateBinOp(MulOp, Step, RuntimeVF);
1024 
1025   // Create a vector splat to use in the induction update.
1026   //
1027   // FIXME: If the step is non-constant, we create the vector splat with
1028   //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
1029   //        handle a constant vector splat.
1030   Value *SplatVF = isa<Constant>(Mul)
1031                        ? ConstantVector::getSplat(State.VF, cast<Constant>(Mul))
1032                        : Builder.CreateVectorSplat(State.VF, Mul);
1033   Builder.restoreIP(CurrIP);
1034 
1035   // We may need to add the step a number of times, depending on the unroll
1036   // factor. The last of those goes into the PHI.
1037   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind");
1038   VecInd->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
1039   VecInd->setDebugLoc(EntryVal->getDebugLoc());
1040   Instruction *LastInduction = VecInd;
1041   for (unsigned Part = 0; Part < State.UF; ++Part) {
1042     State.set(this, LastInduction, Part);
1043 
1044     if (isa<TruncInst>(EntryVal))
1045       State.addMetadata(LastInduction, EntryVal);
1046 
1047     LastInduction = cast<Instruction>(
1048         Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add"));
1049     LastInduction->setDebugLoc(EntryVal->getDebugLoc());
1050   }
1051 
1052   LastInduction->setName("vec.ind.next");
1053   VecInd->addIncoming(SteppedStart, VectorPH);
1054   // Add induction update using an incorrect block temporarily. The phi node
1055   // will be fixed after VPlan execution. Note that at this point the latch
1056   // block cannot be used, as it does not exist yet.
1057   // TODO: Model increment value in VPlan, by turning the recipe into a
1058   // multi-def and a subclass of VPHeaderPHIRecipe.
1059   VecInd->addIncoming(LastInduction, VectorPH);
1060 }
1061 
1062 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1063 void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent,
1064                                           VPSlotTracker &SlotTracker) const {
1065   O << Indent << "WIDEN-INDUCTION";
1066   if (getTruncInst()) {
1067     O << "\\l\"";
1068     O << " +\n" << Indent << "\"  " << VPlanIngredient(IV) << "\\l\"";
1069     O << " +\n" << Indent << "\"  ";
1070     getVPValue(0)->printAsOperand(O, SlotTracker);
1071   } else
1072     O << " " << VPlanIngredient(IV);
1073 
1074   O << ", ";
1075   getStepValue()->printAsOperand(O, SlotTracker);
1076 }
1077 #endif
1078 
1079 bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
1080   // The step may be defined by a recipe in the preheader (e.g. if it requires
1081   // SCEV expansion), but for the canonical induction the step is required to be
1082   // 1, which is represented as live-in.
1083   if (getStepValue()->getDefiningRecipe())
1084     return false;
1085   auto *StepC = dyn_cast<ConstantInt>(getStepValue()->getLiveInIRValue());
1086   auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue());
1087   return StartC && StartC->isZero() && StepC && StepC->isOne();
1088 }
1089 
1090 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1091 void VPDerivedIVRecipe::print(raw_ostream &O, const Twine &Indent,
1092                               VPSlotTracker &SlotTracker) const {
1093   O << Indent;
1094   printAsOperand(O, SlotTracker);
1095   O << Indent << "= DERIVED-IV ";
1096   getStartValue()->printAsOperand(O, SlotTracker);
1097   O << " + ";
1098   getCanonicalIV()->printAsOperand(O, SlotTracker);
1099   O << " * ";
1100   getStepValue()->printAsOperand(O, SlotTracker);
1101 
1102   if (TruncResultTy)
1103     O << " (truncated to " << *TruncResultTy << ")";
1104 }
1105 #endif
1106 
1107 void VPScalarIVStepsRecipe::execute(VPTransformState &State) {
1108   // Fast-math-flags propagate from the original induction instruction.
1109   IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
1110   if (hasFastMathFlags())
1111     State.Builder.setFastMathFlags(getFastMathFlags());
1112 
1113   /// Compute scalar induction steps. \p ScalarIV is the scalar induction
1114   /// variable on which to base the steps, \p Step is the size of the step.
1115 
1116   Value *BaseIV = State.get(getOperand(0), VPIteration(0, 0));
1117   Value *Step = State.get(getStepValue(), VPIteration(0, 0));
1118   IRBuilderBase &Builder = State.Builder;
1119 
1120   // Ensure step has the same type as that of scalar IV.
1121   Type *BaseIVTy = BaseIV->getType()->getScalarType();
1122   if (BaseIVTy != Step->getType()) {
1123     // TODO: Also use VPDerivedIVRecipe when only the step needs truncating, to
1124     // avoid separate truncate here.
1125     assert(Step->getType()->isIntegerTy() &&
1126            "Truncation requires an integer step");
1127     Step = State.Builder.CreateTrunc(Step, BaseIVTy);
1128   }
1129 
1130   // We build scalar steps for both integer and floating-point induction
1131   // variables. Here, we determine the kind of arithmetic we will perform.
1132   Instruction::BinaryOps AddOp;
1133   Instruction::BinaryOps MulOp;
1134   if (BaseIVTy->isIntegerTy()) {
1135     AddOp = Instruction::Add;
1136     MulOp = Instruction::Mul;
1137   } else {
1138     AddOp = InductionOpcode;
1139     MulOp = Instruction::FMul;
1140   }
1141 
1142   // Determine the number of scalars we need to generate for each unroll
1143   // iteration.
1144   bool FirstLaneOnly = vputils::onlyFirstLaneUsed(this);
1145   // Compute the scalar steps and save the results in State.
1146   Type *IntStepTy =
1147       IntegerType::get(BaseIVTy->getContext(), BaseIVTy->getScalarSizeInBits());
1148   Type *VecIVTy = nullptr;
1149   Value *UnitStepVec = nullptr, *SplatStep = nullptr, *SplatIV = nullptr;
1150   if (!FirstLaneOnly && State.VF.isScalable()) {
1151     VecIVTy = VectorType::get(BaseIVTy, State.VF);
1152     UnitStepVec =
1153         Builder.CreateStepVector(VectorType::get(IntStepTy, State.VF));
1154     SplatStep = Builder.CreateVectorSplat(State.VF, Step);
1155     SplatIV = Builder.CreateVectorSplat(State.VF, BaseIV);
1156   }
1157 
1158   unsigned StartPart = 0;
1159   unsigned EndPart = State.UF;
1160   unsigned StartLane = 0;
1161   unsigned EndLane = FirstLaneOnly ? 1 : State.VF.getKnownMinValue();
1162   if (State.Instance) {
1163     StartPart = State.Instance->Part;
1164     EndPart = StartPart + 1;
1165     StartLane = State.Instance->Lane.getKnownLane();
1166     EndLane = StartLane + 1;
1167   }
1168   for (unsigned Part = StartPart; Part < EndPart; ++Part) {
1169     Value *StartIdx0 = createStepForVF(Builder, IntStepTy, State.VF, Part);
1170 
1171     if (!FirstLaneOnly && State.VF.isScalable()) {
1172       auto *SplatStartIdx = Builder.CreateVectorSplat(State.VF, StartIdx0);
1173       auto *InitVec = Builder.CreateAdd(SplatStartIdx, UnitStepVec);
1174       if (BaseIVTy->isFloatingPointTy())
1175         InitVec = Builder.CreateSIToFP(InitVec, VecIVTy);
1176       auto *Mul = Builder.CreateBinOp(MulOp, InitVec, SplatStep);
1177       auto *Add = Builder.CreateBinOp(AddOp, SplatIV, Mul);
1178       State.set(this, Add, Part);
1179       // It's useful to record the lane values too for the known minimum number
1180       // of elements so we do those below. This improves the code quality when
1181       // trying to extract the first element, for example.
1182     }
1183 
1184     if (BaseIVTy->isFloatingPointTy())
1185       StartIdx0 = Builder.CreateSIToFP(StartIdx0, BaseIVTy);
1186 
1187     for (unsigned Lane = StartLane; Lane < EndLane; ++Lane) {
1188       Value *StartIdx = Builder.CreateBinOp(
1189           AddOp, StartIdx0, getSignedIntOrFpConstant(BaseIVTy, Lane));
1190       // The step returned by `createStepForVF` is a runtime-evaluated value
1191       // when VF is scalable. Otherwise, it should be folded into a Constant.
1192       assert((State.VF.isScalable() || isa<Constant>(StartIdx)) &&
1193              "Expected StartIdx to be folded to a constant when VF is not "
1194              "scalable");
1195       auto *Mul = Builder.CreateBinOp(MulOp, StartIdx, Step);
1196       auto *Add = Builder.CreateBinOp(AddOp, BaseIV, Mul);
1197       State.set(this, Add, VPIteration(Part, Lane));
1198     }
1199   }
1200 }
1201 
1202 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1203 void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent,
1204                                   VPSlotTracker &SlotTracker) const {
1205   O << Indent;
1206   printAsOperand(O, SlotTracker);
1207   O << " = SCALAR-STEPS ";
1208   printOperands(O, SlotTracker);
1209 }
1210 #endif
1211 
1212 void VPWidenGEPRecipe::execute(VPTransformState &State) {
1213   assert(State.VF.isVector() && "not widening");
1214   auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr());
1215   // Construct a vector GEP by widening the operands of the scalar GEP as
1216   // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
1217   // results in a vector of pointers when at least one operand of the GEP
1218   // is vector-typed. Thus, to keep the representation compact, we only use
1219   // vector-typed operands for loop-varying values.
1220 
1221   if (areAllOperandsInvariant()) {
1222     // If we are vectorizing, but the GEP has only loop-invariant operands,
1223     // the GEP we build (by only using vector-typed operands for
1224     // loop-varying values) would be a scalar pointer. Thus, to ensure we
1225     // produce a vector of pointers, we need to either arbitrarily pick an
1226     // operand to broadcast, or broadcast a clone of the original GEP.
1227     // Here, we broadcast a clone of the original.
1228     //
1229     // TODO: If at some point we decide to scalarize instructions having
1230     //       loop-invariant operands, this special case will no longer be
1231     //       required. We would add the scalarization decision to
1232     //       collectLoopScalars() and teach getVectorValue() to broadcast
1233     //       the lane-zero scalar value.
1234     SmallVector<Value *> Ops;
1235     for (unsigned I = 0, E = getNumOperands(); I != E; I++)
1236       Ops.push_back(State.get(getOperand(I), VPIteration(0, 0)));
1237 
1238     auto *NewGEP =
1239         State.Builder.CreateGEP(GEP->getSourceElementType(), Ops[0],
1240                                 ArrayRef(Ops).drop_front(), "", isInBounds());
1241     for (unsigned Part = 0; Part < State.UF; ++Part) {
1242       Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, NewGEP);
1243       State.set(this, EntryPart, Part);
1244       State.addMetadata(EntryPart, GEP);
1245     }
1246   } else {
1247     // If the GEP has at least one loop-varying operand, we are sure to
1248     // produce a vector of pointers. But if we are only unrolling, we want
1249     // to produce a scalar GEP for each unroll part. Thus, the GEP we
1250     // produce with the code below will be scalar (if VF == 1) or vector
1251     // (otherwise). Note that for the unroll-only case, we still maintain
1252     // values in the vector mapping with initVector, as we do for other
1253     // instructions.
1254     for (unsigned Part = 0; Part < State.UF; ++Part) {
1255       // The pointer operand of the new GEP. If it's loop-invariant, we
1256       // won't broadcast it.
1257       auto *Ptr = isPointerLoopInvariant()
1258                       ? State.get(getOperand(0), VPIteration(0, 0))
1259                       : State.get(getOperand(0), Part);
1260 
1261       // Collect all the indices for the new GEP. If any index is
1262       // loop-invariant, we won't broadcast it.
1263       SmallVector<Value *, 4> Indices;
1264       for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
1265         VPValue *Operand = getOperand(I);
1266         if (isIndexLoopInvariant(I - 1))
1267           Indices.push_back(State.get(Operand, VPIteration(0, 0)));
1268         else
1269           Indices.push_back(State.get(Operand, Part));
1270       }
1271 
1272       // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
1273       // but it should be a vector, otherwise.
1274       auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr,
1275                                              Indices, "", isInBounds());
1276       assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
1277              "NewGEP is not a pointer vector");
1278       State.set(this, NewGEP, Part);
1279       State.addMetadata(NewGEP, GEP);
1280     }
1281   }
1282 }
1283 
1284 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1285 void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent,
1286                              VPSlotTracker &SlotTracker) const {
1287   O << Indent << "WIDEN-GEP ";
1288   O << (isPointerLoopInvariant() ? "Inv" : "Var");
1289   for (size_t I = 0; I < getNumOperands() - 1; ++I)
1290     O << "[" << (isIndexLoopInvariant(I) ? "Inv" : "Var") << "]";
1291 
1292   O << " ";
1293   printAsOperand(O, SlotTracker);
1294   O << " = getelementptr";
1295   printFlags(O);
1296   printOperands(O, SlotTracker);
1297 }
1298 #endif
1299 
1300 void VPVectorPointerRecipe ::execute(VPTransformState &State) {
1301   auto &Builder = State.Builder;
1302   State.setDebugLocFrom(getDebugLoc());
1303   for (unsigned Part = 0; Part < State.UF; ++Part) {
1304     // Calculate the pointer for the specific unroll-part.
1305     Value *PartPtr = nullptr;
1306     // Use i32 for the gep index type when the value is constant,
1307     // or query DataLayout for a more suitable index type otherwise.
1308     const DataLayout &DL =
1309         Builder.GetInsertBlock()->getModule()->getDataLayout();
1310     Type *IndexTy = State.VF.isScalable() && (IsReverse || Part > 0)
1311                         ? DL.getIndexType(IndexedTy->getPointerTo())
1312                         : Builder.getInt32Ty();
1313     Value *Ptr = State.get(getOperand(0), VPIteration(0, 0));
1314     bool InBounds = isInBounds();
1315     if (IsReverse) {
1316       // If the address is consecutive but reversed, then the
1317       // wide store needs to start at the last vector element.
1318       // RunTimeVF =  VScale * VF.getKnownMinValue()
1319       // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue()
1320       Value *RunTimeVF = getRuntimeVF(Builder, IndexTy, State.VF);
1321       // NumElt = -Part * RunTimeVF
1322       Value *NumElt = Builder.CreateMul(
1323           ConstantInt::get(IndexTy, -(int64_t)Part), RunTimeVF);
1324       // LastLane = 1 - RunTimeVF
1325       Value *LastLane =
1326           Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF);
1327       PartPtr = Builder.CreateGEP(IndexedTy, Ptr, NumElt, "", InBounds);
1328       PartPtr = Builder.CreateGEP(IndexedTy, PartPtr, LastLane, "", InBounds);
1329     } else {
1330       Value *Increment = createStepForVF(Builder, IndexTy, State.VF, Part);
1331       PartPtr = Builder.CreateGEP(IndexedTy, Ptr, Increment, "", InBounds);
1332     }
1333 
1334     State.set(this, PartPtr, Part);
1335   }
1336 }
1337 
1338 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1339 void VPVectorPointerRecipe::print(raw_ostream &O, const Twine &Indent,
1340                                   VPSlotTracker &SlotTracker) const {
1341   O << Indent;
1342   printAsOperand(O, SlotTracker);
1343   O << " = vector-pointer ";
1344   if (IsReverse)
1345     O << "(reverse) ";
1346 
1347   printOperands(O, SlotTracker);
1348 }
1349 #endif
1350 
1351 void VPBlendRecipe::execute(VPTransformState &State) {
1352   State.setDebugLocFrom(getDebugLoc());
1353   // We know that all PHIs in non-header blocks are converted into
1354   // selects, so we don't have to worry about the insertion order and we
1355   // can just use the builder.
1356   // At this point we generate the predication tree. There may be
1357   // duplications since this is a simple recursive scan, but future
1358   // optimizations will clean it up.
1359 
1360   unsigned NumIncoming = getNumIncomingValues();
1361 
1362   // Generate a sequence of selects of the form:
1363   // SELECT(Mask3, In3,
1364   //        SELECT(Mask2, In2,
1365   //               SELECT(Mask1, In1,
1366   //                      In0)))
1367   // Note that Mask0 is never used: lanes for which no path reaches this phi and
1368   // are essentially undef are taken from In0.
1369  VectorParts Entry(State.UF);
1370   for (unsigned In = 0; In < NumIncoming; ++In) {
1371     for (unsigned Part = 0; Part < State.UF; ++Part) {
1372       // We might have single edge PHIs (blocks) - use an identity
1373       // 'select' for the first PHI operand.
1374       Value *In0 = State.get(getIncomingValue(In), Part);
1375       if (In == 0)
1376         Entry[Part] = In0; // Initialize with the first incoming value.
1377       else {
1378         // Select between the current value and the previous incoming edge
1379         // based on the incoming mask.
1380         Value *Cond = State.get(getMask(In), Part);
1381         Entry[Part] =
1382             State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
1383       }
1384     }
1385   }
1386   for (unsigned Part = 0; Part < State.UF; ++Part)
1387     State.set(this, Entry[Part], Part);
1388 }
1389 
1390 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1391 void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent,
1392                           VPSlotTracker &SlotTracker) const {
1393   O << Indent << "BLEND ";
1394   printAsOperand(O, SlotTracker);
1395   O << " =";
1396   if (getNumIncomingValues() == 1) {
1397     // Not a User of any mask: not really blending, this is a
1398     // single-predecessor phi.
1399     O << " ";
1400     getIncomingValue(0)->printAsOperand(O, SlotTracker);
1401   } else {
1402     for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) {
1403       O << " ";
1404       getIncomingValue(I)->printAsOperand(O, SlotTracker);
1405       O << "/";
1406       getMask(I)->printAsOperand(O, SlotTracker);
1407     }
1408   }
1409 }
1410 
1411 void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent,
1412                               VPSlotTracker &SlotTracker) const {
1413   O << Indent << "REDUCE ";
1414   printAsOperand(O, SlotTracker);
1415   O << " = ";
1416   getChainOp()->printAsOperand(O, SlotTracker);
1417   O << " +";
1418   if (isa<FPMathOperator>(getUnderlyingInstr()))
1419     O << getUnderlyingInstr()->getFastMathFlags();
1420   O << " reduce." << Instruction::getOpcodeName(RdxDesc.getOpcode()) << " (";
1421   getVecOp()->printAsOperand(O, SlotTracker);
1422   if (getCondOp()) {
1423     O << ", ";
1424     getCondOp()->printAsOperand(O, SlotTracker);
1425   }
1426   O << ")";
1427   if (RdxDesc.IntermediateStore)
1428     O << " (with final reduction value stored in invariant address sank "
1429          "outside of loop)";
1430 }
1431 #endif
1432 
1433 bool VPReplicateRecipe::shouldPack() const {
1434   // Find if the recipe is used by a widened recipe via an intervening
1435   // VPPredInstPHIRecipe. In this case, also pack the scalar values in a vector.
1436   return any_of(users(), [](const VPUser *U) {
1437     if (auto *PredR = dyn_cast<VPPredInstPHIRecipe>(U))
1438       return any_of(PredR->users(), [PredR](const VPUser *U) {
1439         return !U->usesScalars(PredR);
1440       });
1441     return false;
1442   });
1443 }
1444 
1445 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1446 void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent,
1447                               VPSlotTracker &SlotTracker) const {
1448   O << Indent << (IsUniform ? "CLONE " : "REPLICATE ");
1449 
1450   if (!getUnderlyingInstr()->getType()->isVoidTy()) {
1451     printAsOperand(O, SlotTracker);
1452     O << " = ";
1453   }
1454   if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) {
1455     O << "call";
1456     printFlags(O);
1457     O << "@" << CB->getCalledFunction()->getName() << "(";
1458     interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)),
1459                     O, [&O, &SlotTracker](VPValue *Op) {
1460                       Op->printAsOperand(O, SlotTracker);
1461                     });
1462     O << ")";
1463   } else {
1464     O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode());
1465     printFlags(O);
1466     printOperands(O, SlotTracker);
1467   }
1468 
1469   if (shouldPack())
1470     O << " (S->V)";
1471 }
1472 #endif
1473 
1474 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
1475   assert(State.Instance && "Branch on Mask works only on single instance.");
1476 
1477   unsigned Part = State.Instance->Part;
1478   unsigned Lane = State.Instance->Lane.getKnownLane();
1479 
1480   Value *ConditionBit = nullptr;
1481   VPValue *BlockInMask = getMask();
1482   if (BlockInMask) {
1483     ConditionBit = State.get(BlockInMask, Part);
1484     if (ConditionBit->getType()->isVectorTy())
1485       ConditionBit = State.Builder.CreateExtractElement(
1486           ConditionBit, State.Builder.getInt32(Lane));
1487   } else // Block in mask is all-one.
1488     ConditionBit = State.Builder.getTrue();
1489 
1490   // Replace the temporary unreachable terminator with a new conditional branch,
1491   // whose two destinations will be set later when they are created.
1492   auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
1493   assert(isa<UnreachableInst>(CurrentTerminator) &&
1494          "Expected to replace unreachable terminator with conditional branch.");
1495   auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
1496   CondBr->setSuccessor(0, nullptr);
1497   ReplaceInstWithInst(CurrentTerminator, CondBr);
1498 }
1499 
1500 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
1501   assert(State.Instance && "Predicated instruction PHI works per instance.");
1502   Instruction *ScalarPredInst =
1503       cast<Instruction>(State.get(getOperand(0), *State.Instance));
1504   BasicBlock *PredicatedBB = ScalarPredInst->getParent();
1505   BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
1506   assert(PredicatingBB && "Predicated block has no single predecessor.");
1507   assert(isa<VPReplicateRecipe>(getOperand(0)) &&
1508          "operand must be VPReplicateRecipe");
1509 
1510   // By current pack/unpack logic we need to generate only a single phi node: if
1511   // a vector value for the predicated instruction exists at this point it means
1512   // the instruction has vector users only, and a phi for the vector value is
1513   // needed. In this case the recipe of the predicated instruction is marked to
1514   // also do that packing, thereby "hoisting" the insert-element sequence.
1515   // Otherwise, a phi node for the scalar value is needed.
1516   unsigned Part = State.Instance->Part;
1517   if (State.hasVectorValue(getOperand(0), Part)) {
1518     Value *VectorValue = State.get(getOperand(0), Part);
1519     InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
1520     PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
1521     VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
1522     VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
1523     if (State.hasVectorValue(this, Part))
1524       State.reset(this, VPhi, Part);
1525     else
1526       State.set(this, VPhi, Part);
1527     // NOTE: Currently we need to update the value of the operand, so the next
1528     // predicated iteration inserts its generated value in the correct vector.
1529     State.reset(getOperand(0), VPhi, Part);
1530   } else {
1531     Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType();
1532     PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
1533     Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
1534                      PredicatingBB);
1535     Phi->addIncoming(ScalarPredInst, PredicatedBB);
1536     if (State.hasScalarValue(this, *State.Instance))
1537       State.reset(this, Phi, *State.Instance);
1538     else
1539       State.set(this, Phi, *State.Instance);
1540     // NOTE: Currently we need to update the value of the operand, so the next
1541     // predicated iteration inserts its generated value in the correct vector.
1542     State.reset(getOperand(0), Phi, *State.Instance);
1543   }
1544 }
1545 
1546 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1547 void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent,
1548                                 VPSlotTracker &SlotTracker) const {
1549   O << Indent << "PHI-PREDICATED-INSTRUCTION ";
1550   printAsOperand(O, SlotTracker);
1551   O << " = ";
1552   printOperands(O, SlotTracker);
1553 }
1554 
1555 void VPWidenMemoryInstructionRecipe::print(raw_ostream &O, const Twine &Indent,
1556                                            VPSlotTracker &SlotTracker) const {
1557   O << Indent << "WIDEN ";
1558 
1559   if (!isStore()) {
1560     getVPSingleValue()->printAsOperand(O, SlotTracker);
1561     O << " = ";
1562   }
1563   O << Instruction::getOpcodeName(Ingredient.getOpcode()) << " ";
1564 
1565   printOperands(O, SlotTracker);
1566 }
1567 #endif
1568 
1569 void VPCanonicalIVPHIRecipe::execute(VPTransformState &State) {
1570   Value *Start = getStartValue()->getLiveInIRValue();
1571   PHINode *EntryPart = PHINode::Create(Start->getType(), 2, "index");
1572   EntryPart->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
1573 
1574   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
1575   EntryPart->addIncoming(Start, VectorPH);
1576   EntryPart->setDebugLoc(getDebugLoc());
1577   for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
1578     State.set(this, EntryPart, Part);
1579 }
1580 
1581 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1582 void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
1583                                    VPSlotTracker &SlotTracker) const {
1584   O << Indent << "EMIT ";
1585   printAsOperand(O, SlotTracker);
1586   O << " = CANONICAL-INDUCTION ";
1587   printOperands(O, SlotTracker);
1588 }
1589 #endif
1590 
1591 bool VPCanonicalIVPHIRecipe::isCanonical(
1592     InductionDescriptor::InductionKind Kind, VPValue *Start, VPValue *Step,
1593     Type *Ty) const {
1594   // The types must match and it must be an integer induction.
1595   if (Ty != getScalarType() || Kind != InductionDescriptor::IK_IntInduction)
1596     return false;
1597   // Start must match the start value of this canonical induction.
1598   if (Start != getStartValue())
1599     return false;
1600 
1601   // If the step is defined by a recipe, it is not a ConstantInt.
1602   if (Step->getDefiningRecipe())
1603     return false;
1604 
1605   ConstantInt *StepC = dyn_cast<ConstantInt>(Step->getLiveInIRValue());
1606   return StepC && StepC->isOne();
1607 }
1608 
1609 bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(ElementCount VF) {
1610   return IsScalarAfterVectorization &&
1611          (!VF.isScalable() || vputils::onlyFirstLaneUsed(this));
1612 }
1613 
1614 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1615 void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent,
1616                                           VPSlotTracker &SlotTracker) const {
1617   O << Indent << "EMIT ";
1618   printAsOperand(O, SlotTracker);
1619   O << " = WIDEN-POINTER-INDUCTION ";
1620   getStartValue()->printAsOperand(O, SlotTracker);
1621   O << ", " << *IndDesc.getStep();
1622 }
1623 #endif
1624 
1625 void VPExpandSCEVRecipe::execute(VPTransformState &State) {
1626   assert(!State.Instance && "cannot be used in per-lane");
1627   const DataLayout &DL = State.CFG.PrevBB->getModule()->getDataLayout();
1628   SCEVExpander Exp(SE, DL, "induction");
1629 
1630   Value *Res = Exp.expandCodeFor(Expr, Expr->getType(),
1631                                  &*State.Builder.GetInsertPoint());
1632   assert(!State.ExpandedSCEVs.contains(Expr) &&
1633          "Same SCEV expanded multiple times");
1634   State.ExpandedSCEVs[Expr] = Res;
1635   for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
1636     State.set(this, Res, {Part, 0});
1637 }
1638 
1639 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1640 void VPExpandSCEVRecipe::print(raw_ostream &O, const Twine &Indent,
1641                                VPSlotTracker &SlotTracker) const {
1642   O << Indent << "EMIT ";
1643   getVPSingleValue()->printAsOperand(O, SlotTracker);
1644   O << " = EXPAND SCEV " << *Expr;
1645 }
1646 #endif
1647 
1648 void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
1649   Value *CanonicalIV = State.get(getOperand(0), 0);
1650   Type *STy = CanonicalIV->getType();
1651   IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
1652   ElementCount VF = State.VF;
1653   Value *VStart = VF.isScalar()
1654                       ? CanonicalIV
1655                       : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast");
1656   for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
1657     Value *VStep = createStepForVF(Builder, STy, VF, Part);
1658     if (VF.isVector()) {
1659       VStep = Builder.CreateVectorSplat(VF, VStep);
1660       VStep =
1661           Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType()));
1662     }
1663     Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv");
1664     State.set(this, CanonicalVectorIV, Part);
1665   }
1666 }
1667 
1668 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1669 void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent,
1670                                      VPSlotTracker &SlotTracker) const {
1671   O << Indent << "EMIT ";
1672   printAsOperand(O, SlotTracker);
1673   O << " = WIDEN-CANONICAL-INDUCTION ";
1674   printOperands(O, SlotTracker);
1675 }
1676 #endif
1677 
1678 void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) {
1679   auto &Builder = State.Builder;
1680   // Create a vector from the initial value.
1681   auto *VectorInit = getStartValue()->getLiveInIRValue();
1682 
1683   Type *VecTy = State.VF.isScalar()
1684                     ? VectorInit->getType()
1685                     : VectorType::get(VectorInit->getType(), State.VF);
1686 
1687   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
1688   if (State.VF.isVector()) {
1689     auto *IdxTy = Builder.getInt32Ty();
1690     auto *One = ConstantInt::get(IdxTy, 1);
1691     IRBuilder<>::InsertPointGuard Guard(Builder);
1692     Builder.SetInsertPoint(VectorPH->getTerminator());
1693     auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF);
1694     auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
1695     VectorInit = Builder.CreateInsertElement(
1696         PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init");
1697   }
1698 
1699   // Create a phi node for the new recurrence.
1700   PHINode *EntryPart = PHINode::Create(VecTy, 2, "vector.recur");
1701   EntryPart->insertBefore(State.CFG.PrevBB->getFirstInsertionPt());
1702   EntryPart->addIncoming(VectorInit, VectorPH);
1703   State.set(this, EntryPart, 0);
1704 }
1705 
1706 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1707 void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent,
1708                                             VPSlotTracker &SlotTracker) const {
1709   O << Indent << "FIRST-ORDER-RECURRENCE-PHI ";
1710   printAsOperand(O, SlotTracker);
1711   O << " = phi ";
1712   printOperands(O, SlotTracker);
1713 }
1714 #endif
1715 
1716 void VPReductionPHIRecipe::execute(VPTransformState &State) {
1717   auto &Builder = State.Builder;
1718 
1719   // Reductions do not have to start at zero. They can start with
1720   // any loop invariant values.
1721   VPValue *StartVPV = getStartValue();
1722   Value *StartV = StartVPV->getLiveInIRValue();
1723 
1724   // In order to support recurrences we need to be able to vectorize Phi nodes.
1725   // Phi nodes have cycles, so we need to vectorize them in two stages. This is
1726   // stage #1: We create a new vector PHI node with no incoming edges. We'll use
1727   // this value when we vectorize all of the instructions that use the PHI.
1728   bool ScalarPHI = State.VF.isScalar() || IsInLoop;
1729   Type *VecTy = ScalarPHI ? StartV->getType()
1730                           : VectorType::get(StartV->getType(), State.VF);
1731 
1732   BasicBlock *HeaderBB = State.CFG.PrevBB;
1733   assert(State.CurrentVectorLoop->getHeader() == HeaderBB &&
1734          "recipe must be in the vector loop header");
1735   unsigned LastPartForNewPhi = isOrdered() ? 1 : State.UF;
1736   for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
1737     Instruction *EntryPart = PHINode::Create(VecTy, 2, "vec.phi");
1738     EntryPart->insertBefore(HeaderBB->getFirstInsertionPt());
1739     State.set(this, EntryPart, Part);
1740   }
1741 
1742   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
1743 
1744   Value *Iden = nullptr;
1745   RecurKind RK = RdxDesc.getRecurrenceKind();
1746   if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) ||
1747       RecurrenceDescriptor::isAnyOfRecurrenceKind(RK)) {
1748     // MinMax and AnyOf reductions have the start value as their identity.
1749     if (ScalarPHI) {
1750       Iden = StartV;
1751     } else {
1752       IRBuilderBase::InsertPointGuard IPBuilder(Builder);
1753       Builder.SetInsertPoint(VectorPH->getTerminator());
1754       StartV = Iden =
1755           Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident");
1756     }
1757   } else {
1758     Iden = RdxDesc.getRecurrenceIdentity(RK, VecTy->getScalarType(),
1759                                          RdxDesc.getFastMathFlags());
1760 
1761     if (!ScalarPHI) {
1762       Iden = Builder.CreateVectorSplat(State.VF, Iden);
1763       IRBuilderBase::InsertPointGuard IPBuilder(Builder);
1764       Builder.SetInsertPoint(VectorPH->getTerminator());
1765       Constant *Zero = Builder.getInt32(0);
1766       StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
1767     }
1768   }
1769 
1770   for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
1771     Value *EntryPart = State.get(this, Part);
1772     // Make sure to add the reduction start value only to the
1773     // first unroll part.
1774     Value *StartVal = (Part == 0) ? StartV : Iden;
1775     cast<PHINode>(EntryPart)->addIncoming(StartVal, VectorPH);
1776   }
1777 }
1778 
1779 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1780 void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent,
1781                                  VPSlotTracker &SlotTracker) const {
1782   O << Indent << "WIDEN-REDUCTION-PHI ";
1783 
1784   printAsOperand(O, SlotTracker);
1785   O << " = phi ";
1786   printOperands(O, SlotTracker);
1787 }
1788 #endif
1789 
1790 void VPWidenPHIRecipe::execute(VPTransformState &State) {
1791   assert(EnableVPlanNativePath &&
1792          "Non-native vplans are not expected to have VPWidenPHIRecipes.");
1793 
1794   Value *Op0 = State.get(getOperand(0), 0);
1795   Type *VecTy = Op0->getType();
1796   Value *VecPhi = State.Builder.CreatePHI(VecTy, 2, "vec.phi");
1797   State.set(this, VecPhi, 0);
1798 }
1799 
1800 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1801 void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent,
1802                              VPSlotTracker &SlotTracker) const {
1803   O << Indent << "WIDEN-PHI ";
1804 
1805   auto *OriginalPhi = cast<PHINode>(getUnderlyingValue());
1806   // Unless all incoming values are modeled in VPlan  print the original PHI
1807   // directly.
1808   // TODO: Remove once all VPWidenPHIRecipe instances keep all relevant incoming
1809   // values as VPValues.
1810   if (getNumOperands() != OriginalPhi->getNumOperands()) {
1811     O << VPlanIngredient(OriginalPhi);
1812     return;
1813   }
1814 
1815   printAsOperand(O, SlotTracker);
1816   O << " = phi ";
1817   printOperands(O, SlotTracker);
1818 }
1819 #endif
1820 
1821 // TODO: It would be good to use the existing VPWidenPHIRecipe instead and
1822 // remove VPActiveLaneMaskPHIRecipe.
1823 void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) {
1824   BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
1825   for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
1826     Value *StartMask = State.get(getOperand(0), Part);
1827     PHINode *EntryPart =
1828         State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask");
1829     EntryPart->addIncoming(StartMask, VectorPH);
1830     EntryPart->setDebugLoc(getDebugLoc());
1831     State.set(this, EntryPart, Part);
1832   }
1833 }
1834 
1835 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1836 void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent,
1837                                       VPSlotTracker &SlotTracker) const {
1838   O << Indent << "ACTIVE-LANE-MASK-PHI ";
1839 
1840   printAsOperand(O, SlotTracker);
1841   O << " = phi ";
1842   printOperands(O, SlotTracker);
1843 }
1844 #endif
1845