1 //===- VPlanRecipes.cpp - Implementations for VPlan recipes ---------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 ///
9 /// \file
10 /// This file contains implementations for different VPlan recipes.
11 ///
12 //===----------------------------------------------------------------------===//
13
14 #include "VPlan.h"
15 #include "llvm/ADT/STLExtras.h"
16 #include "llvm/ADT/SmallVector.h"
17 #include "llvm/ADT/Twine.h"
18 #include "llvm/Analysis/IVDescriptors.h"
19 #include "llvm/IR/BasicBlock.h"
20 #include "llvm/IR/IRBuilder.h"
21 #include "llvm/IR/Instruction.h"
22 #include "llvm/IR/Instructions.h"
23 #include "llvm/IR/Type.h"
24 #include "llvm/IR/Value.h"
25 #include "llvm/Support/Casting.h"
26 #include "llvm/Support/CommandLine.h"
27 #include "llvm/Support/Debug.h"
28 #include "llvm/Support/raw_ostream.h"
29 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
30 #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
31 #include <cassert>
32
33 using namespace llvm;
34
35 using VectorParts = SmallVector<Value *, 2>;
36
37 extern cl::opt<bool> EnableVPlanNativePath;
38
39 #define LV_NAME "loop-vectorize"
40 #define DEBUG_TYPE LV_NAME
41
mayWriteToMemory() const42 bool VPRecipeBase::mayWriteToMemory() const {
43 switch (getVPDefID()) {
44 case VPWidenMemoryInstructionSC: {
45 return cast<VPWidenMemoryInstructionRecipe>(this)->isStore();
46 }
47 case VPReplicateSC:
48 case VPWidenCallSC:
49 return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
50 ->mayWriteToMemory();
51 case VPBranchOnMaskSC:
52 case VPScalarIVStepsSC:
53 return false;
54 case VPWidenIntOrFpInductionSC:
55 case VPWidenCanonicalIVSC:
56 case VPWidenPHISC:
57 case VPBlendSC:
58 case VPWidenSC:
59 case VPWidenGEPSC:
60 case VPReductionSC:
61 case VPWidenSelectSC: {
62 const Instruction *I =
63 dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
64 (void)I;
65 assert((!I || !I->mayWriteToMemory()) &&
66 "underlying instruction may write to memory");
67 return false;
68 }
69 default:
70 return true;
71 }
72 }
73
mayReadFromMemory() const74 bool VPRecipeBase::mayReadFromMemory() const {
75 switch (getVPDefID()) {
76 case VPWidenMemoryInstructionSC: {
77 return !cast<VPWidenMemoryInstructionRecipe>(this)->isStore();
78 }
79 case VPReplicateSC:
80 case VPWidenCallSC:
81 return cast<Instruction>(getVPSingleValue()->getUnderlyingValue())
82 ->mayReadFromMemory();
83 case VPBranchOnMaskSC:
84 case VPScalarIVStepsSC:
85 return false;
86 case VPWidenIntOrFpInductionSC:
87 case VPWidenCanonicalIVSC:
88 case VPWidenPHISC:
89 case VPBlendSC:
90 case VPWidenSC:
91 case VPWidenGEPSC:
92 case VPReductionSC:
93 case VPWidenSelectSC: {
94 const Instruction *I =
95 dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
96 (void)I;
97 assert((!I || !I->mayReadFromMemory()) &&
98 "underlying instruction may read from memory");
99 return false;
100 }
101 default:
102 return true;
103 }
104 }
105
mayHaveSideEffects() const106 bool VPRecipeBase::mayHaveSideEffects() const {
107 switch (getVPDefID()) {
108 case VPDerivedIVSC:
109 case VPPredInstPHISC:
110 return false;
111 case VPWidenIntOrFpInductionSC:
112 case VPWidenPointerInductionSC:
113 case VPWidenCanonicalIVSC:
114 case VPWidenPHISC:
115 case VPBlendSC:
116 case VPWidenSC:
117 case VPWidenGEPSC:
118 case VPReductionSC:
119 case VPWidenSelectSC:
120 case VPScalarIVStepsSC: {
121 const Instruction *I =
122 dyn_cast_or_null<Instruction>(getVPSingleValue()->getUnderlyingValue());
123 (void)I;
124 assert((!I || !I->mayHaveSideEffects()) &&
125 "underlying instruction has side-effects");
126 return false;
127 }
128 case VPReplicateSC: {
129 auto *R = cast<VPReplicateRecipe>(this);
130 return R->getUnderlyingInstr()->mayHaveSideEffects();
131 }
132 default:
133 return true;
134 }
135 }
136
fixPhi(VPlan & Plan,VPTransformState & State)137 void VPLiveOut::fixPhi(VPlan &Plan, VPTransformState &State) {
138 auto Lane = VPLane::getLastLaneForVF(State.VF);
139 VPValue *ExitValue = getOperand(0);
140 if (vputils::isUniformAfterVectorization(ExitValue))
141 Lane = VPLane::getFirstLane();
142 Phi->addIncoming(State.get(ExitValue, VPIteration(State.UF - 1, Lane)),
143 State.Builder.GetInsertBlock());
144 }
145
insertBefore(VPRecipeBase * InsertPos)146 void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
147 assert(!Parent && "Recipe already in some VPBasicBlock");
148 assert(InsertPos->getParent() &&
149 "Insertion position not in any VPBasicBlock");
150 Parent = InsertPos->getParent();
151 Parent->getRecipeList().insert(InsertPos->getIterator(), this);
152 }
153
insertBefore(VPBasicBlock & BB,iplist<VPRecipeBase>::iterator I)154 void VPRecipeBase::insertBefore(VPBasicBlock &BB,
155 iplist<VPRecipeBase>::iterator I) {
156 assert(!Parent && "Recipe already in some VPBasicBlock");
157 assert(I == BB.end() || I->getParent() == &BB);
158 Parent = &BB;
159 BB.getRecipeList().insert(I, this);
160 }
161
insertAfter(VPRecipeBase * InsertPos)162 void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) {
163 assert(!Parent && "Recipe already in some VPBasicBlock");
164 assert(InsertPos->getParent() &&
165 "Insertion position not in any VPBasicBlock");
166 Parent = InsertPos->getParent();
167 Parent->getRecipeList().insertAfter(InsertPos->getIterator(), this);
168 }
169
removeFromParent()170 void VPRecipeBase::removeFromParent() {
171 assert(getParent() && "Recipe not in any VPBasicBlock");
172 getParent()->getRecipeList().remove(getIterator());
173 Parent = nullptr;
174 }
175
eraseFromParent()176 iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() {
177 assert(getParent() && "Recipe not in any VPBasicBlock");
178 return getParent()->getRecipeList().erase(getIterator());
179 }
180
moveAfter(VPRecipeBase * InsertPos)181 void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) {
182 removeFromParent();
183 insertAfter(InsertPos);
184 }
185
moveBefore(VPBasicBlock & BB,iplist<VPRecipeBase>::iterator I)186 void VPRecipeBase::moveBefore(VPBasicBlock &BB,
187 iplist<VPRecipeBase>::iterator I) {
188 removeFromParent();
189 insertBefore(BB, I);
190 }
191
generateInstruction(VPTransformState & State,unsigned Part)192 void VPInstruction::generateInstruction(VPTransformState &State,
193 unsigned Part) {
194 IRBuilderBase &Builder = State.Builder;
195 Builder.SetCurrentDebugLocation(DL);
196
197 if (Instruction::isBinaryOp(getOpcode())) {
198 Value *A = State.get(getOperand(0), Part);
199 Value *B = State.get(getOperand(1), Part);
200 Value *V =
201 Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B, Name);
202 State.set(this, V, Part);
203 return;
204 }
205
206 switch (getOpcode()) {
207 case VPInstruction::Not: {
208 Value *A = State.get(getOperand(0), Part);
209 Value *V = Builder.CreateNot(A, Name);
210 State.set(this, V, Part);
211 break;
212 }
213 case VPInstruction::ICmpULE: {
214 Value *IV = State.get(getOperand(0), Part);
215 Value *TC = State.get(getOperand(1), Part);
216 Value *V = Builder.CreateICmpULE(IV, TC, Name);
217 State.set(this, V, Part);
218 break;
219 }
220 case Instruction::Select: {
221 Value *Cond = State.get(getOperand(0), Part);
222 Value *Op1 = State.get(getOperand(1), Part);
223 Value *Op2 = State.get(getOperand(2), Part);
224 Value *V = Builder.CreateSelect(Cond, Op1, Op2, Name);
225 State.set(this, V, Part);
226 break;
227 }
228 case VPInstruction::ActiveLaneMask: {
229 // Get first lane of vector induction variable.
230 Value *VIVElem0 = State.get(getOperand(0), VPIteration(Part, 0));
231 // Get the original loop tripcount.
232 Value *ScalarTC = State.get(getOperand(1), Part);
233
234 auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
235 auto *PredTy = VectorType::get(Int1Ty, State.VF);
236 Instruction *Call = Builder.CreateIntrinsic(
237 Intrinsic::get_active_lane_mask, {PredTy, ScalarTC->getType()},
238 {VIVElem0, ScalarTC}, nullptr, Name);
239 State.set(this, Call, Part);
240 break;
241 }
242 case VPInstruction::FirstOrderRecurrenceSplice: {
243 // Generate code to combine the previous and current values in vector v3.
244 //
245 // vector.ph:
246 // v_init = vector(..., ..., ..., a[-1])
247 // br vector.body
248 //
249 // vector.body
250 // i = phi [0, vector.ph], [i+4, vector.body]
251 // v1 = phi [v_init, vector.ph], [v2, vector.body]
252 // v2 = a[i, i+1, i+2, i+3];
253 // v3 = vector(v1(3), v2(0, 1, 2))
254
255 // For the first part, use the recurrence phi (v1), otherwise v2.
256 auto *V1 = State.get(getOperand(0), 0);
257 Value *PartMinus1 = Part == 0 ? V1 : State.get(getOperand(1), Part - 1);
258 if (!PartMinus1->getType()->isVectorTy()) {
259 State.set(this, PartMinus1, Part);
260 } else {
261 Value *V2 = State.get(getOperand(1), Part);
262 State.set(this, Builder.CreateVectorSplice(PartMinus1, V2, -1, Name),
263 Part);
264 }
265 break;
266 }
267 case VPInstruction::CanonicalIVIncrement:
268 case VPInstruction::CanonicalIVIncrementNUW: {
269 Value *Next = nullptr;
270 if (Part == 0) {
271 bool IsNUW = getOpcode() == VPInstruction::CanonicalIVIncrementNUW;
272 auto *Phi = State.get(getOperand(0), 0);
273 // The loop step is equal to the vectorization factor (num of SIMD
274 // elements) times the unroll factor (num of SIMD instructions).
275 Value *Step =
276 createStepForVF(Builder, Phi->getType(), State.VF, State.UF);
277 Next = Builder.CreateAdd(Phi, Step, Name, IsNUW, false);
278 } else {
279 Next = State.get(this, 0);
280 }
281
282 State.set(this, Next, Part);
283 break;
284 }
285
286 case VPInstruction::CanonicalIVIncrementForPart:
287 case VPInstruction::CanonicalIVIncrementForPartNUW: {
288 bool IsNUW = getOpcode() == VPInstruction::CanonicalIVIncrementForPartNUW;
289 auto *IV = State.get(getOperand(0), VPIteration(0, 0));
290 if (Part == 0) {
291 State.set(this, IV, Part);
292 break;
293 }
294
295 // The canonical IV is incremented by the vectorization factor (num of SIMD
296 // elements) times the unroll part.
297 Value *Step = createStepForVF(Builder, IV->getType(), State.VF, Part);
298 Value *Next = Builder.CreateAdd(IV, Step, Name, IsNUW, false);
299 State.set(this, Next, Part);
300 break;
301 }
302 case VPInstruction::BranchOnCond: {
303 if (Part != 0)
304 break;
305
306 Value *Cond = State.get(getOperand(0), VPIteration(Part, 0));
307 VPRegionBlock *ParentRegion = getParent()->getParent();
308 VPBasicBlock *Header = ParentRegion->getEntryBasicBlock();
309
310 // Replace the temporary unreachable terminator with a new conditional
311 // branch, hooking it up to backward destination for exiting blocks now and
312 // to forward destination(s) later when they are created.
313 BranchInst *CondBr =
314 Builder.CreateCondBr(Cond, Builder.GetInsertBlock(), nullptr);
315
316 if (getParent()->isExiting())
317 CondBr->setSuccessor(1, State.CFG.VPBB2IRBB[Header]);
318
319 CondBr->setSuccessor(0, nullptr);
320 Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
321 break;
322 }
323 case VPInstruction::BranchOnCount: {
324 if (Part != 0)
325 break;
326 // First create the compare.
327 Value *IV = State.get(getOperand(0), Part);
328 Value *TC = State.get(getOperand(1), Part);
329 Value *Cond = Builder.CreateICmpEQ(IV, TC);
330
331 // Now create the branch.
332 auto *Plan = getParent()->getPlan();
333 VPRegionBlock *TopRegion = Plan->getVectorLoopRegion();
334 VPBasicBlock *Header = TopRegion->getEntry()->getEntryBasicBlock();
335
336 // Replace the temporary unreachable terminator with a new conditional
337 // branch, hooking it up to backward destination (the header) now and to the
338 // forward destination (the exit/middle block) later when it is created.
339 // Note that CreateCondBr expects a valid BB as first argument, so we need
340 // to set it to nullptr later.
341 BranchInst *CondBr = Builder.CreateCondBr(Cond, Builder.GetInsertBlock(),
342 State.CFG.VPBB2IRBB[Header]);
343 CondBr->setSuccessor(0, nullptr);
344 Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
345 break;
346 }
347 default:
348 llvm_unreachable("Unsupported opcode for instruction");
349 }
350 }
351
execute(VPTransformState & State)352 void VPInstruction::execute(VPTransformState &State) {
353 assert(!State.Instance && "VPInstruction executing an Instance");
354 IRBuilderBase::FastMathFlagGuard FMFGuard(State.Builder);
355 State.Builder.setFastMathFlags(FMF);
356 for (unsigned Part = 0; Part < State.UF; ++Part)
357 generateInstruction(State, Part);
358 }
359
360 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
dump() const361 void VPInstruction::dump() const {
362 VPSlotTracker SlotTracker(getParent()->getPlan());
363 print(dbgs(), "", SlotTracker);
364 }
365
print(raw_ostream & O,const Twine & Indent,VPSlotTracker & SlotTracker) const366 void VPInstruction::print(raw_ostream &O, const Twine &Indent,
367 VPSlotTracker &SlotTracker) const {
368 O << Indent << "EMIT ";
369
370 if (hasResult()) {
371 printAsOperand(O, SlotTracker);
372 O << " = ";
373 }
374
375 switch (getOpcode()) {
376 case VPInstruction::Not:
377 O << "not";
378 break;
379 case VPInstruction::ICmpULE:
380 O << "icmp ule";
381 break;
382 case VPInstruction::SLPLoad:
383 O << "combined load";
384 break;
385 case VPInstruction::SLPStore:
386 O << "combined store";
387 break;
388 case VPInstruction::ActiveLaneMask:
389 O << "active lane mask";
390 break;
391 case VPInstruction::FirstOrderRecurrenceSplice:
392 O << "first-order splice";
393 break;
394 case VPInstruction::CanonicalIVIncrement:
395 O << "VF * UF + ";
396 break;
397 case VPInstruction::CanonicalIVIncrementNUW:
398 O << "VF * UF +(nuw) ";
399 break;
400 case VPInstruction::BranchOnCond:
401 O << "branch-on-cond";
402 break;
403 case VPInstruction::CanonicalIVIncrementForPart:
404 O << "VF * Part + ";
405 break;
406 case VPInstruction::CanonicalIVIncrementForPartNUW:
407 O << "VF * Part +(nuw) ";
408 break;
409 case VPInstruction::BranchOnCount:
410 O << "branch-on-count ";
411 break;
412 default:
413 O << Instruction::getOpcodeName(getOpcode());
414 }
415
416 O << FMF;
417
418 for (const VPValue *Operand : operands()) {
419 O << " ";
420 Operand->printAsOperand(O, SlotTracker);
421 }
422
423 if (DL) {
424 O << ", !dbg ";
425 DL.print(O);
426 }
427 }
428 #endif
429
setFastMathFlags(FastMathFlags FMFNew)430 void VPInstruction::setFastMathFlags(FastMathFlags FMFNew) {
431 // Make sure the VPInstruction is a floating-point operation.
432 assert((Opcode == Instruction::FAdd || Opcode == Instruction::FMul ||
433 Opcode == Instruction::FNeg || Opcode == Instruction::FSub ||
434 Opcode == Instruction::FDiv || Opcode == Instruction::FRem ||
435 Opcode == Instruction::FCmp) &&
436 "this op can't take fast-math flags");
437 FMF = FMFNew;
438 }
439
execute(VPTransformState & State)440 void VPWidenCallRecipe::execute(VPTransformState &State) {
441 auto &CI = *cast<CallInst>(getUnderlyingInstr());
442 assert(!isa<DbgInfoIntrinsic>(CI) &&
443 "DbgInfoIntrinsic should have been dropped during VPlan construction");
444 State.setDebugLocFromInst(&CI);
445
446 SmallVector<Type *, 4> Tys;
447 for (Value *ArgOperand : CI.args())
448 Tys.push_back(
449 ToVectorTy(ArgOperand->getType(), State.VF.getKnownMinValue()));
450
451 for (unsigned Part = 0; Part < State.UF; ++Part) {
452 SmallVector<Type *, 2> TysForDecl = {CI.getType()};
453 SmallVector<Value *, 4> Args;
454 for (const auto &I : enumerate(operands())) {
455 // Some intrinsics have a scalar argument - don't replace it with a
456 // vector.
457 Value *Arg;
458 if (VectorIntrinsicID == Intrinsic::not_intrinsic ||
459 !isVectorIntrinsicWithScalarOpAtArg(VectorIntrinsicID, I.index()))
460 Arg = State.get(I.value(), Part);
461 else
462 Arg = State.get(I.value(), VPIteration(0, 0));
463 if (isVectorIntrinsicWithOverloadTypeAtArg(VectorIntrinsicID, I.index()))
464 TysForDecl.push_back(Arg->getType());
465 Args.push_back(Arg);
466 }
467
468 Function *VectorF;
469 if (VectorIntrinsicID != Intrinsic::not_intrinsic) {
470 // Use vector version of the intrinsic.
471 if (State.VF.isVector())
472 TysForDecl[0] =
473 VectorType::get(CI.getType()->getScalarType(), State.VF);
474 Module *M = State.Builder.GetInsertBlock()->getModule();
475 VectorF = Intrinsic::getDeclaration(M, VectorIntrinsicID, TysForDecl);
476 assert(VectorF && "Can't retrieve vector intrinsic.");
477 } else {
478 // Use vector version of the function call.
479 const VFShape Shape = VFShape::get(CI, State.VF, false /*HasGlobalPred*/);
480 #ifndef NDEBUG
481 assert(VFDatabase(CI).getVectorizedFunction(Shape) != nullptr &&
482 "Can't create vector function.");
483 #endif
484 VectorF = VFDatabase(CI).getVectorizedFunction(Shape);
485 }
486 SmallVector<OperandBundleDef, 1> OpBundles;
487 CI.getOperandBundlesAsDefs(OpBundles);
488 CallInst *V = State.Builder.CreateCall(VectorF, Args, OpBundles);
489
490 if (isa<FPMathOperator>(V))
491 V->copyFastMathFlags(&CI);
492
493 State.set(this, V, Part);
494 State.addMetadata(V, &CI);
495 }
496 }
497
498 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
print(raw_ostream & O,const Twine & Indent,VPSlotTracker & SlotTracker) const499 void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent,
500 VPSlotTracker &SlotTracker) const {
501 O << Indent << "WIDEN-CALL ";
502
503 auto *CI = cast<CallInst>(getUnderlyingInstr());
504 if (CI->getType()->isVoidTy())
505 O << "void ";
506 else {
507 printAsOperand(O, SlotTracker);
508 O << " = ";
509 }
510
511 O << "call @" << CI->getCalledFunction()->getName() << "(";
512 printOperands(O, SlotTracker);
513 O << ")";
514
515 if (VectorIntrinsicID)
516 O << " (using vector intrinsic)";
517 else
518 O << " (using library function)";
519 }
520
print(raw_ostream & O,const Twine & Indent,VPSlotTracker & SlotTracker) const521 void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
522 VPSlotTracker &SlotTracker) const {
523 O << Indent << "WIDEN-SELECT ";
524 printAsOperand(O, SlotTracker);
525 O << " = select ";
526 getOperand(0)->printAsOperand(O, SlotTracker);
527 O << ", ";
528 getOperand(1)->printAsOperand(O, SlotTracker);
529 O << ", ";
530 getOperand(2)->printAsOperand(O, SlotTracker);
531 O << (InvariantCond ? " (condition is loop invariant)" : "");
532 }
533 #endif
534
execute(VPTransformState & State)535 void VPWidenSelectRecipe::execute(VPTransformState &State) {
536 auto &I = *cast<SelectInst>(getUnderlyingInstr());
537 State.setDebugLocFromInst(&I);
538
539 // The condition can be loop invariant but still defined inside the
540 // loop. This means that we can't just use the original 'cond' value.
541 // We have to take the 'vectorized' value and pick the first lane.
542 // Instcombine will make this a no-op.
543 auto *InvarCond =
544 InvariantCond ? State.get(getOperand(0), VPIteration(0, 0)) : nullptr;
545
546 for (unsigned Part = 0; Part < State.UF; ++Part) {
547 Value *Cond = InvarCond ? InvarCond : State.get(getOperand(0), Part);
548 Value *Op0 = State.get(getOperand(1), Part);
549 Value *Op1 = State.get(getOperand(2), Part);
550 Value *Sel = State.Builder.CreateSelect(Cond, Op0, Op1);
551 State.set(this, Sel, Part);
552 State.addMetadata(Sel, &I);
553 }
554 }
555
execute(VPTransformState & State)556 void VPWidenRecipe::execute(VPTransformState &State) {
557 auto &I = *cast<Instruction>(getUnderlyingValue());
558 auto &Builder = State.Builder;
559 switch (I.getOpcode()) {
560 case Instruction::Call:
561 case Instruction::Br:
562 case Instruction::PHI:
563 case Instruction::GetElementPtr:
564 case Instruction::Select:
565 llvm_unreachable("This instruction is handled by a different recipe.");
566 case Instruction::UDiv:
567 case Instruction::SDiv:
568 case Instruction::SRem:
569 case Instruction::URem:
570 case Instruction::Add:
571 case Instruction::FAdd:
572 case Instruction::Sub:
573 case Instruction::FSub:
574 case Instruction::FNeg:
575 case Instruction::Mul:
576 case Instruction::FMul:
577 case Instruction::FDiv:
578 case Instruction::FRem:
579 case Instruction::Shl:
580 case Instruction::LShr:
581 case Instruction::AShr:
582 case Instruction::And:
583 case Instruction::Or:
584 case Instruction::Xor: {
585 // Just widen unops and binops.
586 State.setDebugLocFromInst(&I);
587
588 for (unsigned Part = 0; Part < State.UF; ++Part) {
589 SmallVector<Value *, 2> Ops;
590 for (VPValue *VPOp : operands())
591 Ops.push_back(State.get(VPOp, Part));
592
593 Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
594
595 if (auto *VecOp = dyn_cast<Instruction>(V)) {
596 VecOp->copyIRFlags(&I);
597
598 // If the instruction is vectorized and was in a basic block that needed
599 // predication, we can't propagate poison-generating flags (nuw/nsw,
600 // exact, etc.). The control flow has been linearized and the
601 // instruction is no longer guarded by the predicate, which could make
602 // the flag properties to no longer hold.
603 if (State.MayGeneratePoisonRecipes.contains(this))
604 VecOp->dropPoisonGeneratingFlags();
605 }
606
607 // Use this vector value for all users of the original instruction.
608 State.set(this, V, Part);
609 State.addMetadata(V, &I);
610 }
611
612 break;
613 }
614 case Instruction::Freeze: {
615 State.setDebugLocFromInst(&I);
616
617 for (unsigned Part = 0; Part < State.UF; ++Part) {
618 Value *Op = State.get(getOperand(0), Part);
619
620 Value *Freeze = Builder.CreateFreeze(Op);
621 State.set(this, Freeze, Part);
622 }
623 break;
624 }
625 case Instruction::ICmp:
626 case Instruction::FCmp: {
627 // Widen compares. Generate vector compares.
628 bool FCmp = (I.getOpcode() == Instruction::FCmp);
629 auto *Cmp = cast<CmpInst>(&I);
630 State.setDebugLocFromInst(Cmp);
631 for (unsigned Part = 0; Part < State.UF; ++Part) {
632 Value *A = State.get(getOperand(0), Part);
633 Value *B = State.get(getOperand(1), Part);
634 Value *C = nullptr;
635 if (FCmp) {
636 // Propagate fast math flags.
637 IRBuilder<>::FastMathFlagGuard FMFG(Builder);
638 Builder.setFastMathFlags(Cmp->getFastMathFlags());
639 C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
640 } else {
641 C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
642 }
643 State.set(this, C, Part);
644 State.addMetadata(C, &I);
645 }
646
647 break;
648 }
649
650 case Instruction::ZExt:
651 case Instruction::SExt:
652 case Instruction::FPToUI:
653 case Instruction::FPToSI:
654 case Instruction::FPExt:
655 case Instruction::PtrToInt:
656 case Instruction::IntToPtr:
657 case Instruction::SIToFP:
658 case Instruction::UIToFP:
659 case Instruction::Trunc:
660 case Instruction::FPTrunc:
661 case Instruction::BitCast: {
662 auto *CI = cast<CastInst>(&I);
663 State.setDebugLocFromInst(CI);
664
665 /// Vectorize casts.
666 Type *DestTy = (State.VF.isScalar())
667 ? CI->getType()
668 : VectorType::get(CI->getType(), State.VF);
669
670 for (unsigned Part = 0; Part < State.UF; ++Part) {
671 Value *A = State.get(getOperand(0), Part);
672 Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
673 State.set(this, Cast, Part);
674 State.addMetadata(Cast, &I);
675 }
676 break;
677 }
678 default:
679 // This instruction is not vectorized by simple widening.
680 LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
681 llvm_unreachable("Unhandled instruction!");
682 } // end of switch.
683 }
684 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
print(raw_ostream & O,const Twine & Indent,VPSlotTracker & SlotTracker) const685 void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
686 VPSlotTracker &SlotTracker) const {
687 O << Indent << "WIDEN ";
688 printAsOperand(O, SlotTracker);
689 const Instruction *UI = getUnderlyingInstr();
690 O << " = " << UI->getOpcodeName() << " ";
691 if (auto *Cmp = dyn_cast<CmpInst>(UI))
692 O << CmpInst::getPredicateName(Cmp->getPredicate()) << " ";
693 printOperands(O, SlotTracker);
694 }
695
print(raw_ostream & O,const Twine & Indent,VPSlotTracker & SlotTracker) const696 void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent,
697 VPSlotTracker &SlotTracker) const {
698 O << Indent << "WIDEN-INDUCTION";
699 if (getTruncInst()) {
700 O << "\\l\"";
701 O << " +\n" << Indent << "\" " << VPlanIngredient(IV) << "\\l\"";
702 O << " +\n" << Indent << "\" ";
703 getVPValue(0)->printAsOperand(O, SlotTracker);
704 } else
705 O << " " << VPlanIngredient(IV);
706
707 O << ", ";
708 getStepValue()->printAsOperand(O, SlotTracker);
709 }
710 #endif
711
isCanonical() const712 bool VPWidenIntOrFpInductionRecipe::isCanonical() const {
713 auto *StartC = dyn_cast<ConstantInt>(getStartValue()->getLiveInIRValue());
714 auto *StepC = dyn_cast<SCEVConstant>(getInductionDescriptor().getStep());
715 return StartC && StartC->isZero() && StepC && StepC->isOne();
716 }
717
718 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
print(raw_ostream & O,const Twine & Indent,VPSlotTracker & SlotTracker) const719 void VPDerivedIVRecipe::print(raw_ostream &O, const Twine &Indent,
720 VPSlotTracker &SlotTracker) const {
721 O << Indent;
722 printAsOperand(O, SlotTracker);
723 O << Indent << "= DERIVED-IV ";
724 getStartValue()->printAsOperand(O, SlotTracker);
725 O << " + ";
726 getCanonicalIV()->printAsOperand(O, SlotTracker);
727 O << " * ";
728 getStepValue()->printAsOperand(O, SlotTracker);
729
730 if (IndDesc.getStep()->getType() != ResultTy)
731 O << " (truncated to " << *ResultTy << ")";
732 }
733 #endif
734
735 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
print(raw_ostream & O,const Twine & Indent,VPSlotTracker & SlotTracker) const736 void VPScalarIVStepsRecipe::print(raw_ostream &O, const Twine &Indent,
737 VPSlotTracker &SlotTracker) const {
738 O << Indent;
739 printAsOperand(O, SlotTracker);
740 O << Indent << "= SCALAR-STEPS ";
741 printOperands(O, SlotTracker);
742 }
743 #endif
744
execute(VPTransformState & State)745 void VPWidenGEPRecipe::execute(VPTransformState &State) {
746 auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr());
747 // Construct a vector GEP by widening the operands of the scalar GEP as
748 // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
749 // results in a vector of pointers when at least one operand of the GEP
750 // is vector-typed. Thus, to keep the representation compact, we only use
751 // vector-typed operands for loop-varying values.
752
753 if (State.VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
754 // If we are vectorizing, but the GEP has only loop-invariant operands,
755 // the GEP we build (by only using vector-typed operands for
756 // loop-varying values) would be a scalar pointer. Thus, to ensure we
757 // produce a vector of pointers, we need to either arbitrarily pick an
758 // operand to broadcast, or broadcast a clone of the original GEP.
759 // Here, we broadcast a clone of the original.
760 //
761 // TODO: If at some point we decide to scalarize instructions having
762 // loop-invariant operands, this special case will no longer be
763 // required. We would add the scalarization decision to
764 // collectLoopScalars() and teach getVectorValue() to broadcast
765 // the lane-zero scalar value.
766 auto *Clone = State.Builder.Insert(GEP->clone());
767 for (unsigned Part = 0; Part < State.UF; ++Part) {
768 Value *EntryPart = State.Builder.CreateVectorSplat(State.VF, Clone);
769 State.set(this, EntryPart, Part);
770 State.addMetadata(EntryPart, GEP);
771 }
772 } else {
773 // If the GEP has at least one loop-varying operand, we are sure to
774 // produce a vector of pointers. But if we are only unrolling, we want
775 // to produce a scalar GEP for each unroll part. Thus, the GEP we
776 // produce with the code below will be scalar (if VF == 1) or vector
777 // (otherwise). Note that for the unroll-only case, we still maintain
778 // values in the vector mapping with initVector, as we do for other
779 // instructions.
780 for (unsigned Part = 0; Part < State.UF; ++Part) {
781 // The pointer operand of the new GEP. If it's loop-invariant, we
782 // won't broadcast it.
783 auto *Ptr = IsPtrLoopInvariant
784 ? State.get(getOperand(0), VPIteration(0, 0))
785 : State.get(getOperand(0), Part);
786
787 // Collect all the indices for the new GEP. If any index is
788 // loop-invariant, we won't broadcast it.
789 SmallVector<Value *, 4> Indices;
790 for (unsigned I = 1, E = getNumOperands(); I < E; I++) {
791 VPValue *Operand = getOperand(I);
792 if (IsIndexLoopInvariant[I - 1])
793 Indices.push_back(State.get(Operand, VPIteration(0, 0)));
794 else
795 Indices.push_back(State.get(Operand, Part));
796 }
797
798 // If the GEP instruction is vectorized and was in a basic block that
799 // needed predication, we can't propagate the poison-generating 'inbounds'
800 // flag. The control flow has been linearized and the GEP is no longer
801 // guarded by the predicate, which could make the 'inbounds' properties to
802 // no longer hold.
803 bool IsInBounds =
804 GEP->isInBounds() && State.MayGeneratePoisonRecipes.count(this) == 0;
805
806 // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
807 // but it should be a vector, otherwise.
808 auto *NewGEP = State.Builder.CreateGEP(GEP->getSourceElementType(), Ptr,
809 Indices, "", IsInBounds);
810 assert((State.VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
811 "NewGEP is not a pointer vector");
812 State.set(this, NewGEP, Part);
813 State.addMetadata(NewGEP, GEP);
814 }
815 }
816 }
817
818 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
print(raw_ostream & O,const Twine & Indent,VPSlotTracker & SlotTracker) const819 void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent,
820 VPSlotTracker &SlotTracker) const {
821 O << Indent << "WIDEN-GEP ";
822 O << (IsPtrLoopInvariant ? "Inv" : "Var");
823 size_t IndicesNumber = IsIndexLoopInvariant.size();
824 for (size_t I = 0; I < IndicesNumber; ++I)
825 O << "[" << (IsIndexLoopInvariant[I] ? "Inv" : "Var") << "]";
826
827 O << " ";
828 printAsOperand(O, SlotTracker);
829 O << " = getelementptr ";
830 printOperands(O, SlotTracker);
831 }
832 #endif
833
execute(VPTransformState & State)834 void VPBlendRecipe::execute(VPTransformState &State) {
835 State.setDebugLocFromInst(Phi);
836 // We know that all PHIs in non-header blocks are converted into
837 // selects, so we don't have to worry about the insertion order and we
838 // can just use the builder.
839 // At this point we generate the predication tree. There may be
840 // duplications since this is a simple recursive scan, but future
841 // optimizations will clean it up.
842
843 unsigned NumIncoming = getNumIncomingValues();
844
845 // Generate a sequence of selects of the form:
846 // SELECT(Mask3, In3,
847 // SELECT(Mask2, In2,
848 // SELECT(Mask1, In1,
849 // In0)))
850 // Note that Mask0 is never used: lanes for which no path reaches this phi and
851 // are essentially undef are taken from In0.
852 VectorParts Entry(State.UF);
853 for (unsigned In = 0; In < NumIncoming; ++In) {
854 for (unsigned Part = 0; Part < State.UF; ++Part) {
855 // We might have single edge PHIs (blocks) - use an identity
856 // 'select' for the first PHI operand.
857 Value *In0 = State.get(getIncomingValue(In), Part);
858 if (In == 0)
859 Entry[Part] = In0; // Initialize with the first incoming value.
860 else {
861 // Select between the current value and the previous incoming edge
862 // based on the incoming mask.
863 Value *Cond = State.get(getMask(In), Part);
864 Entry[Part] =
865 State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
866 }
867 }
868 }
869 for (unsigned Part = 0; Part < State.UF; ++Part)
870 State.set(this, Entry[Part], Part);
871 }
872
873 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
print(raw_ostream & O,const Twine & Indent,VPSlotTracker & SlotTracker) const874 void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent,
875 VPSlotTracker &SlotTracker) const {
876 O << Indent << "BLEND ";
877 Phi->printAsOperand(O, false);
878 O << " =";
879 if (getNumIncomingValues() == 1) {
880 // Not a User of any mask: not really blending, this is a
881 // single-predecessor phi.
882 O << " ";
883 getIncomingValue(0)->printAsOperand(O, SlotTracker);
884 } else {
885 for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) {
886 O << " ";
887 getIncomingValue(I)->printAsOperand(O, SlotTracker);
888 O << "/";
889 getMask(I)->printAsOperand(O, SlotTracker);
890 }
891 }
892 }
893
print(raw_ostream & O,const Twine & Indent,VPSlotTracker & SlotTracker) const894 void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent,
895 VPSlotTracker &SlotTracker) const {
896 O << Indent << "REDUCE ";
897 printAsOperand(O, SlotTracker);
898 O << " = ";
899 getChainOp()->printAsOperand(O, SlotTracker);
900 O << " +";
901 if (isa<FPMathOperator>(getUnderlyingInstr()))
902 O << getUnderlyingInstr()->getFastMathFlags();
903 O << " reduce." << Instruction::getOpcodeName(RdxDesc->getOpcode()) << " (";
904 getVecOp()->printAsOperand(O, SlotTracker);
905 if (getCondOp()) {
906 O << ", ";
907 getCondOp()->printAsOperand(O, SlotTracker);
908 }
909 O << ")";
910 if (RdxDesc->IntermediateStore)
911 O << " (with final reduction value stored in invariant address sank "
912 "outside of loop)";
913 }
914
print(raw_ostream & O,const Twine & Indent,VPSlotTracker & SlotTracker) const915 void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent,
916 VPSlotTracker &SlotTracker) const {
917 O << Indent << (IsUniform ? "CLONE " : "REPLICATE ");
918
919 if (!getUnderlyingInstr()->getType()->isVoidTy()) {
920 printAsOperand(O, SlotTracker);
921 O << " = ";
922 }
923 if (auto *CB = dyn_cast<CallBase>(getUnderlyingInstr())) {
924 O << "call @" << CB->getCalledFunction()->getName() << "(";
925 interleaveComma(make_range(op_begin(), op_begin() + (getNumOperands() - 1)),
926 O, [&O, &SlotTracker](VPValue *Op) {
927 Op->printAsOperand(O, SlotTracker);
928 });
929 O << ")";
930 } else {
931 O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode()) << " ";
932 printOperands(O, SlotTracker);
933 }
934
935 if (AlsoPack)
936 O << " (S->V)";
937 }
938 #endif
939
execute(VPTransformState & State)940 void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
941 assert(State.Instance && "Branch on Mask works only on single instance.");
942
943 unsigned Part = State.Instance->Part;
944 unsigned Lane = State.Instance->Lane.getKnownLane();
945
946 Value *ConditionBit = nullptr;
947 VPValue *BlockInMask = getMask();
948 if (BlockInMask) {
949 ConditionBit = State.get(BlockInMask, Part);
950 if (ConditionBit->getType()->isVectorTy())
951 ConditionBit = State.Builder.CreateExtractElement(
952 ConditionBit, State.Builder.getInt32(Lane));
953 } else // Block in mask is all-one.
954 ConditionBit = State.Builder.getTrue();
955
956 // Replace the temporary unreachable terminator with a new conditional branch,
957 // whose two destinations will be set later when they are created.
958 auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
959 assert(isa<UnreachableInst>(CurrentTerminator) &&
960 "Expected to replace unreachable terminator with conditional branch.");
961 auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
962 CondBr->setSuccessor(0, nullptr);
963 ReplaceInstWithInst(CurrentTerminator, CondBr);
964 }
965
execute(VPTransformState & State)966 void VPPredInstPHIRecipe::execute(VPTransformState &State) {
967 assert(State.Instance && "Predicated instruction PHI works per instance.");
968 Instruction *ScalarPredInst =
969 cast<Instruction>(State.get(getOperand(0), *State.Instance));
970 BasicBlock *PredicatedBB = ScalarPredInst->getParent();
971 BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
972 assert(PredicatingBB && "Predicated block has no single predecessor.");
973 assert(isa<VPReplicateRecipe>(getOperand(0)) &&
974 "operand must be VPReplicateRecipe");
975
976 // By current pack/unpack logic we need to generate only a single phi node: if
977 // a vector value for the predicated instruction exists at this point it means
978 // the instruction has vector users only, and a phi for the vector value is
979 // needed. In this case the recipe of the predicated instruction is marked to
980 // also do that packing, thereby "hoisting" the insert-element sequence.
981 // Otherwise, a phi node for the scalar value is needed.
982 unsigned Part = State.Instance->Part;
983 if (State.hasVectorValue(getOperand(0), Part)) {
984 Value *VectorValue = State.get(getOperand(0), Part);
985 InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
986 PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
987 VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
988 VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
989 if (State.hasVectorValue(this, Part))
990 State.reset(this, VPhi, Part);
991 else
992 State.set(this, VPhi, Part);
993 // NOTE: Currently we need to update the value of the operand, so the next
994 // predicated iteration inserts its generated value in the correct vector.
995 State.reset(getOperand(0), VPhi, Part);
996 } else {
997 Type *PredInstType = getOperand(0)->getUnderlyingValue()->getType();
998 PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
999 Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()),
1000 PredicatingBB);
1001 Phi->addIncoming(ScalarPredInst, PredicatedBB);
1002 if (State.hasScalarValue(this, *State.Instance))
1003 State.reset(this, Phi, *State.Instance);
1004 else
1005 State.set(this, Phi, *State.Instance);
1006 // NOTE: Currently we need to update the value of the operand, so the next
1007 // predicated iteration inserts its generated value in the correct vector.
1008 State.reset(getOperand(0), Phi, *State.Instance);
1009 }
1010 }
1011
1012 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
print(raw_ostream & O,const Twine & Indent,VPSlotTracker & SlotTracker) const1013 void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent,
1014 VPSlotTracker &SlotTracker) const {
1015 O << Indent << "PHI-PREDICATED-INSTRUCTION ";
1016 printAsOperand(O, SlotTracker);
1017 O << " = ";
1018 printOperands(O, SlotTracker);
1019 }
1020
print(raw_ostream & O,const Twine & Indent,VPSlotTracker & SlotTracker) const1021 void VPWidenMemoryInstructionRecipe::print(raw_ostream &O, const Twine &Indent,
1022 VPSlotTracker &SlotTracker) const {
1023 O << Indent << "WIDEN ";
1024
1025 if (!isStore()) {
1026 getVPSingleValue()->printAsOperand(O, SlotTracker);
1027 O << " = ";
1028 }
1029 O << Instruction::getOpcodeName(Ingredient.getOpcode()) << " ";
1030
1031 printOperands(O, SlotTracker);
1032 }
1033 #endif
1034
execute(VPTransformState & State)1035 void VPCanonicalIVPHIRecipe::execute(VPTransformState &State) {
1036 Value *Start = getStartValue()->getLiveInIRValue();
1037 PHINode *EntryPart = PHINode::Create(
1038 Start->getType(), 2, "index", &*State.CFG.PrevBB->getFirstInsertionPt());
1039
1040 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
1041 EntryPart->addIncoming(Start, VectorPH);
1042 EntryPart->setDebugLoc(DL);
1043 for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
1044 State.set(this, EntryPart, Part);
1045 }
1046
1047 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
print(raw_ostream & O,const Twine & Indent,VPSlotTracker & SlotTracker) const1048 void VPCanonicalIVPHIRecipe::print(raw_ostream &O, const Twine &Indent,
1049 VPSlotTracker &SlotTracker) const {
1050 O << Indent << "EMIT ";
1051 printAsOperand(O, SlotTracker);
1052 O << " = CANONICAL-INDUCTION";
1053 }
1054 #endif
1055
isCanonical(const InductionDescriptor & ID,Type * Ty) const1056 bool VPCanonicalIVPHIRecipe::isCanonical(const InductionDescriptor &ID,
1057 Type *Ty) const {
1058 if (Ty != getScalarType())
1059 return false;
1060 // The start value of ID must match the start value of this canonical
1061 // induction.
1062 if (getStartValue()->getLiveInIRValue() != ID.getStartValue())
1063 return false;
1064
1065 ConstantInt *Step = ID.getConstIntStepValue();
1066 // ID must also be incremented by one. IK_IntInduction always increment the
1067 // induction by Step, but the binary op may not be set.
1068 return ID.getKind() == InductionDescriptor::IK_IntInduction && Step &&
1069 Step->isOne();
1070 }
1071
onlyScalarsGenerated(ElementCount VF)1072 bool VPWidenPointerInductionRecipe::onlyScalarsGenerated(ElementCount VF) {
1073 return IsScalarAfterVectorization &&
1074 (!VF.isScalable() || vputils::onlyFirstLaneUsed(this));
1075 }
1076
1077 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
print(raw_ostream & O,const Twine & Indent,VPSlotTracker & SlotTracker) const1078 void VPWidenPointerInductionRecipe::print(raw_ostream &O, const Twine &Indent,
1079 VPSlotTracker &SlotTracker) const {
1080 O << Indent << "EMIT ";
1081 printAsOperand(O, SlotTracker);
1082 O << " = WIDEN-POINTER-INDUCTION ";
1083 getStartValue()->printAsOperand(O, SlotTracker);
1084 O << ", " << *IndDesc.getStep();
1085 }
1086 #endif
1087
execute(VPTransformState & State)1088 void VPExpandSCEVRecipe::execute(VPTransformState &State) {
1089 assert(!State.Instance && "cannot be used in per-lane");
1090 const DataLayout &DL = State.CFG.PrevBB->getModule()->getDataLayout();
1091 SCEVExpander Exp(SE, DL, "induction");
1092
1093 Value *Res = Exp.expandCodeFor(Expr, Expr->getType(),
1094 &*State.Builder.GetInsertPoint());
1095
1096 for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part)
1097 State.set(this, Res, Part);
1098 }
1099
1100 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
print(raw_ostream & O,const Twine & Indent,VPSlotTracker & SlotTracker) const1101 void VPExpandSCEVRecipe::print(raw_ostream &O, const Twine &Indent,
1102 VPSlotTracker &SlotTracker) const {
1103 O << Indent << "EMIT ";
1104 getVPSingleValue()->printAsOperand(O, SlotTracker);
1105 O << " = EXPAND SCEV " << *Expr;
1106 }
1107 #endif
1108
execute(VPTransformState & State)1109 void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
1110 Value *CanonicalIV = State.get(getOperand(0), 0);
1111 Type *STy = CanonicalIV->getType();
1112 IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
1113 ElementCount VF = State.VF;
1114 Value *VStart = VF.isScalar()
1115 ? CanonicalIV
1116 : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast");
1117 for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
1118 Value *VStep = createStepForVF(Builder, STy, VF, Part);
1119 if (VF.isVector()) {
1120 VStep = Builder.CreateVectorSplat(VF, VStep);
1121 VStep =
1122 Builder.CreateAdd(VStep, Builder.CreateStepVector(VStep->getType()));
1123 }
1124 Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv");
1125 State.set(this, CanonicalVectorIV, Part);
1126 }
1127 }
1128
1129 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
print(raw_ostream & O,const Twine & Indent,VPSlotTracker & SlotTracker) const1130 void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent,
1131 VPSlotTracker &SlotTracker) const {
1132 O << Indent << "EMIT ";
1133 printAsOperand(O, SlotTracker);
1134 O << " = WIDEN-CANONICAL-INDUCTION ";
1135 printOperands(O, SlotTracker);
1136 }
1137 #endif
1138
execute(VPTransformState & State)1139 void VPFirstOrderRecurrencePHIRecipe::execute(VPTransformState &State) {
1140 auto &Builder = State.Builder;
1141 // Create a vector from the initial value.
1142 auto *VectorInit = getStartValue()->getLiveInIRValue();
1143
1144 Type *VecTy = State.VF.isScalar()
1145 ? VectorInit->getType()
1146 : VectorType::get(VectorInit->getType(), State.VF);
1147
1148 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
1149 if (State.VF.isVector()) {
1150 auto *IdxTy = Builder.getInt32Ty();
1151 auto *One = ConstantInt::get(IdxTy, 1);
1152 IRBuilder<>::InsertPointGuard Guard(Builder);
1153 Builder.SetInsertPoint(VectorPH->getTerminator());
1154 auto *RuntimeVF = getRuntimeVF(Builder, IdxTy, State.VF);
1155 auto *LastIdx = Builder.CreateSub(RuntimeVF, One);
1156 VectorInit = Builder.CreateInsertElement(
1157 PoisonValue::get(VecTy), VectorInit, LastIdx, "vector.recur.init");
1158 }
1159
1160 // Create a phi node for the new recurrence.
1161 PHINode *EntryPart = PHINode::Create(
1162 VecTy, 2, "vector.recur", &*State.CFG.PrevBB->getFirstInsertionPt());
1163 EntryPart->addIncoming(VectorInit, VectorPH);
1164 State.set(this, EntryPart, 0);
1165 }
1166
1167 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
print(raw_ostream & O,const Twine & Indent,VPSlotTracker & SlotTracker) const1168 void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent,
1169 VPSlotTracker &SlotTracker) const {
1170 O << Indent << "FIRST-ORDER-RECURRENCE-PHI ";
1171 printAsOperand(O, SlotTracker);
1172 O << " = phi ";
1173 printOperands(O, SlotTracker);
1174 }
1175 #endif
1176
execute(VPTransformState & State)1177 void VPReductionPHIRecipe::execute(VPTransformState &State) {
1178 PHINode *PN = cast<PHINode>(getUnderlyingValue());
1179 auto &Builder = State.Builder;
1180
1181 // In order to support recurrences we need to be able to vectorize Phi nodes.
1182 // Phi nodes have cycles, so we need to vectorize them in two stages. This is
1183 // stage #1: We create a new vector PHI node with no incoming edges. We'll use
1184 // this value when we vectorize all of the instructions that use the PHI.
1185 bool ScalarPHI = State.VF.isScalar() || IsInLoop;
1186 Type *VecTy =
1187 ScalarPHI ? PN->getType() : VectorType::get(PN->getType(), State.VF);
1188
1189 BasicBlock *HeaderBB = State.CFG.PrevBB;
1190 assert(State.CurrentVectorLoop->getHeader() == HeaderBB &&
1191 "recipe must be in the vector loop header");
1192 unsigned LastPartForNewPhi = isOrdered() ? 1 : State.UF;
1193 for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
1194 Value *EntryPart =
1195 PHINode::Create(VecTy, 2, "vec.phi", &*HeaderBB->getFirstInsertionPt());
1196 State.set(this, EntryPart, Part);
1197 }
1198
1199 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
1200
1201 // Reductions do not have to start at zero. They can start with
1202 // any loop invariant values.
1203 VPValue *StartVPV = getStartValue();
1204 Value *StartV = StartVPV->getLiveInIRValue();
1205
1206 Value *Iden = nullptr;
1207 RecurKind RK = RdxDesc.getRecurrenceKind();
1208 if (RecurrenceDescriptor::isMinMaxRecurrenceKind(RK) ||
1209 RecurrenceDescriptor::isSelectCmpRecurrenceKind(RK)) {
1210 // MinMax reduction have the start value as their identify.
1211 if (ScalarPHI) {
1212 Iden = StartV;
1213 } else {
1214 IRBuilderBase::InsertPointGuard IPBuilder(Builder);
1215 Builder.SetInsertPoint(VectorPH->getTerminator());
1216 StartV = Iden =
1217 Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident");
1218 }
1219 } else {
1220 Iden = RdxDesc.getRecurrenceIdentity(RK, VecTy->getScalarType(),
1221 RdxDesc.getFastMathFlags());
1222
1223 if (!ScalarPHI) {
1224 Iden = Builder.CreateVectorSplat(State.VF, Iden);
1225 IRBuilderBase::InsertPointGuard IPBuilder(Builder);
1226 Builder.SetInsertPoint(VectorPH->getTerminator());
1227 Constant *Zero = Builder.getInt32(0);
1228 StartV = Builder.CreateInsertElement(Iden, StartV, Zero);
1229 }
1230 }
1231
1232 for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
1233 Value *EntryPart = State.get(this, Part);
1234 // Make sure to add the reduction start value only to the
1235 // first unroll part.
1236 Value *StartVal = (Part == 0) ? StartV : Iden;
1237 cast<PHINode>(EntryPart)->addIncoming(StartVal, VectorPH);
1238 }
1239 }
1240
1241 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
print(raw_ostream & O,const Twine & Indent,VPSlotTracker & SlotTracker) const1242 void VPReductionPHIRecipe::print(raw_ostream &O, const Twine &Indent,
1243 VPSlotTracker &SlotTracker) const {
1244 O << Indent << "WIDEN-REDUCTION-PHI ";
1245
1246 printAsOperand(O, SlotTracker);
1247 O << " = phi ";
1248 printOperands(O, SlotTracker);
1249 }
1250 #endif
1251
execute(VPTransformState & State)1252 void VPWidenPHIRecipe::execute(VPTransformState &State) {
1253 assert(EnableVPlanNativePath &&
1254 "Non-native vplans are not expected to have VPWidenPHIRecipes.");
1255
1256 // Currently we enter here in the VPlan-native path for non-induction
1257 // PHIs where all control flow is uniform. We simply widen these PHIs.
1258 // Create a vector phi with no operands - the vector phi operands will be
1259 // set at the end of vector code generation.
1260 VPBasicBlock *Parent = getParent();
1261 VPRegionBlock *LoopRegion = Parent->getEnclosingLoopRegion();
1262 unsigned StartIdx = 0;
1263 // For phis in header blocks of loop regions, use the index of the value
1264 // coming from the preheader.
1265 if (LoopRegion->getEntryBasicBlock() == Parent) {
1266 for (unsigned I = 0; I < getNumOperands(); ++I) {
1267 if (getIncomingBlock(I) ==
1268 LoopRegion->getSinglePredecessor()->getExitingBasicBlock())
1269 StartIdx = I;
1270 }
1271 }
1272 Value *Op0 = State.get(getOperand(StartIdx), 0);
1273 Type *VecTy = Op0->getType();
1274 Value *VecPhi = State.Builder.CreatePHI(VecTy, 2, "vec.phi");
1275 State.set(this, VecPhi, 0);
1276 }
1277
1278 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
print(raw_ostream & O,const Twine & Indent,VPSlotTracker & SlotTracker) const1279 void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent,
1280 VPSlotTracker &SlotTracker) const {
1281 O << Indent << "WIDEN-PHI ";
1282
1283 auto *OriginalPhi = cast<PHINode>(getUnderlyingValue());
1284 // Unless all incoming values are modeled in VPlan print the original PHI
1285 // directly.
1286 // TODO: Remove once all VPWidenPHIRecipe instances keep all relevant incoming
1287 // values as VPValues.
1288 if (getNumOperands() != OriginalPhi->getNumOperands()) {
1289 O << VPlanIngredient(OriginalPhi);
1290 return;
1291 }
1292
1293 printAsOperand(O, SlotTracker);
1294 O << " = phi ";
1295 printOperands(O, SlotTracker);
1296 }
1297 #endif
1298
1299 // TODO: It would be good to use the existing VPWidenPHIRecipe instead and
1300 // remove VPActiveLaneMaskPHIRecipe.
execute(VPTransformState & State)1301 void VPActiveLaneMaskPHIRecipe::execute(VPTransformState &State) {
1302 BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
1303 for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
1304 Value *StartMask = State.get(getOperand(0), Part);
1305 PHINode *EntryPart =
1306 State.Builder.CreatePHI(StartMask->getType(), 2, "active.lane.mask");
1307 EntryPart->addIncoming(StartMask, VectorPH);
1308 EntryPart->setDebugLoc(DL);
1309 State.set(this, EntryPart, Part);
1310 }
1311 }
1312
1313 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
print(raw_ostream & O,const Twine & Indent,VPSlotTracker & SlotTracker) const1314 void VPActiveLaneMaskPHIRecipe::print(raw_ostream &O, const Twine &Indent,
1315 VPSlotTracker &SlotTracker) const {
1316 O << Indent << "ACTIVE-LANE-MASK-PHI ";
1317
1318 printAsOperand(O, SlotTracker);
1319 O << " = phi ";
1320 printOperands(O, SlotTracker);
1321 }
1322 #endif
1323