1 //===- VPlan.h - Represent A Vectorizer Plan --------------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// This file contains the declarations of the Vectorization Plan base classes:
11 /// 1. VPBasicBlock and VPRegionBlock that inherit from a common pure virtual
12 ///    VPBlockBase, together implementing a Hierarchical CFG;
13 /// 2. Pure virtual VPRecipeBase serving as the base class for recipes contained
14 ///    within VPBasicBlocks;
15 /// 3. VPInstruction, a concrete Recipe and VPUser modeling a single planned
16 ///    instruction;
17 /// 4. The VPlan class holding a candidate for vectorization;
18 /// 5. The VPlanPrinter class providing a way to print a plan in dot format;
19 /// These are documented in docs/VectorizationPlan.rst.
20 //
21 //===----------------------------------------------------------------------===//
22 
23 #ifndef LLVM_TRANSFORMS_VECTORIZE_VPLAN_H
24 #define LLVM_TRANSFORMS_VECTORIZE_VPLAN_H
25 
26 #include "VPlanValue.h"
27 #include "llvm/ADT/DenseMap.h"
28 #include "llvm/ADT/DepthFirstIterator.h"
29 #include "llvm/ADT/MapVector.h"
30 #include "llvm/ADT/SmallBitVector.h"
31 #include "llvm/ADT/SmallPtrSet.h"
32 #include "llvm/ADT/SmallVector.h"
33 #include "llvm/ADT/Twine.h"
34 #include "llvm/ADT/ilist.h"
35 #include "llvm/ADT/ilist_node.h"
36 #include "llvm/Analysis/LoopInfo.h"
37 #include "llvm/Analysis/VectorUtils.h"
38 #include "llvm/IR/DebugLoc.h"
39 #include "llvm/IR/FMF.h"
40 #include "llvm/Transforms/Utils/LoopVersioning.h"
41 #include <algorithm>
42 #include <cassert>
43 #include <cstddef>
44 #include <string>
45 
46 namespace llvm {
47 
48 class BasicBlock;
49 class DominatorTree;
50 class InductionDescriptor;
51 class InnerLoopVectorizer;
52 class IRBuilderBase;
53 class LoopInfo;
54 class PredicateScalarEvolution;
55 class raw_ostream;
56 class RecurrenceDescriptor;
57 class SCEV;
58 class Type;
59 class VPBasicBlock;
60 class VPRegionBlock;
61 class VPlan;
62 class VPReplicateRecipe;
63 class VPlanSlp;
64 class Value;
65 
66 namespace Intrinsic {
67 typedef unsigned ID;
68 }
69 
70 /// Returns a calculation for the total number of elements for a given \p VF.
71 /// For fixed width vectors this value is a constant, whereas for scalable
72 /// vectors it is an expression determined at runtime.
73 Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF);
74 
75 /// Return a value for Step multiplied by VF.
76 Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
77                        int64_t Step);
78 
79 const SCEV *createTripCountSCEV(Type *IdxTy, PredicatedScalarEvolution &PSE);
80 
81 /// A range of powers-of-2 vectorization factors with fixed start and
82 /// adjustable end. The range includes start and excludes end, e.g.,:
83 /// [1, 9) = {1, 2, 4, 8}
84 struct VFRange {
85   // A power of 2.
86   const ElementCount Start;
87 
88   // Need not be a power of 2. If End <= Start range is empty.
89   ElementCount End;
90 
91   bool isEmpty() const {
92     return End.getKnownMinValue() <= Start.getKnownMinValue();
93   }
94 
95   VFRange(const ElementCount &Start, const ElementCount &End)
96       : Start(Start), End(End) {
97     assert(Start.isScalable() == End.isScalable() &&
98            "Both Start and End should have the same scalable flag");
99     assert(isPowerOf2_32(Start.getKnownMinValue()) &&
100            "Expected Start to be a power of 2");
101   }
102 };
103 
104 using VPlanPtr = std::unique_ptr<VPlan>;
105 
106 /// In what follows, the term "input IR" refers to code that is fed into the
107 /// vectorizer whereas the term "output IR" refers to code that is generated by
108 /// the vectorizer.
109 
110 /// VPLane provides a way to access lanes in both fixed width and scalable
111 /// vectors, where for the latter the lane index sometimes needs calculating
112 /// as a runtime expression.
113 class VPLane {
114 public:
115   /// Kind describes how to interpret Lane.
116   enum class Kind : uint8_t {
117     /// For First, Lane is the index into the first N elements of a
118     /// fixed-vector <N x <ElTy>> or a scalable vector <vscale x N x <ElTy>>.
119     First,
120     /// For ScalableLast, Lane is the offset from the start of the last
121     /// N-element subvector in a scalable vector <vscale x N x <ElTy>>. For
122     /// example, a Lane of 0 corresponds to lane `(vscale - 1) * N`, a Lane of
123     /// 1 corresponds to `((vscale - 1) * N) + 1`, etc.
124     ScalableLast
125   };
126 
127 private:
128   /// in [0..VF)
129   unsigned Lane;
130 
131   /// Indicates how the Lane should be interpreted, as described above.
132   Kind LaneKind;
133 
134 public:
135   VPLane(unsigned Lane, Kind LaneKind) : Lane(Lane), LaneKind(LaneKind) {}
136 
137   static VPLane getFirstLane() { return VPLane(0, VPLane::Kind::First); }
138 
139   static VPLane getLastLaneForVF(const ElementCount &VF) {
140     unsigned LaneOffset = VF.getKnownMinValue() - 1;
141     Kind LaneKind;
142     if (VF.isScalable())
143       // In this case 'LaneOffset' refers to the offset from the start of the
144       // last subvector with VF.getKnownMinValue() elements.
145       LaneKind = VPLane::Kind::ScalableLast;
146     else
147       LaneKind = VPLane::Kind::First;
148     return VPLane(LaneOffset, LaneKind);
149   }
150 
151   /// Returns a compile-time known value for the lane index and asserts if the
152   /// lane can only be calculated at runtime.
153   unsigned getKnownLane() const {
154     assert(LaneKind == Kind::First);
155     return Lane;
156   }
157 
158   /// Returns an expression describing the lane index that can be used at
159   /// runtime.
160   Value *getAsRuntimeExpr(IRBuilderBase &Builder, const ElementCount &VF) const;
161 
162   /// Returns the Kind of lane offset.
163   Kind getKind() const { return LaneKind; }
164 
165   /// Returns true if this is the first lane of the whole vector.
166   bool isFirstLane() const { return Lane == 0 && LaneKind == Kind::First; }
167 
168   /// Maps the lane to a cache index based on \p VF.
169   unsigned mapToCacheIndex(const ElementCount &VF) const {
170     switch (LaneKind) {
171     case VPLane::Kind::ScalableLast:
172       assert(VF.isScalable() && Lane < VF.getKnownMinValue());
173       return VF.getKnownMinValue() + Lane;
174     default:
175       assert(Lane < VF.getKnownMinValue());
176       return Lane;
177     }
178   }
179 
180   /// Returns the maxmimum number of lanes that we are able to consider
181   /// caching for \p VF.
182   static unsigned getNumCachedLanes(const ElementCount &VF) {
183     return VF.getKnownMinValue() * (VF.isScalable() ? 2 : 1);
184   }
185 };
186 
187 /// VPIteration represents a single point in the iteration space of the output
188 /// (vectorized and/or unrolled) IR loop.
189 struct VPIteration {
190   /// in [0..UF)
191   unsigned Part;
192 
193   VPLane Lane;
194 
195   VPIteration(unsigned Part, unsigned Lane,
196               VPLane::Kind Kind = VPLane::Kind::First)
197       : Part(Part), Lane(Lane, Kind) {}
198 
199   VPIteration(unsigned Part, const VPLane &Lane) : Part(Part), Lane(Lane) {}
200 
201   bool isFirstIteration() const { return Part == 0 && Lane.isFirstLane(); }
202 };
203 
204 /// VPTransformState holds information passed down when "executing" a VPlan,
205 /// needed for generating the output IR.
206 struct VPTransformState {
207   VPTransformState(ElementCount VF, unsigned UF, LoopInfo *LI,
208                    DominatorTree *DT, IRBuilderBase &Builder,
209                    InnerLoopVectorizer *ILV, VPlan *Plan)
210       : VF(VF), UF(UF), LI(LI), DT(DT), Builder(Builder), ILV(ILV), Plan(Plan),
211         LVer(nullptr) {}
212 
213   /// The chosen Vectorization and Unroll Factors of the loop being vectorized.
214   ElementCount VF;
215   unsigned UF;
216 
217   /// Hold the indices to generate specific scalar instructions. Null indicates
218   /// that all instances are to be generated, using either scalar or vector
219   /// instructions.
220   std::optional<VPIteration> Instance;
221 
222   struct DataState {
223     /// A type for vectorized values in the new loop. Each value from the
224     /// original loop, when vectorized, is represented by UF vector values in
225     /// the new unrolled loop, where UF is the unroll factor.
226     typedef SmallVector<Value *, 2> PerPartValuesTy;
227 
228     DenseMap<VPValue *, PerPartValuesTy> PerPartOutput;
229 
230     using ScalarsPerPartValuesTy = SmallVector<SmallVector<Value *, 4>, 2>;
231     DenseMap<VPValue *, ScalarsPerPartValuesTy> PerPartScalars;
232   } Data;
233 
234   /// Get the generated Value for a given VPValue and a given Part. Note that
235   /// as some Defs are still created by ILV and managed in its ValueMap, this
236   /// method will delegate the call to ILV in such cases in order to provide
237   /// callers a consistent API.
238   /// \see set.
239   Value *get(VPValue *Def, unsigned Part);
240 
241   /// Get the generated Value for a given VPValue and given Part and Lane.
242   Value *get(VPValue *Def, const VPIteration &Instance);
243 
244   bool hasVectorValue(VPValue *Def, unsigned Part) {
245     auto I = Data.PerPartOutput.find(Def);
246     return I != Data.PerPartOutput.end() && Part < I->second.size() &&
247            I->second[Part];
248   }
249 
250   bool hasAnyVectorValue(VPValue *Def) const {
251     return Data.PerPartOutput.find(Def) != Data.PerPartOutput.end();
252   }
253 
254   bool hasScalarValue(VPValue *Def, VPIteration Instance) {
255     auto I = Data.PerPartScalars.find(Def);
256     if (I == Data.PerPartScalars.end())
257       return false;
258     unsigned CacheIdx = Instance.Lane.mapToCacheIndex(VF);
259     return Instance.Part < I->second.size() &&
260            CacheIdx < I->second[Instance.Part].size() &&
261            I->second[Instance.Part][CacheIdx];
262   }
263 
264   /// Set the generated Value for a given VPValue and a given Part.
265   void set(VPValue *Def, Value *V, unsigned Part) {
266     if (!Data.PerPartOutput.count(Def)) {
267       DataState::PerPartValuesTy Entry(UF);
268       Data.PerPartOutput[Def] = Entry;
269     }
270     Data.PerPartOutput[Def][Part] = V;
271   }
272   /// Reset an existing vector value for \p Def and a given \p Part.
273   void reset(VPValue *Def, Value *V, unsigned Part) {
274     auto Iter = Data.PerPartOutput.find(Def);
275     assert(Iter != Data.PerPartOutput.end() &&
276            "need to overwrite existing value");
277     Iter->second[Part] = V;
278   }
279 
280   /// Set the generated scalar \p V for \p Def and the given \p Instance.
281   void set(VPValue *Def, Value *V, const VPIteration &Instance) {
282     auto Iter = Data.PerPartScalars.insert({Def, {}});
283     auto &PerPartVec = Iter.first->second;
284     while (PerPartVec.size() <= Instance.Part)
285       PerPartVec.emplace_back();
286     auto &Scalars = PerPartVec[Instance.Part];
287     unsigned CacheIdx = Instance.Lane.mapToCacheIndex(VF);
288     while (Scalars.size() <= CacheIdx)
289       Scalars.push_back(nullptr);
290     assert(!Scalars[CacheIdx] && "should overwrite existing value");
291     Scalars[CacheIdx] = V;
292   }
293 
294   /// Reset an existing scalar value for \p Def and a given \p Instance.
295   void reset(VPValue *Def, Value *V, const VPIteration &Instance) {
296     auto Iter = Data.PerPartScalars.find(Def);
297     assert(Iter != Data.PerPartScalars.end() &&
298            "need to overwrite existing value");
299     assert(Instance.Part < Iter->second.size() &&
300            "need to overwrite existing value");
301     unsigned CacheIdx = Instance.Lane.mapToCacheIndex(VF);
302     assert(CacheIdx < Iter->second[Instance.Part].size() &&
303            "need to overwrite existing value");
304     Iter->second[Instance.Part][CacheIdx] = V;
305   }
306 
307   /// Add additional metadata to \p To that was not present on \p Orig.
308   ///
309   /// Currently this is used to add the noalias annotations based on the
310   /// inserted memchecks.  Use this for instructions that are *cloned* into the
311   /// vector loop.
312   void addNewMetadata(Instruction *To, const Instruction *Orig);
313 
314   /// Add metadata from one instruction to another.
315   ///
316   /// This includes both the original MDs from \p From and additional ones (\see
317   /// addNewMetadata).  Use this for *newly created* instructions in the vector
318   /// loop.
319   void addMetadata(Instruction *To, Instruction *From);
320 
321   /// Similar to the previous function but it adds the metadata to a
322   /// vector of instructions.
323   void addMetadata(ArrayRef<Value *> To, Instruction *From);
324 
325   /// Set the debug location in the builder using the debug location in \p V.
326   void setDebugLocFromInst(const Value *V);
327 
328   /// Hold state information used when constructing the CFG of the output IR,
329   /// traversing the VPBasicBlocks and generating corresponding IR BasicBlocks.
330   struct CFGState {
331     /// The previous VPBasicBlock visited. Initially set to null.
332     VPBasicBlock *PrevVPBB = nullptr;
333 
334     /// The previous IR BasicBlock created or used. Initially set to the new
335     /// header BasicBlock.
336     BasicBlock *PrevBB = nullptr;
337 
338     /// The last IR BasicBlock in the output IR. Set to the exit block of the
339     /// vector loop.
340     BasicBlock *ExitBB = nullptr;
341 
342     /// A mapping of each VPBasicBlock to the corresponding BasicBlock. In case
343     /// of replication, maps the BasicBlock of the last replica created.
344     SmallDenseMap<VPBasicBlock *, BasicBlock *> VPBB2IRBB;
345 
346     CFGState() = default;
347 
348     /// Returns the BasicBlock* mapped to the pre-header of the loop region
349     /// containing \p R.
350     BasicBlock *getPreheaderBBFor(VPRecipeBase *R);
351   } CFG;
352 
353   /// Hold a pointer to LoopInfo to register new basic blocks in the loop.
354   LoopInfo *LI;
355 
356   /// Hold a pointer to Dominator Tree to register new basic blocks in the loop.
357   DominatorTree *DT;
358 
359   /// Hold a reference to the IRBuilder used to generate output IR code.
360   IRBuilderBase &Builder;
361 
362   VPValue2ValueTy VPValue2Value;
363 
364   /// Hold the canonical scalar IV of the vector loop (start=0, step=VF*UF).
365   Value *CanonicalIV = nullptr;
366 
367   /// Hold a pointer to InnerLoopVectorizer to reuse its IR generation methods.
368   InnerLoopVectorizer *ILV;
369 
370   /// Pointer to the VPlan code is generated for.
371   VPlan *Plan;
372 
373   /// Holds recipes that may generate a poison value that is used after
374   /// vectorization, even when their operands are not poison.
375   SmallPtrSet<VPRecipeBase *, 16> MayGeneratePoisonRecipes;
376 
377   /// The loop object for the current parent region, or nullptr.
378   Loop *CurrentVectorLoop = nullptr;
379 
380   /// LoopVersioning.  It's only set up (non-null) if memchecks were
381   /// used.
382   ///
383   /// This is currently only used to add no-alias metadata based on the
384   /// memchecks.  The actually versioning is performed manually.
385   std::unique_ptr<LoopVersioning> LVer;
386 };
387 
388 /// VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
389 /// A VPBlockBase can be either a VPBasicBlock or a VPRegionBlock.
390 class VPBlockBase {
391   friend class VPBlockUtils;
392 
393   const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast).
394 
395   /// An optional name for the block.
396   std::string Name;
397 
398   /// The immediate VPRegionBlock which this VPBlockBase belongs to, or null if
399   /// it is a topmost VPBlockBase.
400   VPRegionBlock *Parent = nullptr;
401 
402   /// List of predecessor blocks.
403   SmallVector<VPBlockBase *, 1> Predecessors;
404 
405   /// List of successor blocks.
406   SmallVector<VPBlockBase *, 1> Successors;
407 
408   /// VPlan containing the block. Can only be set on the entry block of the
409   /// plan.
410   VPlan *Plan = nullptr;
411 
412   /// Add \p Successor as the last successor to this block.
413   void appendSuccessor(VPBlockBase *Successor) {
414     assert(Successor && "Cannot add nullptr successor!");
415     Successors.push_back(Successor);
416   }
417 
418   /// Add \p Predecessor as the last predecessor to this block.
419   void appendPredecessor(VPBlockBase *Predecessor) {
420     assert(Predecessor && "Cannot add nullptr predecessor!");
421     Predecessors.push_back(Predecessor);
422   }
423 
424   /// Remove \p Predecessor from the predecessors of this block.
425   void removePredecessor(VPBlockBase *Predecessor) {
426     auto Pos = find(Predecessors, Predecessor);
427     assert(Pos && "Predecessor does not exist");
428     Predecessors.erase(Pos);
429   }
430 
431   /// Remove \p Successor from the successors of this block.
432   void removeSuccessor(VPBlockBase *Successor) {
433     auto Pos = find(Successors, Successor);
434     assert(Pos && "Successor does not exist");
435     Successors.erase(Pos);
436   }
437 
438 protected:
439   VPBlockBase(const unsigned char SC, const std::string &N)
440       : SubclassID(SC), Name(N) {}
441 
442 public:
443   /// An enumeration for keeping track of the concrete subclass of VPBlockBase
444   /// that are actually instantiated. Values of this enumeration are kept in the
445   /// SubclassID field of the VPBlockBase objects. They are used for concrete
446   /// type identification.
447   using VPBlockTy = enum { VPBasicBlockSC, VPRegionBlockSC };
448 
449   using VPBlocksTy = SmallVectorImpl<VPBlockBase *>;
450 
451   virtual ~VPBlockBase() = default;
452 
453   const std::string &getName() const { return Name; }
454 
455   void setName(const Twine &newName) { Name = newName.str(); }
456 
457   /// \return an ID for the concrete type of this object.
458   /// This is used to implement the classof checks. This should not be used
459   /// for any other purpose, as the values may change as LLVM evolves.
460   unsigned getVPBlockID() const { return SubclassID; }
461 
462   VPRegionBlock *getParent() { return Parent; }
463   const VPRegionBlock *getParent() const { return Parent; }
464 
465   /// \return A pointer to the plan containing the current block.
466   VPlan *getPlan();
467   const VPlan *getPlan() const;
468 
469   /// Sets the pointer of the plan containing the block. The block must be the
470   /// entry block into the VPlan.
471   void setPlan(VPlan *ParentPlan);
472 
473   void setParent(VPRegionBlock *P) { Parent = P; }
474 
475   /// \return the VPBasicBlock that is the entry of this VPBlockBase,
476   /// recursively, if the latter is a VPRegionBlock. Otherwise, if this
477   /// VPBlockBase is a VPBasicBlock, it is returned.
478   const VPBasicBlock *getEntryBasicBlock() const;
479   VPBasicBlock *getEntryBasicBlock();
480 
481   /// \return the VPBasicBlock that is the exiting this VPBlockBase,
482   /// recursively, if the latter is a VPRegionBlock. Otherwise, if this
483   /// VPBlockBase is a VPBasicBlock, it is returned.
484   const VPBasicBlock *getExitingBasicBlock() const;
485   VPBasicBlock *getExitingBasicBlock();
486 
487   const VPBlocksTy &getSuccessors() const { return Successors; }
488   VPBlocksTy &getSuccessors() { return Successors; }
489 
490   iterator_range<VPBlockBase **> successors() { return Successors; }
491 
492   const VPBlocksTy &getPredecessors() const { return Predecessors; }
493   VPBlocksTy &getPredecessors() { return Predecessors; }
494 
495   /// \return the successor of this VPBlockBase if it has a single successor.
496   /// Otherwise return a null pointer.
497   VPBlockBase *getSingleSuccessor() const {
498     return (Successors.size() == 1 ? *Successors.begin() : nullptr);
499   }
500 
501   /// \return the predecessor of this VPBlockBase if it has a single
502   /// predecessor. Otherwise return a null pointer.
503   VPBlockBase *getSinglePredecessor() const {
504     return (Predecessors.size() == 1 ? *Predecessors.begin() : nullptr);
505   }
506 
507   size_t getNumSuccessors() const { return Successors.size(); }
508   size_t getNumPredecessors() const { return Predecessors.size(); }
509 
510   /// An Enclosing Block of a block B is any block containing B, including B
511   /// itself. \return the closest enclosing block starting from "this", which
512   /// has successors. \return the root enclosing block if all enclosing blocks
513   /// have no successors.
514   VPBlockBase *getEnclosingBlockWithSuccessors();
515 
516   /// \return the closest enclosing block starting from "this", which has
517   /// predecessors. \return the root enclosing block if all enclosing blocks
518   /// have no predecessors.
519   VPBlockBase *getEnclosingBlockWithPredecessors();
520 
521   /// \return the successors either attached directly to this VPBlockBase or, if
522   /// this VPBlockBase is the exit block of a VPRegionBlock and has no
523   /// successors of its own, search recursively for the first enclosing
524   /// VPRegionBlock that has successors and return them. If no such
525   /// VPRegionBlock exists, return the (empty) successors of the topmost
526   /// VPBlockBase reached.
527   const VPBlocksTy &getHierarchicalSuccessors() {
528     return getEnclosingBlockWithSuccessors()->getSuccessors();
529   }
530 
531   /// \return the hierarchical successor of this VPBlockBase if it has a single
532   /// hierarchical successor. Otherwise return a null pointer.
533   VPBlockBase *getSingleHierarchicalSuccessor() {
534     return getEnclosingBlockWithSuccessors()->getSingleSuccessor();
535   }
536 
537   /// \return the predecessors either attached directly to this VPBlockBase or,
538   /// if this VPBlockBase is the entry block of a VPRegionBlock and has no
539   /// predecessors of its own, search recursively for the first enclosing
540   /// VPRegionBlock that has predecessors and return them. If no such
541   /// VPRegionBlock exists, return the (empty) predecessors of the topmost
542   /// VPBlockBase reached.
543   const VPBlocksTy &getHierarchicalPredecessors() {
544     return getEnclosingBlockWithPredecessors()->getPredecessors();
545   }
546 
547   /// \return the hierarchical predecessor of this VPBlockBase if it has a
548   /// single hierarchical predecessor. Otherwise return a null pointer.
549   VPBlockBase *getSingleHierarchicalPredecessor() {
550     return getEnclosingBlockWithPredecessors()->getSinglePredecessor();
551   }
552 
553   /// Set a given VPBlockBase \p Successor as the single successor of this
554   /// VPBlockBase. This VPBlockBase is not added as predecessor of \p Successor.
555   /// This VPBlockBase must have no successors.
556   void setOneSuccessor(VPBlockBase *Successor) {
557     assert(Successors.empty() && "Setting one successor when others exist.");
558     appendSuccessor(Successor);
559   }
560 
561   /// Set two given VPBlockBases \p IfTrue and \p IfFalse to be the two
562   /// successors of this VPBlockBase. This VPBlockBase is not added as
563   /// predecessor of \p IfTrue or \p IfFalse. This VPBlockBase must have no
564   /// successors.
565   void setTwoSuccessors(VPBlockBase *IfTrue, VPBlockBase *IfFalse) {
566     assert(Successors.empty() && "Setting two successors when others exist.");
567     appendSuccessor(IfTrue);
568     appendSuccessor(IfFalse);
569   }
570 
571   /// Set each VPBasicBlock in \p NewPreds as predecessor of this VPBlockBase.
572   /// This VPBlockBase must have no predecessors. This VPBlockBase is not added
573   /// as successor of any VPBasicBlock in \p NewPreds.
574   void setPredecessors(ArrayRef<VPBlockBase *> NewPreds) {
575     assert(Predecessors.empty() && "Block predecessors already set.");
576     for (auto *Pred : NewPreds)
577       appendPredecessor(Pred);
578   }
579 
580   /// Remove all the predecessor of this block.
581   void clearPredecessors() { Predecessors.clear(); }
582 
583   /// Remove all the successors of this block.
584   void clearSuccessors() { Successors.clear(); }
585 
586   /// The method which generates the output IR that correspond to this
587   /// VPBlockBase, thereby "executing" the VPlan.
588   virtual void execute(VPTransformState *State) = 0;
589 
590   /// Delete all blocks reachable from a given VPBlockBase, inclusive.
591   static void deleteCFG(VPBlockBase *Entry);
592 
593   /// Return true if it is legal to hoist instructions into this block.
594   bool isLegalToHoistInto() {
595     // There are currently no constraints that prevent an instruction to be
596     // hoisted into a VPBlockBase.
597     return true;
598   }
599 
600   /// Replace all operands of VPUsers in the block with \p NewValue and also
601   /// replaces all uses of VPValues defined in the block with NewValue.
602   virtual void dropAllReferences(VPValue *NewValue) = 0;
603 
604 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
605   void printAsOperand(raw_ostream &OS, bool PrintType) const {
606     OS << getName();
607   }
608 
609   /// Print plain-text dump of this VPBlockBase to \p O, prefixing all lines
610   /// with \p Indent. \p SlotTracker is used to print unnamed VPValue's using
611   /// consequtive numbers.
612   ///
613   /// Note that the numbering is applied to the whole VPlan, so printing
614   /// individual blocks is consistent with the whole VPlan printing.
615   virtual void print(raw_ostream &O, const Twine &Indent,
616                      VPSlotTracker &SlotTracker) const = 0;
617 
618   /// Print plain-text dump of this VPlan to \p O.
619   void print(raw_ostream &O) const {
620     VPSlotTracker SlotTracker(getPlan());
621     print(O, "", SlotTracker);
622   }
623 
624   /// Print the successors of this block to \p O, prefixing all lines with \p
625   /// Indent.
626   void printSuccessors(raw_ostream &O, const Twine &Indent) const;
627 
628   /// Dump this VPBlockBase to dbgs().
629   LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
630 #endif
631 };
632 
633 /// A value that is used outside the VPlan. The operand of the user needs to be
634 /// added to the associated LCSSA phi node.
635 class VPLiveOut : public VPUser {
636   PHINode *Phi;
637 
638 public:
639   VPLiveOut(PHINode *Phi, VPValue *Op)
640       : VPUser({Op}, VPUser::VPUserID::LiveOut), Phi(Phi) {}
641 
642   /// Fixup the wrapped LCSSA phi node in the unique exit block.  This simply
643   /// means we need to add the appropriate incoming value from the middle
644   /// block as exiting edges from the scalar epilogue loop (if present) are
645   /// already in place, and we exit the vector loop exclusively to the middle
646   /// block.
647   void fixPhi(VPlan &Plan, VPTransformState &State);
648 
649   /// Returns true if the VPLiveOut uses scalars of operand \p Op.
650   bool usesScalars(const VPValue *Op) const override {
651     assert(is_contained(operands(), Op) &&
652            "Op must be an operand of the recipe");
653     return true;
654   }
655 
656   PHINode *getPhi() const { return Phi; }
657 };
658 
659 /// VPRecipeBase is a base class modeling a sequence of one or more output IR
660 /// instructions. VPRecipeBase owns the the VPValues it defines through VPDef
661 /// and is responsible for deleting its defined values. Single-value
662 /// VPRecipeBases that also inherit from VPValue must make sure to inherit from
663 /// VPRecipeBase before VPValue.
664 class VPRecipeBase : public ilist_node_with_parent<VPRecipeBase, VPBasicBlock>,
665                      public VPDef,
666                      public VPUser {
667   friend VPBasicBlock;
668   friend class VPBlockUtils;
669 
670   /// Each VPRecipe belongs to a single VPBasicBlock.
671   VPBasicBlock *Parent = nullptr;
672 
673 public:
674   VPRecipeBase(const unsigned char SC, ArrayRef<VPValue *> Operands)
675       : VPDef(SC), VPUser(Operands, VPUser::VPUserID::Recipe) {}
676 
677   template <typename IterT>
678   VPRecipeBase(const unsigned char SC, iterator_range<IterT> Operands)
679       : VPDef(SC), VPUser(Operands, VPUser::VPUserID::Recipe) {}
680   virtual ~VPRecipeBase() = default;
681 
682   /// \return the VPBasicBlock which this VPRecipe belongs to.
683   VPBasicBlock *getParent() { return Parent; }
684   const VPBasicBlock *getParent() const { return Parent; }
685 
686   /// The method which generates the output IR instructions that correspond to
687   /// this VPRecipe, thereby "executing" the VPlan.
688   virtual void execute(VPTransformState &State) = 0;
689 
690   /// Insert an unlinked recipe into a basic block immediately before
691   /// the specified recipe.
692   void insertBefore(VPRecipeBase *InsertPos);
693   /// Insert an unlinked recipe into \p BB immediately before the insertion
694   /// point \p IP;
695   void insertBefore(VPBasicBlock &BB, iplist<VPRecipeBase>::iterator IP);
696 
697   /// Insert an unlinked Recipe into a basic block immediately after
698   /// the specified Recipe.
699   void insertAfter(VPRecipeBase *InsertPos);
700 
701   /// Unlink this recipe from its current VPBasicBlock and insert it into
702   /// the VPBasicBlock that MovePos lives in, right after MovePos.
703   void moveAfter(VPRecipeBase *MovePos);
704 
705   /// Unlink this recipe and insert into BB before I.
706   ///
707   /// \pre I is a valid iterator into BB.
708   void moveBefore(VPBasicBlock &BB, iplist<VPRecipeBase>::iterator I);
709 
710   /// This method unlinks 'this' from the containing basic block, but does not
711   /// delete it.
712   void removeFromParent();
713 
714   /// This method unlinks 'this' from the containing basic block and deletes it.
715   ///
716   /// \returns an iterator pointing to the element after the erased one
717   iplist<VPRecipeBase>::iterator eraseFromParent();
718 
719   /// Returns the underlying instruction, if the recipe is a VPValue or nullptr
720   /// otherwise.
721   Instruction *getUnderlyingInstr() {
722     return cast<Instruction>(getVPSingleValue()->getUnderlyingValue());
723   }
724   const Instruction *getUnderlyingInstr() const {
725     return cast<Instruction>(getVPSingleValue()->getUnderlyingValue());
726   }
727 
728   /// Method to support type inquiry through isa, cast, and dyn_cast.
729   static inline bool classof(const VPDef *D) {
730     // All VPDefs are also VPRecipeBases.
731     return true;
732   }
733 
734   static inline bool classof(const VPUser *U) {
735     return U->getVPUserID() == VPUser::VPUserID::Recipe;
736   }
737 
738   /// Returns true if the recipe may have side-effects.
739   bool mayHaveSideEffects() const;
740 
741   /// Returns true for PHI-like recipes.
742   bool isPhi() const {
743     return getVPDefID() >= VPFirstPHISC && getVPDefID() <= VPLastPHISC;
744   }
745 
746   /// Returns true if the recipe may read from memory.
747   bool mayReadFromMemory() const;
748 
749   /// Returns true if the recipe may write to memory.
750   bool mayWriteToMemory() const;
751 
752   /// Returns true if the recipe may read from or write to memory.
753   bool mayReadOrWriteMemory() const {
754     return mayReadFromMemory() || mayWriteToMemory();
755   }
756 };
757 
758 // Helper macro to define common classof implementations for recipes.
759 #define VP_CLASSOF_IMPL(VPDefID)                                               \
760   static inline bool classof(const VPDef *D) {                                 \
761     return D->getVPDefID() == VPDefID;                                         \
762   }                                                                            \
763   static inline bool classof(const VPValue *V) {                               \
764     auto *R = V->getDefiningRecipe();                                          \
765     return R && R->getVPDefID() == VPDefID;                                    \
766   }                                                                            \
767   static inline bool classof(const VPUser *U) {                                \
768     auto *R = dyn_cast<VPRecipeBase>(U);                                       \
769     return R && R->getVPDefID() == VPDefID;                                    \
770   }                                                                            \
771   static inline bool classof(const VPRecipeBase *R) {                          \
772     return R->getVPDefID() == VPDefID;                                         \
773   }
774 
775 /// This is a concrete Recipe that models a single VPlan-level instruction.
776 /// While as any Recipe it may generate a sequence of IR instructions when
777 /// executed, these instructions would always form a single-def expression as
778 /// the VPInstruction is also a single def-use vertex.
779 class VPInstruction : public VPRecipeBase, public VPValue {
780   friend class VPlanSlp;
781 
782 public:
783   /// VPlan opcodes, extending LLVM IR with idiomatics instructions.
784   enum {
785     FirstOrderRecurrenceSplice =
786         Instruction::OtherOpsEnd + 1, // Combines the incoming and previous
787                                       // values of a first-order recurrence.
788     Not,
789     ICmpULE,
790     SLPLoad,
791     SLPStore,
792     ActiveLaneMask,
793     CanonicalIVIncrement,
794     CanonicalIVIncrementNUW,
795     // The next two are similar to the above, but instead increment the
796     // canonical IV separately for each unrolled part.
797     CanonicalIVIncrementForPart,
798     CanonicalIVIncrementForPartNUW,
799     BranchOnCount,
800     BranchOnCond
801   };
802 
803 private:
804   typedef unsigned char OpcodeTy;
805   OpcodeTy Opcode;
806   FastMathFlags FMF;
807   DebugLoc DL;
808 
809   /// An optional name that can be used for the generated IR instruction.
810   const std::string Name;
811 
812   /// Utility method serving execute(): generates a single instance of the
813   /// modeled instruction.
814   void generateInstruction(VPTransformState &State, unsigned Part);
815 
816 protected:
817   void setUnderlyingInstr(Instruction *I) { setUnderlyingValue(I); }
818 
819 public:
820   VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands, DebugLoc DL,
821                 const Twine &Name = "")
822       : VPRecipeBase(VPDef::VPInstructionSC, Operands), VPValue(this),
823         Opcode(Opcode), DL(DL), Name(Name.str()) {}
824 
825   VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands,
826                 DebugLoc DL = {}, const Twine &Name = "")
827       : VPInstruction(Opcode, ArrayRef<VPValue *>(Operands), DL, Name) {}
828 
829   VP_CLASSOF_IMPL(VPDef::VPInstructionSC)
830 
831   VPInstruction *clone() const {
832     SmallVector<VPValue *, 2> Operands(operands());
833     return new VPInstruction(Opcode, Operands, DL, Name);
834   }
835 
836   unsigned getOpcode() const { return Opcode; }
837 
838   /// Generate the instruction.
839   /// TODO: We currently execute only per-part unless a specific instance is
840   /// provided.
841   void execute(VPTransformState &State) override;
842 
843 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
844   /// Print the VPInstruction to \p O.
845   void print(raw_ostream &O, const Twine &Indent,
846              VPSlotTracker &SlotTracker) const override;
847 
848   /// Print the VPInstruction to dbgs() (for debugging).
849   LLVM_DUMP_METHOD void dump() const;
850 #endif
851 
852   /// Return true if this instruction may modify memory.
853   bool mayWriteToMemory() const {
854     // TODO: we can use attributes of the called function to rule out memory
855     //       modifications.
856     return Opcode == Instruction::Store || Opcode == Instruction::Call ||
857            Opcode == Instruction::Invoke || Opcode == SLPStore;
858   }
859 
860   bool hasResult() const {
861     // CallInst may or may not have a result, depending on the called function.
862     // Conservatively return calls have results for now.
863     switch (getOpcode()) {
864     case Instruction::Ret:
865     case Instruction::Br:
866     case Instruction::Store:
867     case Instruction::Switch:
868     case Instruction::IndirectBr:
869     case Instruction::Resume:
870     case Instruction::CatchRet:
871     case Instruction::Unreachable:
872     case Instruction::Fence:
873     case Instruction::AtomicRMW:
874     case VPInstruction::BranchOnCond:
875     case VPInstruction::BranchOnCount:
876       return false;
877     default:
878       return true;
879     }
880   }
881 
882   /// Set the fast-math flags.
883   void setFastMathFlags(FastMathFlags FMFNew);
884 
885   /// Returns true if the recipe only uses the first lane of operand \p Op.
886   bool onlyFirstLaneUsed(const VPValue *Op) const override {
887     assert(is_contained(operands(), Op) &&
888            "Op must be an operand of the recipe");
889     if (getOperand(0) != Op)
890       return false;
891     switch (getOpcode()) {
892     default:
893       return false;
894     case VPInstruction::ActiveLaneMask:
895     case VPInstruction::CanonicalIVIncrement:
896     case VPInstruction::CanonicalIVIncrementNUW:
897     case VPInstruction::CanonicalIVIncrementForPart:
898     case VPInstruction::CanonicalIVIncrementForPartNUW:
899     case VPInstruction::BranchOnCount:
900       return true;
901     };
902     llvm_unreachable("switch should return");
903   }
904 };
905 
906 /// VPWidenRecipe is a recipe for producing a copy of vector type its
907 /// ingredient. This recipe covers most of the traditional vectorization cases
908 /// where each ingredient transforms into a vectorized version of itself.
909 class VPWidenRecipe : public VPRecipeBase, public VPValue {
910 public:
911   template <typename IterT>
912   VPWidenRecipe(Instruction &I, iterator_range<IterT> Operands)
913       : VPRecipeBase(VPDef::VPWidenSC, Operands), VPValue(this, &I) {}
914 
915   ~VPWidenRecipe() override = default;
916 
917   VP_CLASSOF_IMPL(VPDef::VPWidenSC)
918 
919   /// Produce widened copies of all Ingredients.
920   void execute(VPTransformState &State) override;
921 
922 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
923   /// Print the recipe.
924   void print(raw_ostream &O, const Twine &Indent,
925              VPSlotTracker &SlotTracker) const override;
926 #endif
927 };
928 
929 /// A recipe for widening Call instructions.
930 class VPWidenCallRecipe : public VPRecipeBase, public VPValue {
931   /// ID of the vector intrinsic to call when widening the call. If set the
932   /// Intrinsic::not_intrinsic, a library call will be used instead.
933   Intrinsic::ID VectorIntrinsicID;
934 
935 public:
936   template <typename IterT>
937   VPWidenCallRecipe(CallInst &I, iterator_range<IterT> CallArguments,
938                     Intrinsic::ID VectorIntrinsicID)
939       : VPRecipeBase(VPDef::VPWidenCallSC, CallArguments), VPValue(this, &I),
940         VectorIntrinsicID(VectorIntrinsicID) {}
941 
942   ~VPWidenCallRecipe() override = default;
943 
944   VP_CLASSOF_IMPL(VPDef::VPWidenCallSC)
945 
946   /// Produce a widened version of the call instruction.
947   void execute(VPTransformState &State) override;
948 
949 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
950   /// Print the recipe.
951   void print(raw_ostream &O, const Twine &Indent,
952              VPSlotTracker &SlotTracker) const override;
953 #endif
954 };
955 
956 /// A recipe for widening select instructions.
957 class VPWidenSelectRecipe : public VPRecipeBase, public VPValue {
958 
959   /// Is the condition of the select loop invariant?
960   bool InvariantCond;
961 
962 public:
963   template <typename IterT>
964   VPWidenSelectRecipe(SelectInst &I, iterator_range<IterT> Operands,
965                       bool InvariantCond)
966       : VPRecipeBase(VPDef::VPWidenSelectSC, Operands), VPValue(this, &I),
967         InvariantCond(InvariantCond) {}
968 
969   ~VPWidenSelectRecipe() override = default;
970 
971   VP_CLASSOF_IMPL(VPDef::VPWidenSelectSC)
972 
973   /// Produce a widened version of the select instruction.
974   void execute(VPTransformState &State) override;
975 
976 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
977   /// Print the recipe.
978   void print(raw_ostream &O, const Twine &Indent,
979              VPSlotTracker &SlotTracker) const override;
980 #endif
981 };
982 
983 /// A recipe for handling GEP instructions.
984 class VPWidenGEPRecipe : public VPRecipeBase, public VPValue {
985   bool IsPtrLoopInvariant;
986   SmallBitVector IsIndexLoopInvariant;
987 
988 public:
989   template <typename IterT>
990   VPWidenGEPRecipe(GetElementPtrInst *GEP, iterator_range<IterT> Operands)
991       : VPRecipeBase(VPDef::VPWidenGEPSC, Operands), VPValue(this, GEP),
992         IsIndexLoopInvariant(GEP->getNumIndices(), false) {}
993 
994   template <typename IterT>
995   VPWidenGEPRecipe(GetElementPtrInst *GEP, iterator_range<IterT> Operands,
996                    Loop *OrigLoop)
997       : VPRecipeBase(VPDef::VPWidenGEPSC, Operands), VPValue(this, GEP),
998         IsIndexLoopInvariant(GEP->getNumIndices(), false) {
999     IsPtrLoopInvariant = OrigLoop->isLoopInvariant(GEP->getPointerOperand());
1000     for (auto Index : enumerate(GEP->indices()))
1001       IsIndexLoopInvariant[Index.index()] =
1002           OrigLoop->isLoopInvariant(Index.value().get());
1003   }
1004   ~VPWidenGEPRecipe() override = default;
1005 
1006   VP_CLASSOF_IMPL(VPDef::VPWidenGEPSC)
1007 
1008   /// Generate the gep nodes.
1009   void execute(VPTransformState &State) override;
1010 
1011 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1012   /// Print the recipe.
1013   void print(raw_ostream &O, const Twine &Indent,
1014              VPSlotTracker &SlotTracker) const override;
1015 #endif
1016 };
1017 
1018 /// A recipe for handling phi nodes of integer and floating-point inductions,
1019 /// producing their vector values.
1020 class VPWidenIntOrFpInductionRecipe : public VPRecipeBase, public VPValue {
1021   PHINode *IV;
1022   const InductionDescriptor &IndDesc;
1023   bool NeedsVectorIV;
1024 
1025 public:
1026   VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, VPValue *Step,
1027                                 const InductionDescriptor &IndDesc,
1028                                 bool NeedsVectorIV)
1029       : VPRecipeBase(VPDef::VPWidenIntOrFpInductionSC, {Start, Step}),
1030         VPValue(this, IV), IV(IV), IndDesc(IndDesc),
1031         NeedsVectorIV(NeedsVectorIV) {}
1032 
1033   VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start, VPValue *Step,
1034                                 const InductionDescriptor &IndDesc,
1035                                 TruncInst *Trunc, bool NeedsVectorIV)
1036       : VPRecipeBase(VPDef::VPWidenIntOrFpInductionSC, {Start, Step}),
1037         VPValue(this, Trunc), IV(IV), IndDesc(IndDesc),
1038         NeedsVectorIV(NeedsVectorIV) {}
1039 
1040   ~VPWidenIntOrFpInductionRecipe() override = default;
1041 
1042   VP_CLASSOF_IMPL(VPDef::VPWidenIntOrFpInductionSC)
1043 
1044   /// Generate the vectorized and scalarized versions of the phi node as
1045   /// needed by their users.
1046   void execute(VPTransformState &State) override;
1047 
1048 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1049   /// Print the recipe.
1050   void print(raw_ostream &O, const Twine &Indent,
1051              VPSlotTracker &SlotTracker) const override;
1052 #endif
1053 
1054   /// Returns the start value of the induction.
1055   VPValue *getStartValue() { return getOperand(0); }
1056   const VPValue *getStartValue() const { return getOperand(0); }
1057 
1058   /// Returns the step value of the induction.
1059   VPValue *getStepValue() { return getOperand(1); }
1060   const VPValue *getStepValue() const { return getOperand(1); }
1061 
1062   /// Returns the first defined value as TruncInst, if it is one or nullptr
1063   /// otherwise.
1064   TruncInst *getTruncInst() {
1065     return dyn_cast_or_null<TruncInst>(getVPValue(0)->getUnderlyingValue());
1066   }
1067   const TruncInst *getTruncInst() const {
1068     return dyn_cast_or_null<TruncInst>(getVPValue(0)->getUnderlyingValue());
1069   }
1070 
1071   PHINode *getPHINode() { return IV; }
1072 
1073   /// Returns the induction descriptor for the recipe.
1074   const InductionDescriptor &getInductionDescriptor() const { return IndDesc; }
1075 
1076   /// Returns true if the induction is canonical, i.e. starting at 0 and
1077   /// incremented by UF * VF (= the original IV is incremented by 1).
1078   bool isCanonical() const;
1079 
1080   /// Returns the scalar type of the induction.
1081   const Type *getScalarType() const {
1082     const TruncInst *TruncI = getTruncInst();
1083     return TruncI ? TruncI->getType() : IV->getType();
1084   }
1085 
1086   /// Returns true if a vector phi needs to be created for the induction.
1087   bool needsVectorIV() const { return NeedsVectorIV; }
1088 };
1089 
1090 /// A pure virtual base class for all recipes modeling header phis, including
1091 /// phis for first order recurrences, pointer inductions and reductions. The
1092 /// start value is the first operand of the recipe and the incoming value from
1093 /// the backedge is the second operand.
1094 ///
1095 /// Inductions are modeled using the following sub-classes:
1096 ///  * VPCanonicalIVPHIRecipe: Canonical scalar induction of the vector loop,
1097 ///    starting at a specified value (zero for the main vector loop, the resume
1098 ///    value for the epilogue vector loop) and stepping by 1. The induction
1099 ///    controls exiting of the vector loop by comparing against the vector trip
1100 ///    count. Produces a single scalar PHI for the induction value per
1101 ///    iteration.
1102 ///  * VPWidenIntOrFpInductionRecipe: Generates vector values for integer and
1103 ///    floating point inductions with arbitrary start and step values. Produces
1104 ///    a vector PHI per-part.
1105 ///  * VPDerivedIVRecipe: Converts the canonical IV value to the corresponding
1106 ///    value of an IV with different start and step values. Produces a single
1107 ///    scalar value per iteration
1108 ///  * VPScalarIVStepsRecipe: Generates scalar values per-lane based on a
1109 ///    canonical or derived induction.
1110 ///  * VPWidenPointerInductionRecipe: Generate vector and scalar values for a
1111 ///    pointer induction. Produces either a vector PHI per-part or scalar values
1112 ///    per-lane based on the canonical induction.
1113 class VPHeaderPHIRecipe : public VPRecipeBase, public VPValue {
1114 protected:
1115   VPHeaderPHIRecipe(unsigned char VPDefID, PHINode *Phi,
1116                     VPValue *Start = nullptr)
1117       : VPRecipeBase(VPDefID, {}), VPValue(this, Phi) {
1118     if (Start)
1119       addOperand(Start);
1120   }
1121 
1122 public:
1123   ~VPHeaderPHIRecipe() override = default;
1124 
1125   /// Method to support type inquiry through isa, cast, and dyn_cast.
1126   static inline bool classof(const VPRecipeBase *B) {
1127     return B->getVPDefID() >= VPDef::VPFirstHeaderPHISC &&
1128            B->getVPDefID() <= VPDef::VPLastPHISC;
1129   }
1130   static inline bool classof(const VPValue *V) {
1131     auto *B = V->getDefiningRecipe();
1132     return B && B->getVPDefID() >= VPRecipeBase::VPFirstHeaderPHISC &&
1133            B->getVPDefID() <= VPRecipeBase::VPLastPHISC;
1134   }
1135 
1136   /// Generate the phi nodes.
1137   void execute(VPTransformState &State) override = 0;
1138 
1139 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1140   /// Print the recipe.
1141   void print(raw_ostream &O, const Twine &Indent,
1142              VPSlotTracker &SlotTracker) const override = 0;
1143 #endif
1144 
1145   /// Returns the start value of the phi, if one is set.
1146   VPValue *getStartValue() {
1147     return getNumOperands() == 0 ? nullptr : getOperand(0);
1148   }
1149   VPValue *getStartValue() const {
1150     return getNumOperands() == 0 ? nullptr : getOperand(0);
1151   }
1152 
1153   /// Update the start value of the recipe.
1154   void setStartValue(VPValue *V) { setOperand(0, V); }
1155 
1156   /// Returns the incoming value from the loop backedge.
1157   VPValue *getBackedgeValue() {
1158     return getOperand(1);
1159   }
1160 
1161   /// Returns the backedge value as a recipe. The backedge value is guaranteed
1162   /// to be a recipe.
1163   VPRecipeBase &getBackedgeRecipe() {
1164     return *getBackedgeValue()->getDefiningRecipe();
1165   }
1166 };
1167 
1168 class VPWidenPointerInductionRecipe : public VPHeaderPHIRecipe {
1169   const InductionDescriptor &IndDesc;
1170 
1171   bool IsScalarAfterVectorization;
1172 
1173 public:
1174   /// Create a new VPWidenPointerInductionRecipe for \p Phi with start value \p
1175   /// Start.
1176   VPWidenPointerInductionRecipe(PHINode *Phi, VPValue *Start, VPValue *Step,
1177                                 const InductionDescriptor &IndDesc,
1178                                 bool IsScalarAfterVectorization)
1179       : VPHeaderPHIRecipe(VPDef::VPWidenPointerInductionSC, Phi),
1180         IndDesc(IndDesc),
1181         IsScalarAfterVectorization(IsScalarAfterVectorization) {
1182     addOperand(Start);
1183     addOperand(Step);
1184   }
1185 
1186   ~VPWidenPointerInductionRecipe() override = default;
1187 
1188   VP_CLASSOF_IMPL(VPDef::VPWidenPointerInductionSC)
1189 
1190   /// Generate vector values for the pointer induction.
1191   void execute(VPTransformState &State) override;
1192 
1193   /// Returns true if only scalar values will be generated.
1194   bool onlyScalarsGenerated(ElementCount VF);
1195 
1196   /// Returns the induction descriptor for the recipe.
1197   const InductionDescriptor &getInductionDescriptor() const { return IndDesc; }
1198 
1199 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1200   /// Print the recipe.
1201   void print(raw_ostream &O, const Twine &Indent,
1202              VPSlotTracker &SlotTracker) const override;
1203 #endif
1204 };
1205 
1206 /// A recipe for handling header phis that are widened in the vector loop.
1207 /// In the VPlan native path, all incoming VPValues & VPBasicBlock pairs are
1208 /// managed in the recipe directly.
1209 class VPWidenPHIRecipe : public VPHeaderPHIRecipe {
1210   /// List of incoming blocks. Only used in the VPlan native path.
1211   SmallVector<VPBasicBlock *, 2> IncomingBlocks;
1212 
1213 public:
1214   /// Create a new VPWidenPHIRecipe for \p Phi with start value \p Start.
1215   VPWidenPHIRecipe(PHINode *Phi, VPValue *Start = nullptr)
1216       : VPHeaderPHIRecipe(VPDef::VPWidenPHISC, Phi) {
1217     if (Start)
1218       addOperand(Start);
1219   }
1220 
1221   ~VPWidenPHIRecipe() override = default;
1222 
1223   VP_CLASSOF_IMPL(VPDef::VPWidenPHISC)
1224 
1225   /// Generate the phi/select nodes.
1226   void execute(VPTransformState &State) override;
1227 
1228 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1229   /// Print the recipe.
1230   void print(raw_ostream &O, const Twine &Indent,
1231              VPSlotTracker &SlotTracker) const override;
1232 #endif
1233 
1234   /// Adds a pair (\p IncomingV, \p IncomingBlock) to the phi.
1235   void addIncoming(VPValue *IncomingV, VPBasicBlock *IncomingBlock) {
1236     addOperand(IncomingV);
1237     IncomingBlocks.push_back(IncomingBlock);
1238   }
1239 
1240   /// Returns the \p I th incoming VPBasicBlock.
1241   VPBasicBlock *getIncomingBlock(unsigned I) { return IncomingBlocks[I]; }
1242 
1243   /// Returns the \p I th incoming VPValue.
1244   VPValue *getIncomingValue(unsigned I) { return getOperand(I); }
1245 };
1246 
1247 /// A recipe for handling first-order recurrence phis. The start value is the
1248 /// first operand of the recipe and the incoming value from the backedge is the
1249 /// second operand.
1250 struct VPFirstOrderRecurrencePHIRecipe : public VPHeaderPHIRecipe {
1251   VPFirstOrderRecurrencePHIRecipe(PHINode *Phi, VPValue &Start)
1252       : VPHeaderPHIRecipe(VPDef::VPFirstOrderRecurrencePHISC, Phi, &Start) {}
1253 
1254   VP_CLASSOF_IMPL(VPDef::VPFirstOrderRecurrencePHISC)
1255 
1256   static inline bool classof(const VPHeaderPHIRecipe *R) {
1257     return R->getVPDefID() == VPDef::VPFirstOrderRecurrencePHISC;
1258   }
1259 
1260   void execute(VPTransformState &State) override;
1261 
1262 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1263   /// Print the recipe.
1264   void print(raw_ostream &O, const Twine &Indent,
1265              VPSlotTracker &SlotTracker) const override;
1266 #endif
1267 };
1268 
1269 /// A recipe for handling reduction phis. The start value is the first operand
1270 /// of the recipe and the incoming value from the backedge is the second
1271 /// operand.
1272 class VPReductionPHIRecipe : public VPHeaderPHIRecipe {
1273   /// Descriptor for the reduction.
1274   const RecurrenceDescriptor &RdxDesc;
1275 
1276   /// The phi is part of an in-loop reduction.
1277   bool IsInLoop;
1278 
1279   /// The phi is part of an ordered reduction. Requires IsInLoop to be true.
1280   bool IsOrdered;
1281 
1282 public:
1283   /// Create a new VPReductionPHIRecipe for the reduction \p Phi described by \p
1284   /// RdxDesc.
1285   VPReductionPHIRecipe(PHINode *Phi, const RecurrenceDescriptor &RdxDesc,
1286                        VPValue &Start, bool IsInLoop = false,
1287                        bool IsOrdered = false)
1288       : VPHeaderPHIRecipe(VPDef::VPReductionPHISC, Phi, &Start),
1289         RdxDesc(RdxDesc), IsInLoop(IsInLoop), IsOrdered(IsOrdered) {
1290     assert((!IsOrdered || IsInLoop) && "IsOrdered requires IsInLoop");
1291   }
1292 
1293   ~VPReductionPHIRecipe() override = default;
1294 
1295   VP_CLASSOF_IMPL(VPDef::VPReductionPHISC)
1296 
1297   static inline bool classof(const VPHeaderPHIRecipe *R) {
1298     return R->getVPDefID() == VPDef::VPReductionPHISC;
1299   }
1300 
1301   /// Generate the phi/select nodes.
1302   void execute(VPTransformState &State) override;
1303 
1304 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1305   /// Print the recipe.
1306   void print(raw_ostream &O, const Twine &Indent,
1307              VPSlotTracker &SlotTracker) const override;
1308 #endif
1309 
1310   const RecurrenceDescriptor &getRecurrenceDescriptor() const {
1311     return RdxDesc;
1312   }
1313 
1314   /// Returns true, if the phi is part of an ordered reduction.
1315   bool isOrdered() const { return IsOrdered; }
1316 
1317   /// Returns true, if the phi is part of an in-loop reduction.
1318   bool isInLoop() const { return IsInLoop; }
1319 };
1320 
1321 /// A recipe for vectorizing a phi-node as a sequence of mask-based select
1322 /// instructions.
1323 class VPBlendRecipe : public VPRecipeBase, public VPValue {
1324   PHINode *Phi;
1325 
1326 public:
1327   /// The blend operation is a User of the incoming values and of their
1328   /// respective masks, ordered [I0, M0, I1, M1, ...]. Note that a single value
1329   /// might be incoming with a full mask for which there is no VPValue.
1330   VPBlendRecipe(PHINode *Phi, ArrayRef<VPValue *> Operands)
1331       : VPRecipeBase(VPDef::VPBlendSC, Operands), VPValue(this, Phi), Phi(Phi) {
1332     assert(Operands.size() > 0 &&
1333            ((Operands.size() == 1) || (Operands.size() % 2 == 0)) &&
1334            "Expected either a single incoming value or a positive even number "
1335            "of operands");
1336   }
1337 
1338   VP_CLASSOF_IMPL(VPDef::VPBlendSC)
1339 
1340   /// Return the number of incoming values, taking into account that a single
1341   /// incoming value has no mask.
1342   unsigned getNumIncomingValues() const { return (getNumOperands() + 1) / 2; }
1343 
1344   /// Return incoming value number \p Idx.
1345   VPValue *getIncomingValue(unsigned Idx) const { return getOperand(Idx * 2); }
1346 
1347   /// Return mask number \p Idx.
1348   VPValue *getMask(unsigned Idx) const { return getOperand(Idx * 2 + 1); }
1349 
1350   /// Generate the phi/select nodes.
1351   void execute(VPTransformState &State) override;
1352 
1353 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1354   /// Print the recipe.
1355   void print(raw_ostream &O, const Twine &Indent,
1356              VPSlotTracker &SlotTracker) const override;
1357 #endif
1358 
1359   /// Returns true if the recipe only uses the first lane of operand \p Op.
1360   bool onlyFirstLaneUsed(const VPValue *Op) const override {
1361     assert(is_contained(operands(), Op) &&
1362            "Op must be an operand of the recipe");
1363     // Recursing through Blend recipes only, must terminate at header phi's the
1364     // latest.
1365     return all_of(users(),
1366                   [this](VPUser *U) { return U->onlyFirstLaneUsed(this); });
1367   }
1368 };
1369 
1370 /// VPInterleaveRecipe is a recipe for transforming an interleave group of load
1371 /// or stores into one wide load/store and shuffles. The first operand of a
1372 /// VPInterleave recipe is the address, followed by the stored values, followed
1373 /// by an optional mask.
1374 class VPInterleaveRecipe : public VPRecipeBase {
1375   const InterleaveGroup<Instruction> *IG;
1376 
1377   bool HasMask = false;
1378 
1379 public:
1380   VPInterleaveRecipe(const InterleaveGroup<Instruction> *IG, VPValue *Addr,
1381                      ArrayRef<VPValue *> StoredValues, VPValue *Mask)
1382       : VPRecipeBase(VPDef::VPInterleaveSC, {Addr}), IG(IG) {
1383     for (unsigned i = 0; i < IG->getFactor(); ++i)
1384       if (Instruction *I = IG->getMember(i)) {
1385         if (I->getType()->isVoidTy())
1386           continue;
1387         new VPValue(I, this);
1388       }
1389 
1390     for (auto *SV : StoredValues)
1391       addOperand(SV);
1392     if (Mask) {
1393       HasMask = true;
1394       addOperand(Mask);
1395     }
1396   }
1397   ~VPInterleaveRecipe() override = default;
1398 
1399   VP_CLASSOF_IMPL(VPDef::VPInterleaveSC)
1400 
1401   /// Return the address accessed by this recipe.
1402   VPValue *getAddr() const {
1403     return getOperand(0); // Address is the 1st, mandatory operand.
1404   }
1405 
1406   /// Return the mask used by this recipe. Note that a full mask is represented
1407   /// by a nullptr.
1408   VPValue *getMask() const {
1409     // Mask is optional and therefore the last, currently 2nd operand.
1410     return HasMask ? getOperand(getNumOperands() - 1) : nullptr;
1411   }
1412 
1413   /// Return the VPValues stored by this interleave group. If it is a load
1414   /// interleave group, return an empty ArrayRef.
1415   ArrayRef<VPValue *> getStoredValues() const {
1416     // The first operand is the address, followed by the stored values, followed
1417     // by an optional mask.
1418     return ArrayRef<VPValue *>(op_begin(), getNumOperands())
1419         .slice(1, getNumStoreOperands());
1420   }
1421 
1422   /// Generate the wide load or store, and shuffles.
1423   void execute(VPTransformState &State) override;
1424 
1425 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1426   /// Print the recipe.
1427   void print(raw_ostream &O, const Twine &Indent,
1428              VPSlotTracker &SlotTracker) const override;
1429 #endif
1430 
1431   const InterleaveGroup<Instruction> *getInterleaveGroup() { return IG; }
1432 
1433   /// Returns the number of stored operands of this interleave group. Returns 0
1434   /// for load interleave groups.
1435   unsigned getNumStoreOperands() const {
1436     return getNumOperands() - (HasMask ? 2 : 1);
1437   }
1438 
1439   /// The recipe only uses the first lane of the address.
1440   bool onlyFirstLaneUsed(const VPValue *Op) const override {
1441     assert(is_contained(operands(), Op) &&
1442            "Op must be an operand of the recipe");
1443     return Op == getAddr() && !llvm::is_contained(getStoredValues(), Op);
1444   }
1445 };
1446 
1447 /// A recipe to represent inloop reduction operations, performing a reduction on
1448 /// a vector operand into a scalar value, and adding the result to a chain.
1449 /// The Operands are {ChainOp, VecOp, [Condition]}.
1450 class VPReductionRecipe : public VPRecipeBase, public VPValue {
1451   /// The recurrence decriptor for the reduction in question.
1452   const RecurrenceDescriptor *RdxDesc;
1453   /// Pointer to the TTI, needed to create the target reduction
1454   const TargetTransformInfo *TTI;
1455 
1456 public:
1457   VPReductionRecipe(const RecurrenceDescriptor *R, Instruction *I,
1458                     VPValue *ChainOp, VPValue *VecOp, VPValue *CondOp,
1459                     const TargetTransformInfo *TTI)
1460       : VPRecipeBase(VPDef::VPReductionSC, {ChainOp, VecOp}), VPValue(this, I),
1461         RdxDesc(R), TTI(TTI) {
1462     if (CondOp)
1463       addOperand(CondOp);
1464   }
1465 
1466   ~VPReductionRecipe() override = default;
1467 
1468   VP_CLASSOF_IMPL(VPDef::VPReductionSC)
1469 
1470   /// Generate the reduction in the loop
1471   void execute(VPTransformState &State) override;
1472 
1473 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1474   /// Print the recipe.
1475   void print(raw_ostream &O, const Twine &Indent,
1476              VPSlotTracker &SlotTracker) const override;
1477 #endif
1478 
1479   /// The VPValue of the scalar Chain being accumulated.
1480   VPValue *getChainOp() const { return getOperand(0); }
1481   /// The VPValue of the vector value to be reduced.
1482   VPValue *getVecOp() const { return getOperand(1); }
1483   /// The VPValue of the condition for the block.
1484   VPValue *getCondOp() const {
1485     return getNumOperands() > 2 ? getOperand(2) : nullptr;
1486   }
1487 };
1488 
1489 /// VPReplicateRecipe replicates a given instruction producing multiple scalar
1490 /// copies of the original scalar type, one per lane, instead of producing a
1491 /// single copy of widened type for all lanes. If the instruction is known to be
1492 /// uniform only one copy, per lane zero, will be generated.
1493 class VPReplicateRecipe : public VPRecipeBase, public VPValue {
1494   /// Indicator if only a single replica per lane is needed.
1495   bool IsUniform;
1496 
1497   /// Indicator if the replicas are also predicated.
1498   bool IsPredicated;
1499 
1500   /// Indicator if the scalar values should also be packed into a vector.
1501   bool AlsoPack;
1502 
1503 public:
1504   template <typename IterT>
1505   VPReplicateRecipe(Instruction *I, iterator_range<IterT> Operands,
1506                     bool IsUniform, bool IsPredicated = false)
1507       : VPRecipeBase(VPDef::VPReplicateSC, Operands), VPValue(this, I),
1508         IsUniform(IsUniform), IsPredicated(IsPredicated) {
1509     // Retain the previous behavior of predicateInstructions(), where an
1510     // insert-element of a predicated instruction got hoisted into the
1511     // predicated basic block iff it was its only user. This is achieved by
1512     // having predicated instructions also pack their values into a vector by
1513     // default unless they have a replicated user which uses their scalar value.
1514     AlsoPack = IsPredicated && !I->use_empty();
1515   }
1516 
1517   ~VPReplicateRecipe() override = default;
1518 
1519   VP_CLASSOF_IMPL(VPDef::VPReplicateSC)
1520 
1521   /// Generate replicas of the desired Ingredient. Replicas will be generated
1522   /// for all parts and lanes unless a specific part and lane are specified in
1523   /// the \p State.
1524   void execute(VPTransformState &State) override;
1525 
1526   void setAlsoPack(bool Pack) { AlsoPack = Pack; }
1527 
1528 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1529   /// Print the recipe.
1530   void print(raw_ostream &O, const Twine &Indent,
1531              VPSlotTracker &SlotTracker) const override;
1532 #endif
1533 
1534   bool isUniform() const { return IsUniform; }
1535 
1536   bool isPacked() const { return AlsoPack; }
1537 
1538   bool isPredicated() const { return IsPredicated; }
1539 
1540   /// Returns true if the recipe only uses the first lane of operand \p Op.
1541   bool onlyFirstLaneUsed(const VPValue *Op) const override {
1542     assert(is_contained(operands(), Op) &&
1543            "Op must be an operand of the recipe");
1544     return isUniform();
1545   }
1546 
1547   /// Returns true if the recipe uses scalars of operand \p Op.
1548   bool usesScalars(const VPValue *Op) const override {
1549     assert(is_contained(operands(), Op) &&
1550            "Op must be an operand of the recipe");
1551     return true;
1552   }
1553 };
1554 
1555 /// A recipe for generating conditional branches on the bits of a mask.
1556 class VPBranchOnMaskRecipe : public VPRecipeBase {
1557 public:
1558   VPBranchOnMaskRecipe(VPValue *BlockInMask)
1559       : VPRecipeBase(VPDef::VPBranchOnMaskSC, {}) {
1560     if (BlockInMask) // nullptr means all-one mask.
1561       addOperand(BlockInMask);
1562   }
1563 
1564   VP_CLASSOF_IMPL(VPDef::VPBranchOnMaskSC)
1565 
1566   /// Generate the extraction of the appropriate bit from the block mask and the
1567   /// conditional branch.
1568   void execute(VPTransformState &State) override;
1569 
1570 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1571   /// Print the recipe.
1572   void print(raw_ostream &O, const Twine &Indent,
1573              VPSlotTracker &SlotTracker) const override {
1574     O << Indent << "BRANCH-ON-MASK ";
1575     if (VPValue *Mask = getMask())
1576       Mask->printAsOperand(O, SlotTracker);
1577     else
1578       O << " All-One";
1579   }
1580 #endif
1581 
1582   /// Return the mask used by this recipe. Note that a full mask is represented
1583   /// by a nullptr.
1584   VPValue *getMask() const {
1585     assert(getNumOperands() <= 1 && "should have either 0 or 1 operands");
1586     // Mask is optional.
1587     return getNumOperands() == 1 ? getOperand(0) : nullptr;
1588   }
1589 
1590   /// Returns true if the recipe uses scalars of operand \p Op.
1591   bool usesScalars(const VPValue *Op) const override {
1592     assert(is_contained(operands(), Op) &&
1593            "Op must be an operand of the recipe");
1594     return true;
1595   }
1596 };
1597 
1598 /// VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when
1599 /// control converges back from a Branch-on-Mask. The phi nodes are needed in
1600 /// order to merge values that are set under such a branch and feed their uses.
1601 /// The phi nodes can be scalar or vector depending on the users of the value.
1602 /// This recipe works in concert with VPBranchOnMaskRecipe.
1603 class VPPredInstPHIRecipe : public VPRecipeBase, public VPValue {
1604 public:
1605   /// Construct a VPPredInstPHIRecipe given \p PredInst whose value needs a phi
1606   /// nodes after merging back from a Branch-on-Mask.
1607   VPPredInstPHIRecipe(VPValue *PredV)
1608       : VPRecipeBase(VPDef::VPPredInstPHISC, PredV), VPValue(this) {}
1609   ~VPPredInstPHIRecipe() override = default;
1610 
1611   VP_CLASSOF_IMPL(VPDef::VPPredInstPHISC)
1612 
1613   /// Generates phi nodes for live-outs as needed to retain SSA form.
1614   void execute(VPTransformState &State) override;
1615 
1616 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1617   /// Print the recipe.
1618   void print(raw_ostream &O, const Twine &Indent,
1619              VPSlotTracker &SlotTracker) const override;
1620 #endif
1621 
1622   /// Returns true if the recipe uses scalars of operand \p Op.
1623   bool usesScalars(const VPValue *Op) const override {
1624     assert(is_contained(operands(), Op) &&
1625            "Op must be an operand of the recipe");
1626     return true;
1627   }
1628 };
1629 
1630 /// A Recipe for widening load/store operations.
1631 /// The recipe uses the following VPValues:
1632 /// - For load: Address, optional mask
1633 /// - For store: Address, stored value, optional mask
1634 /// TODO: We currently execute only per-part unless a specific instance is
1635 /// provided.
1636 class VPWidenMemoryInstructionRecipe : public VPRecipeBase {
1637   Instruction &Ingredient;
1638 
1639   // Whether the loaded-from / stored-to addresses are consecutive.
1640   bool Consecutive;
1641 
1642   // Whether the consecutive loaded/stored addresses are in reverse order.
1643   bool Reverse;
1644 
1645   void setMask(VPValue *Mask) {
1646     if (!Mask)
1647       return;
1648     addOperand(Mask);
1649   }
1650 
1651   bool isMasked() const {
1652     return isStore() ? getNumOperands() == 3 : getNumOperands() == 2;
1653   }
1654 
1655 public:
1656   VPWidenMemoryInstructionRecipe(LoadInst &Load, VPValue *Addr, VPValue *Mask,
1657                                  bool Consecutive, bool Reverse)
1658       : VPRecipeBase(VPDef::VPWidenMemoryInstructionSC, {Addr}),
1659         Ingredient(Load), Consecutive(Consecutive), Reverse(Reverse) {
1660     assert((Consecutive || !Reverse) && "Reverse implies consecutive");
1661     new VPValue(this, &Load);
1662     setMask(Mask);
1663   }
1664 
1665   VPWidenMemoryInstructionRecipe(StoreInst &Store, VPValue *Addr,
1666                                  VPValue *StoredValue, VPValue *Mask,
1667                                  bool Consecutive, bool Reverse)
1668       : VPRecipeBase(VPDef::VPWidenMemoryInstructionSC, {Addr, StoredValue}),
1669         Ingredient(Store), Consecutive(Consecutive), Reverse(Reverse) {
1670     assert((Consecutive || !Reverse) && "Reverse implies consecutive");
1671     setMask(Mask);
1672   }
1673 
1674   VP_CLASSOF_IMPL(VPDef::VPWidenMemoryInstructionSC)
1675 
1676   /// Return the address accessed by this recipe.
1677   VPValue *getAddr() const {
1678     return getOperand(0); // Address is the 1st, mandatory operand.
1679   }
1680 
1681   /// Return the mask used by this recipe. Note that a full mask is represented
1682   /// by a nullptr.
1683   VPValue *getMask() const {
1684     // Mask is optional and therefore the last operand.
1685     return isMasked() ? getOperand(getNumOperands() - 1) : nullptr;
1686   }
1687 
1688   /// Returns true if this recipe is a store.
1689   bool isStore() const { return isa<StoreInst>(Ingredient); }
1690 
1691   /// Return the address accessed by this recipe.
1692   VPValue *getStoredValue() const {
1693     assert(isStore() && "Stored value only available for store instructions");
1694     return getOperand(1); // Stored value is the 2nd, mandatory operand.
1695   }
1696 
1697   // Return whether the loaded-from / stored-to addresses are consecutive.
1698   bool isConsecutive() const { return Consecutive; }
1699 
1700   // Return whether the consecutive loaded/stored addresses are in reverse
1701   // order.
1702   bool isReverse() const { return Reverse; }
1703 
1704   /// Generate the wide load/store.
1705   void execute(VPTransformState &State) override;
1706 
1707 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1708   /// Print the recipe.
1709   void print(raw_ostream &O, const Twine &Indent,
1710              VPSlotTracker &SlotTracker) const override;
1711 #endif
1712 
1713   /// Returns true if the recipe only uses the first lane of operand \p Op.
1714   bool onlyFirstLaneUsed(const VPValue *Op) const override {
1715     assert(is_contained(operands(), Op) &&
1716            "Op must be an operand of the recipe");
1717 
1718     // Widened, consecutive memory operations only demand the first lane of
1719     // their address, unless the same operand is also stored. That latter can
1720     // happen with opaque pointers.
1721     return Op == getAddr() && isConsecutive() &&
1722            (!isStore() || Op != getStoredValue());
1723   }
1724 
1725   Instruction &getIngredient() const { return Ingredient; }
1726 };
1727 
1728 /// Recipe to expand a SCEV expression.
1729 class VPExpandSCEVRecipe : public VPRecipeBase, public VPValue {
1730   const SCEV *Expr;
1731   ScalarEvolution &SE;
1732 
1733 public:
1734   VPExpandSCEVRecipe(const SCEV *Expr, ScalarEvolution &SE)
1735       : VPRecipeBase(VPDef::VPExpandSCEVSC, {}), VPValue(this), Expr(Expr),
1736         SE(SE) {}
1737 
1738   ~VPExpandSCEVRecipe() override = default;
1739 
1740   VP_CLASSOF_IMPL(VPDef::VPExpandSCEVSC)
1741 
1742   /// Generate a canonical vector induction variable of the vector loop, with
1743   void execute(VPTransformState &State) override;
1744 
1745 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1746   /// Print the recipe.
1747   void print(raw_ostream &O, const Twine &Indent,
1748              VPSlotTracker &SlotTracker) const override;
1749 #endif
1750 
1751   const SCEV *getSCEV() const { return Expr; }
1752 };
1753 
1754 /// Canonical scalar induction phi of the vector loop. Starting at the specified
1755 /// start value (either 0 or the resume value when vectorizing the epilogue
1756 /// loop). VPWidenCanonicalIVRecipe represents the vector version of the
1757 /// canonical induction variable.
1758 class VPCanonicalIVPHIRecipe : public VPHeaderPHIRecipe {
1759   DebugLoc DL;
1760 
1761 public:
1762   VPCanonicalIVPHIRecipe(VPValue *StartV, DebugLoc DL)
1763       : VPHeaderPHIRecipe(VPDef::VPCanonicalIVPHISC, nullptr, StartV), DL(DL) {}
1764 
1765   ~VPCanonicalIVPHIRecipe() override = default;
1766 
1767   VP_CLASSOF_IMPL(VPDef::VPCanonicalIVPHISC)
1768 
1769   static inline bool classof(const VPHeaderPHIRecipe *D) {
1770     return D->getVPDefID() == VPDef::VPCanonicalIVPHISC;
1771   }
1772 
1773   /// Generate the canonical scalar induction phi of the vector loop.
1774   void execute(VPTransformState &State) override;
1775 
1776 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1777   /// Print the recipe.
1778   void print(raw_ostream &O, const Twine &Indent,
1779              VPSlotTracker &SlotTracker) const override;
1780 #endif
1781 
1782   /// Returns the scalar type of the induction.
1783   const Type *getScalarType() const {
1784     return getOperand(0)->getLiveInIRValue()->getType();
1785   }
1786 
1787   /// Returns true if the recipe only uses the first lane of operand \p Op.
1788   bool onlyFirstLaneUsed(const VPValue *Op) const override {
1789     assert(is_contained(operands(), Op) &&
1790            "Op must be an operand of the recipe");
1791     return true;
1792   }
1793 
1794   /// Check if the induction described by \p ID is canonical, i.e.  has the same
1795   /// start, step (of 1), and type as the canonical IV.
1796   bool isCanonical(const InductionDescriptor &ID, Type *Ty) const;
1797 };
1798 
1799 /// A recipe for generating the active lane mask for the vector loop that is
1800 /// used to predicate the vector operations.
1801 /// TODO: It would be good to use the existing VPWidenPHIRecipe instead and
1802 /// remove VPActiveLaneMaskPHIRecipe.
1803 class VPActiveLaneMaskPHIRecipe : public VPHeaderPHIRecipe {
1804   DebugLoc DL;
1805 
1806 public:
1807   VPActiveLaneMaskPHIRecipe(VPValue *StartMask, DebugLoc DL)
1808       : VPHeaderPHIRecipe(VPDef::VPActiveLaneMaskPHISC, nullptr, StartMask),
1809         DL(DL) {}
1810 
1811   ~VPActiveLaneMaskPHIRecipe() override = default;
1812 
1813   VP_CLASSOF_IMPL(VPDef::VPActiveLaneMaskPHISC)
1814 
1815   static inline bool classof(const VPHeaderPHIRecipe *D) {
1816     return D->getVPDefID() == VPDef::VPActiveLaneMaskPHISC;
1817   }
1818 
1819   /// Generate the active lane mask phi of the vector loop.
1820   void execute(VPTransformState &State) override;
1821 
1822 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1823   /// Print the recipe.
1824   void print(raw_ostream &O, const Twine &Indent,
1825              VPSlotTracker &SlotTracker) const override;
1826 #endif
1827 };
1828 
1829 /// A Recipe for widening the canonical induction variable of the vector loop.
1830 class VPWidenCanonicalIVRecipe : public VPRecipeBase, public VPValue {
1831 public:
1832   VPWidenCanonicalIVRecipe(VPCanonicalIVPHIRecipe *CanonicalIV)
1833       : VPRecipeBase(VPDef::VPWidenCanonicalIVSC, {CanonicalIV}),
1834         VPValue(this) {}
1835 
1836   ~VPWidenCanonicalIVRecipe() override = default;
1837 
1838   VP_CLASSOF_IMPL(VPDef::VPWidenCanonicalIVSC)
1839 
1840   /// Generate a canonical vector induction variable of the vector loop, with
1841   /// start = {<Part*VF, Part*VF+1, ..., Part*VF+VF-1> for 0 <= Part < UF}, and
1842   /// step = <VF*UF, VF*UF, ..., VF*UF>.
1843   void execute(VPTransformState &State) override;
1844 
1845 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1846   /// Print the recipe.
1847   void print(raw_ostream &O, const Twine &Indent,
1848              VPSlotTracker &SlotTracker) const override;
1849 #endif
1850 
1851   /// Returns the scalar type of the induction.
1852   const Type *getScalarType() const {
1853     return cast<VPCanonicalIVPHIRecipe>(getOperand(0)->getDefiningRecipe())
1854         ->getScalarType();
1855   }
1856 };
1857 
1858 /// A recipe for converting the canonical IV value to the corresponding value of
1859 /// an IV with different start and step values, using Start + CanonicalIV *
1860 /// Step.
1861 class VPDerivedIVRecipe : public VPRecipeBase, public VPValue {
1862   /// The type of the result value. It may be smaller than the type of the
1863   /// induction and in this case it will get truncated to ResultTy.
1864   Type *ResultTy;
1865 
1866   /// Induction descriptor for the induction the canonical IV is transformed to.
1867   const InductionDescriptor &IndDesc;
1868 
1869 public:
1870   VPDerivedIVRecipe(const InductionDescriptor &IndDesc, VPValue *Start,
1871                     VPCanonicalIVPHIRecipe *CanonicalIV, VPValue *Step,
1872                     Type *ResultTy)
1873       : VPRecipeBase(VPDef::VPDerivedIVSC, {Start, CanonicalIV, Step}),
1874         VPValue(this), ResultTy(ResultTy), IndDesc(IndDesc) {}
1875 
1876   ~VPDerivedIVRecipe() override = default;
1877 
1878   VP_CLASSOF_IMPL(VPDef::VPDerivedIVSC)
1879 
1880   /// Generate the transformed value of the induction at offset StartValue (1.
1881   /// operand) + IV (2. operand) * StepValue (3, operand).
1882   void execute(VPTransformState &State) override;
1883 
1884 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1885   /// Print the recipe.
1886   void print(raw_ostream &O, const Twine &Indent,
1887              VPSlotTracker &SlotTracker) const override;
1888 #endif
1889 
1890   VPValue *getStartValue() const { return getOperand(0); }
1891   VPValue *getCanonicalIV() const { return getOperand(1); }
1892   VPValue *getStepValue() const { return getOperand(2); }
1893 
1894   /// Returns true if the recipe only uses the first lane of operand \p Op.
1895   bool onlyFirstLaneUsed(const VPValue *Op) const override {
1896     assert(is_contained(operands(), Op) &&
1897            "Op must be an operand of the recipe");
1898     return true;
1899   }
1900 };
1901 
1902 /// A recipe for handling phi nodes of integer and floating-point inductions,
1903 /// producing their scalar values.
1904 class VPScalarIVStepsRecipe : public VPRecipeBase, public VPValue {
1905   const InductionDescriptor &IndDesc;
1906 
1907 public:
1908   VPScalarIVStepsRecipe(const InductionDescriptor &IndDesc, VPValue *IV,
1909                         VPValue *Step)
1910       : VPRecipeBase(VPDef::VPScalarIVStepsSC, {IV, Step}), VPValue(this),
1911         IndDesc(IndDesc) {}
1912 
1913   ~VPScalarIVStepsRecipe() override = default;
1914 
1915   VP_CLASSOF_IMPL(VPDef::VPScalarIVStepsSC)
1916 
1917   /// Generate the scalarized versions of the phi node as needed by their users.
1918   void execute(VPTransformState &State) override;
1919 
1920 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
1921   /// Print the recipe.
1922   void print(raw_ostream &O, const Twine &Indent,
1923              VPSlotTracker &SlotTracker) const override;
1924 #endif
1925 
1926   VPValue *getStepValue() const { return getOperand(1); }
1927 
1928   /// Returns true if the recipe only uses the first lane of operand \p Op.
1929   bool onlyFirstLaneUsed(const VPValue *Op) const override {
1930     assert(is_contained(operands(), Op) &&
1931            "Op must be an operand of the recipe");
1932     return true;
1933   }
1934 };
1935 
1936 /// VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph. It
1937 /// holds a sequence of zero or more VPRecipe's each representing a sequence of
1938 /// output IR instructions. All PHI-like recipes must come before any non-PHI recipes.
1939 class VPBasicBlock : public VPBlockBase {
1940 public:
1941   using RecipeListTy = iplist<VPRecipeBase>;
1942 
1943 private:
1944   /// The VPRecipes held in the order of output instructions to generate.
1945   RecipeListTy Recipes;
1946 
1947 public:
1948   VPBasicBlock(const Twine &Name = "", VPRecipeBase *Recipe = nullptr)
1949       : VPBlockBase(VPBasicBlockSC, Name.str()) {
1950     if (Recipe)
1951       appendRecipe(Recipe);
1952   }
1953 
1954   ~VPBasicBlock() override {
1955     while (!Recipes.empty())
1956       Recipes.pop_back();
1957   }
1958 
1959   /// Instruction iterators...
1960   using iterator = RecipeListTy::iterator;
1961   using const_iterator = RecipeListTy::const_iterator;
1962   using reverse_iterator = RecipeListTy::reverse_iterator;
1963   using const_reverse_iterator = RecipeListTy::const_reverse_iterator;
1964 
1965   //===--------------------------------------------------------------------===//
1966   /// Recipe iterator methods
1967   ///
1968   inline iterator begin() { return Recipes.begin(); }
1969   inline const_iterator begin() const { return Recipes.begin(); }
1970   inline iterator end() { return Recipes.end(); }
1971   inline const_iterator end() const { return Recipes.end(); }
1972 
1973   inline reverse_iterator rbegin() { return Recipes.rbegin(); }
1974   inline const_reverse_iterator rbegin() const { return Recipes.rbegin(); }
1975   inline reverse_iterator rend() { return Recipes.rend(); }
1976   inline const_reverse_iterator rend() const { return Recipes.rend(); }
1977 
1978   inline size_t size() const { return Recipes.size(); }
1979   inline bool empty() const { return Recipes.empty(); }
1980   inline const VPRecipeBase &front() const { return Recipes.front(); }
1981   inline VPRecipeBase &front() { return Recipes.front(); }
1982   inline const VPRecipeBase &back() const { return Recipes.back(); }
1983   inline VPRecipeBase &back() { return Recipes.back(); }
1984 
1985   /// Returns a reference to the list of recipes.
1986   RecipeListTy &getRecipeList() { return Recipes; }
1987 
1988   /// Returns a pointer to a member of the recipe list.
1989   static RecipeListTy VPBasicBlock::*getSublistAccess(VPRecipeBase *) {
1990     return &VPBasicBlock::Recipes;
1991   }
1992 
1993   /// Method to support type inquiry through isa, cast, and dyn_cast.
1994   static inline bool classof(const VPBlockBase *V) {
1995     return V->getVPBlockID() == VPBlockBase::VPBasicBlockSC;
1996   }
1997 
1998   void insert(VPRecipeBase *Recipe, iterator InsertPt) {
1999     assert(Recipe && "No recipe to append.");
2000     assert(!Recipe->Parent && "Recipe already in VPlan");
2001     Recipe->Parent = this;
2002     Recipes.insert(InsertPt, Recipe);
2003   }
2004 
2005   /// Augment the existing recipes of a VPBasicBlock with an additional
2006   /// \p Recipe as the last recipe.
2007   void appendRecipe(VPRecipeBase *Recipe) { insert(Recipe, end()); }
2008 
2009   /// The method which generates the output IR instructions that correspond to
2010   /// this VPBasicBlock, thereby "executing" the VPlan.
2011   void execute(VPTransformState *State) override;
2012 
2013   /// Return the position of the first non-phi node recipe in the block.
2014   iterator getFirstNonPhi();
2015 
2016   /// Returns an iterator range over the PHI-like recipes in the block.
2017   iterator_range<iterator> phis() {
2018     return make_range(begin(), getFirstNonPhi());
2019   }
2020 
2021   void dropAllReferences(VPValue *NewValue) override;
2022 
2023   /// Split current block at \p SplitAt by inserting a new block between the
2024   /// current block and its successors and moving all recipes starting at
2025   /// SplitAt to the new block. Returns the new block.
2026   VPBasicBlock *splitAt(iterator SplitAt);
2027 
2028   VPRegionBlock *getEnclosingLoopRegion();
2029 
2030 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2031   /// Print this VPBsicBlock to \p O, prefixing all lines with \p Indent. \p
2032   /// SlotTracker is used to print unnamed VPValue's using consequtive numbers.
2033   ///
2034   /// Note that the numbering is applied to the whole VPlan, so printing
2035   /// individual blocks is consistent with the whole VPlan printing.
2036   void print(raw_ostream &O, const Twine &Indent,
2037              VPSlotTracker &SlotTracker) const override;
2038   using VPBlockBase::print; // Get the print(raw_stream &O) version.
2039 #endif
2040 
2041   /// If the block has multiple successors, return the branch recipe terminating
2042   /// the block. If there are no or only a single successor, return nullptr;
2043   VPRecipeBase *getTerminator();
2044   const VPRecipeBase *getTerminator() const;
2045 
2046   /// Returns true if the block is exiting it's parent region.
2047   bool isExiting() const;
2048 
2049 private:
2050   /// Create an IR BasicBlock to hold the output instructions generated by this
2051   /// VPBasicBlock, and return it. Update the CFGState accordingly.
2052   BasicBlock *createEmptyBasicBlock(VPTransformState::CFGState &CFG);
2053 };
2054 
2055 /// VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks
2056 /// which form a Single-Entry-Single-Exiting subgraph of the output IR CFG.
2057 /// A VPRegionBlock may indicate that its contents are to be replicated several
2058 /// times. This is designed to support predicated scalarization, in which a
2059 /// scalar if-then code structure needs to be generated VF * UF times. Having
2060 /// this replication indicator helps to keep a single model for multiple
2061 /// candidate VF's. The actual replication takes place only once the desired VF
2062 /// and UF have been determined.
2063 class VPRegionBlock : public VPBlockBase {
2064   /// Hold the Single Entry of the SESE region modelled by the VPRegionBlock.
2065   VPBlockBase *Entry;
2066 
2067   /// Hold the Single Exiting block of the SESE region modelled by the
2068   /// VPRegionBlock.
2069   VPBlockBase *Exiting;
2070 
2071   /// An indicator whether this region is to generate multiple replicated
2072   /// instances of output IR corresponding to its VPBlockBases.
2073   bool IsReplicator;
2074 
2075 public:
2076   VPRegionBlock(VPBlockBase *Entry, VPBlockBase *Exiting,
2077                 const std::string &Name = "", bool IsReplicator = false)
2078       : VPBlockBase(VPRegionBlockSC, Name), Entry(Entry), Exiting(Exiting),
2079         IsReplicator(IsReplicator) {
2080     assert(Entry->getPredecessors().empty() && "Entry block has predecessors.");
2081     assert(Exiting->getSuccessors().empty() && "Exit block has successors.");
2082     Entry->setParent(this);
2083     Exiting->setParent(this);
2084   }
2085   VPRegionBlock(const std::string &Name = "", bool IsReplicator = false)
2086       : VPBlockBase(VPRegionBlockSC, Name), Entry(nullptr), Exiting(nullptr),
2087         IsReplicator(IsReplicator) {}
2088 
2089   ~VPRegionBlock() override {
2090     if (Entry) {
2091       VPValue DummyValue;
2092       Entry->dropAllReferences(&DummyValue);
2093       deleteCFG(Entry);
2094     }
2095   }
2096 
2097   /// Method to support type inquiry through isa, cast, and dyn_cast.
2098   static inline bool classof(const VPBlockBase *V) {
2099     return V->getVPBlockID() == VPBlockBase::VPRegionBlockSC;
2100   }
2101 
2102   const VPBlockBase *getEntry() const { return Entry; }
2103   VPBlockBase *getEntry() { return Entry; }
2104 
2105   /// Set \p EntryBlock as the entry VPBlockBase of this VPRegionBlock. \p
2106   /// EntryBlock must have no predecessors.
2107   void setEntry(VPBlockBase *EntryBlock) {
2108     assert(EntryBlock->getPredecessors().empty() &&
2109            "Entry block cannot have predecessors.");
2110     Entry = EntryBlock;
2111     EntryBlock->setParent(this);
2112   }
2113 
2114   const VPBlockBase *getExiting() const { return Exiting; }
2115   VPBlockBase *getExiting() { return Exiting; }
2116 
2117   /// Set \p ExitingBlock as the exiting VPBlockBase of this VPRegionBlock. \p
2118   /// ExitingBlock must have no successors.
2119   void setExiting(VPBlockBase *ExitingBlock) {
2120     assert(ExitingBlock->getSuccessors().empty() &&
2121            "Exit block cannot have successors.");
2122     Exiting = ExitingBlock;
2123     ExitingBlock->setParent(this);
2124   }
2125 
2126   /// Returns the pre-header VPBasicBlock of the loop region.
2127   VPBasicBlock *getPreheaderVPBB() {
2128     assert(!isReplicator() && "should only get pre-header of loop regions");
2129     return getSinglePredecessor()->getExitingBasicBlock();
2130   }
2131 
2132   /// An indicator whether this region is to generate multiple replicated
2133   /// instances of output IR corresponding to its VPBlockBases.
2134   bool isReplicator() const { return IsReplicator; }
2135 
2136   /// The method which generates the output IR instructions that correspond to
2137   /// this VPRegionBlock, thereby "executing" the VPlan.
2138   void execute(VPTransformState *State) override;
2139 
2140   void dropAllReferences(VPValue *NewValue) override;
2141 
2142 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2143   /// Print this VPRegionBlock to \p O (recursively), prefixing all lines with
2144   /// \p Indent. \p SlotTracker is used to print unnamed VPValue's using
2145   /// consequtive numbers.
2146   ///
2147   /// Note that the numbering is applied to the whole VPlan, so printing
2148   /// individual regions is consistent with the whole VPlan printing.
2149   void print(raw_ostream &O, const Twine &Indent,
2150              VPSlotTracker &SlotTracker) const override;
2151   using VPBlockBase::print; // Get the print(raw_stream &O) version.
2152 #endif
2153 };
2154 
2155 /// VPlan models a candidate for vectorization, encoding various decisions take
2156 /// to produce efficient output IR, including which branches, basic-blocks and
2157 /// output IR instructions to generate, and their cost. VPlan holds a
2158 /// Hierarchical-CFG of VPBasicBlocks and VPRegionBlocks rooted at an Entry
2159 /// VPBlock.
2160 class VPlan {
2161   friend class VPlanPrinter;
2162   friend class VPSlotTracker;
2163 
2164   /// Hold the single entry to the Hierarchical CFG of the VPlan.
2165   VPBlockBase *Entry;
2166 
2167   /// Holds the VFs applicable to this VPlan.
2168   SmallSetVector<ElementCount, 2> VFs;
2169 
2170   /// Holds the UFs applicable to this VPlan. If empty, the VPlan is valid for
2171   /// any UF.
2172   SmallSetVector<unsigned, 2> UFs;
2173 
2174   /// Holds the name of the VPlan, for printing.
2175   std::string Name;
2176 
2177   /// Holds all the external definitions created for this VPlan. External
2178   /// definitions must be immutable and hold a pointer to their underlying IR.
2179   DenseMap<Value *, VPValue *> VPExternalDefs;
2180 
2181   /// Represents the trip count of the original loop, for folding
2182   /// the tail.
2183   VPValue *TripCount = nullptr;
2184 
2185   /// Represents the backedge taken count of the original loop, for folding
2186   /// the tail. It equals TripCount - 1.
2187   VPValue *BackedgeTakenCount = nullptr;
2188 
2189   /// Represents the vector trip count.
2190   VPValue VectorTripCount;
2191 
2192   /// Holds a mapping between Values and their corresponding VPValue inside
2193   /// VPlan.
2194   Value2VPValueTy Value2VPValue;
2195 
2196   /// Contains all VPValues that been allocated by addVPValue directly and need
2197   /// to be free when the plan's destructor is called.
2198   SmallVector<VPValue *, 16> VPValuesToFree;
2199 
2200   /// Indicates whether it is safe use the Value2VPValue mapping or if the
2201   /// mapping cannot be used any longer, because it is stale.
2202   bool Value2VPValueEnabled = true;
2203 
2204   /// Values used outside the plan.
2205   MapVector<PHINode *, VPLiveOut *> LiveOuts;
2206 
2207 public:
2208   VPlan(VPBlockBase *Entry = nullptr) : Entry(Entry) {
2209     if (Entry)
2210       Entry->setPlan(this);
2211   }
2212 
2213   ~VPlan();
2214 
2215   /// Prepare the plan for execution, setting up the required live-in values.
2216   void prepareToExecute(Value *TripCount, Value *VectorTripCount,
2217                         Value *CanonicalIVStartValue, VPTransformState &State,
2218                         bool IsEpilogueVectorization);
2219 
2220   /// Generate the IR code for this VPlan.
2221   void execute(VPTransformState *State);
2222 
2223   VPBlockBase *getEntry() { return Entry; }
2224   const VPBlockBase *getEntry() const { return Entry; }
2225 
2226   VPBlockBase *setEntry(VPBlockBase *Block) {
2227     Entry = Block;
2228     Block->setPlan(this);
2229     return Entry;
2230   }
2231 
2232   /// The trip count of the original loop.
2233   VPValue *getOrCreateTripCount() {
2234     if (!TripCount)
2235       TripCount = new VPValue();
2236     return TripCount;
2237   }
2238 
2239   /// The backedge taken count of the original loop.
2240   VPValue *getOrCreateBackedgeTakenCount() {
2241     if (!BackedgeTakenCount)
2242       BackedgeTakenCount = new VPValue();
2243     return BackedgeTakenCount;
2244   }
2245 
2246   /// The vector trip count.
2247   VPValue &getVectorTripCount() { return VectorTripCount; }
2248 
2249   /// Mark the plan to indicate that using Value2VPValue is not safe any
2250   /// longer, because it may be stale.
2251   void disableValue2VPValue() { Value2VPValueEnabled = false; }
2252 
2253   void addVF(ElementCount VF) { VFs.insert(VF); }
2254 
2255   void setVF(ElementCount VF) {
2256     assert(hasVF(VF) && "Cannot set VF not already in plan");
2257     VFs.clear();
2258     VFs.insert(VF);
2259   }
2260 
2261   bool hasVF(ElementCount VF) { return VFs.count(VF); }
2262 
2263   bool hasScalarVFOnly() const { return VFs.size() == 1 && VFs[0].isScalar(); }
2264 
2265   bool hasUF(unsigned UF) const { return UFs.empty() || UFs.contains(UF); }
2266 
2267   void setUF(unsigned UF) {
2268     assert(hasUF(UF) && "Cannot set the UF not already in plan");
2269     UFs.clear();
2270     UFs.insert(UF);
2271   }
2272 
2273   /// Return a string with the name of the plan and the applicable VFs and UFs.
2274   std::string getName() const;
2275 
2276   void setName(const Twine &newName) { Name = newName.str(); }
2277 
2278   /// Get the existing or add a new external definition for \p V.
2279   VPValue *getOrAddExternalDef(Value *V) {
2280     auto I = VPExternalDefs.insert({V, nullptr});
2281     if (I.second)
2282       I.first->second = new VPValue(V);
2283     return I.first->second;
2284   }
2285 
2286   void addVPValue(Value *V) {
2287     assert(Value2VPValueEnabled &&
2288            "IR value to VPValue mapping may be out of date!");
2289     assert(V && "Trying to add a null Value to VPlan");
2290     assert(!Value2VPValue.count(V) && "Value already exists in VPlan");
2291     VPValue *VPV = new VPValue(V);
2292     Value2VPValue[V] = VPV;
2293     VPValuesToFree.push_back(VPV);
2294   }
2295 
2296   void addVPValue(Value *V, VPValue *VPV) {
2297     assert(Value2VPValueEnabled && "Value2VPValue mapping may be out of date!");
2298     assert(V && "Trying to add a null Value to VPlan");
2299     assert(!Value2VPValue.count(V) && "Value already exists in VPlan");
2300     Value2VPValue[V] = VPV;
2301   }
2302 
2303   /// Returns the VPValue for \p V. \p OverrideAllowed can be used to disable
2304   /// checking whether it is safe to query VPValues using IR Values.
2305   VPValue *getVPValue(Value *V, bool OverrideAllowed = false) {
2306     assert((OverrideAllowed || isa<Constant>(V) || Value2VPValueEnabled) &&
2307            "Value2VPValue mapping may be out of date!");
2308     assert(V && "Trying to get the VPValue of a null Value");
2309     assert(Value2VPValue.count(V) && "Value does not exist in VPlan");
2310     return Value2VPValue[V];
2311   }
2312 
2313   /// Gets the VPValue or adds a new one (if none exists yet) for \p V. \p
2314   /// OverrideAllowed can be used to disable checking whether it is safe to
2315   /// query VPValues using IR Values.
2316   VPValue *getOrAddVPValue(Value *V, bool OverrideAllowed = false) {
2317     assert((OverrideAllowed || isa<Constant>(V) || Value2VPValueEnabled) &&
2318            "Value2VPValue mapping may be out of date!");
2319     assert(V && "Trying to get or add the VPValue of a null Value");
2320     if (!Value2VPValue.count(V))
2321       addVPValue(V);
2322     return getVPValue(V);
2323   }
2324 
2325   void removeVPValueFor(Value *V) {
2326     assert(Value2VPValueEnabled &&
2327            "IR value to VPValue mapping may be out of date!");
2328     Value2VPValue.erase(V);
2329   }
2330 
2331 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2332   /// Print this VPlan to \p O.
2333   void print(raw_ostream &O) const;
2334 
2335   /// Print this VPlan in DOT format to \p O.
2336   void printDOT(raw_ostream &O) const;
2337 
2338   /// Dump the plan to stderr (for debugging).
2339   LLVM_DUMP_METHOD void dump() const;
2340 #endif
2341 
2342   /// Returns a range mapping the values the range \p Operands to their
2343   /// corresponding VPValues.
2344   iterator_range<mapped_iterator<Use *, std::function<VPValue *(Value *)>>>
2345   mapToVPValues(User::op_range Operands) {
2346     std::function<VPValue *(Value *)> Fn = [this](Value *Op) {
2347       return getOrAddVPValue(Op);
2348     };
2349     return map_range(Operands, Fn);
2350   }
2351 
2352   /// Returns the VPRegionBlock of the vector loop.
2353   VPRegionBlock *getVectorLoopRegion() {
2354     return cast<VPRegionBlock>(getEntry()->getSingleSuccessor());
2355   }
2356   const VPRegionBlock *getVectorLoopRegion() const {
2357     return cast<VPRegionBlock>(getEntry()->getSingleSuccessor());
2358   }
2359 
2360   /// Returns the canonical induction recipe of the vector loop.
2361   VPCanonicalIVPHIRecipe *getCanonicalIV() {
2362     VPBasicBlock *EntryVPBB = getVectorLoopRegion()->getEntryBasicBlock();
2363     if (EntryVPBB->empty()) {
2364       // VPlan native path.
2365       EntryVPBB = cast<VPBasicBlock>(EntryVPBB->getSingleSuccessor());
2366     }
2367     return cast<VPCanonicalIVPHIRecipe>(&*EntryVPBB->begin());
2368   }
2369 
2370   /// Find and return the VPActiveLaneMaskPHIRecipe from the header - there
2371   /// be only one at most. If there isn't one, then return nullptr.
2372   VPActiveLaneMaskPHIRecipe *getActiveLaneMaskPhi();
2373 
2374   void addLiveOut(PHINode *PN, VPValue *V);
2375 
2376   void clearLiveOuts() {
2377     for (auto &KV : LiveOuts)
2378       delete KV.second;
2379     LiveOuts.clear();
2380   }
2381 
2382   void removeLiveOut(PHINode *PN) {
2383     delete LiveOuts[PN];
2384     LiveOuts.erase(PN);
2385   }
2386 
2387   const MapVector<PHINode *, VPLiveOut *> &getLiveOuts() const {
2388     return LiveOuts;
2389   }
2390 
2391 private:
2392   /// Add to the given dominator tree the header block and every new basic block
2393   /// that was created between it and the latch block, inclusive.
2394   static void updateDominatorTree(DominatorTree *DT, BasicBlock *LoopLatchBB,
2395                                   BasicBlock *LoopPreHeaderBB,
2396                                   BasicBlock *LoopExitBB);
2397 };
2398 
2399 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2400 /// VPlanPrinter prints a given VPlan to a given output stream. The printing is
2401 /// indented and follows the dot format.
2402 class VPlanPrinter {
2403   raw_ostream &OS;
2404   const VPlan &Plan;
2405   unsigned Depth = 0;
2406   unsigned TabWidth = 2;
2407   std::string Indent;
2408   unsigned BID = 0;
2409   SmallDenseMap<const VPBlockBase *, unsigned> BlockID;
2410 
2411   VPSlotTracker SlotTracker;
2412 
2413   /// Handle indentation.
2414   void bumpIndent(int b) { Indent = std::string((Depth += b) * TabWidth, ' '); }
2415 
2416   /// Print a given \p Block of the Plan.
2417   void dumpBlock(const VPBlockBase *Block);
2418 
2419   /// Print the information related to the CFG edges going out of a given
2420   /// \p Block, followed by printing the successor blocks themselves.
2421   void dumpEdges(const VPBlockBase *Block);
2422 
2423   /// Print a given \p BasicBlock, including its VPRecipes, followed by printing
2424   /// its successor blocks.
2425   void dumpBasicBlock(const VPBasicBlock *BasicBlock);
2426 
2427   /// Print a given \p Region of the Plan.
2428   void dumpRegion(const VPRegionBlock *Region);
2429 
2430   unsigned getOrCreateBID(const VPBlockBase *Block) {
2431     return BlockID.count(Block) ? BlockID[Block] : BlockID[Block] = BID++;
2432   }
2433 
2434   Twine getOrCreateName(const VPBlockBase *Block);
2435 
2436   Twine getUID(const VPBlockBase *Block);
2437 
2438   /// Print the information related to a CFG edge between two VPBlockBases.
2439   void drawEdge(const VPBlockBase *From, const VPBlockBase *To, bool Hidden,
2440                 const Twine &Label);
2441 
2442 public:
2443   VPlanPrinter(raw_ostream &O, const VPlan &P)
2444       : OS(O), Plan(P), SlotTracker(&P) {}
2445 
2446   LLVM_DUMP_METHOD void dump();
2447 };
2448 
2449 struct VPlanIngredient {
2450   const Value *V;
2451 
2452   VPlanIngredient(const Value *V) : V(V) {}
2453 
2454   void print(raw_ostream &O) const;
2455 };
2456 
2457 inline raw_ostream &operator<<(raw_ostream &OS, const VPlanIngredient &I) {
2458   I.print(OS);
2459   return OS;
2460 }
2461 
2462 inline raw_ostream &operator<<(raw_ostream &OS, const VPlan &Plan) {
2463   Plan.print(OS);
2464   return OS;
2465 }
2466 #endif
2467 
2468 //===----------------------------------------------------------------------===//
2469 // VPlan Utilities
2470 //===----------------------------------------------------------------------===//
2471 
2472 /// Class that provides utilities for VPBlockBases in VPlan.
2473 class VPBlockUtils {
2474 public:
2475   VPBlockUtils() = delete;
2476 
2477   /// Insert disconnected VPBlockBase \p NewBlock after \p BlockPtr. Add \p
2478   /// NewBlock as successor of \p BlockPtr and \p BlockPtr as predecessor of \p
2479   /// NewBlock, and propagate \p BlockPtr parent to \p NewBlock. \p BlockPtr's
2480   /// successors are moved from \p BlockPtr to \p NewBlock. \p NewBlock must
2481   /// have neither successors nor predecessors.
2482   static void insertBlockAfter(VPBlockBase *NewBlock, VPBlockBase *BlockPtr) {
2483     assert(NewBlock->getSuccessors().empty() &&
2484            NewBlock->getPredecessors().empty() &&
2485            "Can't insert new block with predecessors or successors.");
2486     NewBlock->setParent(BlockPtr->getParent());
2487     SmallVector<VPBlockBase *> Succs(BlockPtr->successors());
2488     for (VPBlockBase *Succ : Succs) {
2489       disconnectBlocks(BlockPtr, Succ);
2490       connectBlocks(NewBlock, Succ);
2491     }
2492     connectBlocks(BlockPtr, NewBlock);
2493   }
2494 
2495   /// Insert disconnected VPBlockBases \p IfTrue and \p IfFalse after \p
2496   /// BlockPtr. Add \p IfTrue and \p IfFalse as succesors of \p BlockPtr and \p
2497   /// BlockPtr as predecessor of \p IfTrue and \p IfFalse. Propagate \p BlockPtr
2498   /// parent to \p IfTrue and \p IfFalse. \p BlockPtr must have no successors
2499   /// and \p IfTrue and \p IfFalse must have neither successors nor
2500   /// predecessors.
2501   static void insertTwoBlocksAfter(VPBlockBase *IfTrue, VPBlockBase *IfFalse,
2502                                    VPBlockBase *BlockPtr) {
2503     assert(IfTrue->getSuccessors().empty() &&
2504            "Can't insert IfTrue with successors.");
2505     assert(IfFalse->getSuccessors().empty() &&
2506            "Can't insert IfFalse with successors.");
2507     BlockPtr->setTwoSuccessors(IfTrue, IfFalse);
2508     IfTrue->setPredecessors({BlockPtr});
2509     IfFalse->setPredecessors({BlockPtr});
2510     IfTrue->setParent(BlockPtr->getParent());
2511     IfFalse->setParent(BlockPtr->getParent());
2512   }
2513 
2514   /// Connect VPBlockBases \p From and \p To bi-directionally. Append \p To to
2515   /// the successors of \p From and \p From to the predecessors of \p To. Both
2516   /// VPBlockBases must have the same parent, which can be null. Both
2517   /// VPBlockBases can be already connected to other VPBlockBases.
2518   static void connectBlocks(VPBlockBase *From, VPBlockBase *To) {
2519     assert((From->getParent() == To->getParent()) &&
2520            "Can't connect two block with different parents");
2521     assert(From->getNumSuccessors() < 2 &&
2522            "Blocks can't have more than two successors.");
2523     From->appendSuccessor(To);
2524     To->appendPredecessor(From);
2525   }
2526 
2527   /// Disconnect VPBlockBases \p From and \p To bi-directionally. Remove \p To
2528   /// from the successors of \p From and \p From from the predecessors of \p To.
2529   static void disconnectBlocks(VPBlockBase *From, VPBlockBase *To) {
2530     assert(To && "Successor to disconnect is null.");
2531     From->removeSuccessor(To);
2532     To->removePredecessor(From);
2533   }
2534 
2535   /// Return an iterator range over \p Range which only includes \p BlockTy
2536   /// blocks. The accesses are casted to \p BlockTy.
2537   template <typename BlockTy, typename T>
2538   static auto blocksOnly(const T &Range) {
2539     // Create BaseTy with correct const-ness based on BlockTy.
2540     using BaseTy = std::conditional_t<std::is_const<BlockTy>::value,
2541                                       const VPBlockBase, VPBlockBase>;
2542 
2543     // We need to first create an iterator range over (const) BlocktTy & instead
2544     // of (const) BlockTy * for filter_range to work properly.
2545     auto Mapped =
2546         map_range(Range, [](BaseTy *Block) -> BaseTy & { return *Block; });
2547     auto Filter = make_filter_range(
2548         Mapped, [](BaseTy &Block) { return isa<BlockTy>(&Block); });
2549     return map_range(Filter, [](BaseTy &Block) -> BlockTy * {
2550       return cast<BlockTy>(&Block);
2551     });
2552   }
2553 };
2554 
2555 class VPInterleavedAccessInfo {
2556   DenseMap<VPInstruction *, InterleaveGroup<VPInstruction> *>
2557       InterleaveGroupMap;
2558 
2559   /// Type for mapping of instruction based interleave groups to VPInstruction
2560   /// interleave groups
2561   using Old2NewTy = DenseMap<InterleaveGroup<Instruction> *,
2562                              InterleaveGroup<VPInstruction> *>;
2563 
2564   /// Recursively \p Region and populate VPlan based interleave groups based on
2565   /// \p IAI.
2566   void visitRegion(VPRegionBlock *Region, Old2NewTy &Old2New,
2567                    InterleavedAccessInfo &IAI);
2568   /// Recursively traverse \p Block and populate VPlan based interleave groups
2569   /// based on \p IAI.
2570   void visitBlock(VPBlockBase *Block, Old2NewTy &Old2New,
2571                   InterleavedAccessInfo &IAI);
2572 
2573 public:
2574   VPInterleavedAccessInfo(VPlan &Plan, InterleavedAccessInfo &IAI);
2575 
2576   ~VPInterleavedAccessInfo() {
2577     SmallPtrSet<InterleaveGroup<VPInstruction> *, 4> DelSet;
2578     // Avoid releasing a pointer twice.
2579     for (auto &I : InterleaveGroupMap)
2580       DelSet.insert(I.second);
2581     for (auto *Ptr : DelSet)
2582       delete Ptr;
2583   }
2584 
2585   /// Get the interleave group that \p Instr belongs to.
2586   ///
2587   /// \returns nullptr if doesn't have such group.
2588   InterleaveGroup<VPInstruction> *
2589   getInterleaveGroup(VPInstruction *Instr) const {
2590     return InterleaveGroupMap.lookup(Instr);
2591   }
2592 };
2593 
2594 /// Class that maps (parts of) an existing VPlan to trees of combined
2595 /// VPInstructions.
2596 class VPlanSlp {
2597   enum class OpMode { Failed, Load, Opcode };
2598 
2599   /// A DenseMapInfo implementation for using SmallVector<VPValue *, 4> as
2600   /// DenseMap keys.
2601   struct BundleDenseMapInfo {
2602     static SmallVector<VPValue *, 4> getEmptyKey() {
2603       return {reinterpret_cast<VPValue *>(-1)};
2604     }
2605 
2606     static SmallVector<VPValue *, 4> getTombstoneKey() {
2607       return {reinterpret_cast<VPValue *>(-2)};
2608     }
2609 
2610     static unsigned getHashValue(const SmallVector<VPValue *, 4> &V) {
2611       return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
2612     }
2613 
2614     static bool isEqual(const SmallVector<VPValue *, 4> &LHS,
2615                         const SmallVector<VPValue *, 4> &RHS) {
2616       return LHS == RHS;
2617     }
2618   };
2619 
2620   /// Mapping of values in the original VPlan to a combined VPInstruction.
2621   DenseMap<SmallVector<VPValue *, 4>, VPInstruction *, BundleDenseMapInfo>
2622       BundleToCombined;
2623 
2624   VPInterleavedAccessInfo &IAI;
2625 
2626   /// Basic block to operate on. For now, only instructions in a single BB are
2627   /// considered.
2628   const VPBasicBlock &BB;
2629 
2630   /// Indicates whether we managed to combine all visited instructions or not.
2631   bool CompletelySLP = true;
2632 
2633   /// Width of the widest combined bundle in bits.
2634   unsigned WidestBundleBits = 0;
2635 
2636   using MultiNodeOpTy =
2637       typename std::pair<VPInstruction *, SmallVector<VPValue *, 4>>;
2638 
2639   // Input operand bundles for the current multi node. Each multi node operand
2640   // bundle contains values not matching the multi node's opcode. They will
2641   // be reordered in reorderMultiNodeOps, once we completed building a
2642   // multi node.
2643   SmallVector<MultiNodeOpTy, 4> MultiNodeOps;
2644 
2645   /// Indicates whether we are building a multi node currently.
2646   bool MultiNodeActive = false;
2647 
2648   /// Check if we can vectorize Operands together.
2649   bool areVectorizable(ArrayRef<VPValue *> Operands) const;
2650 
2651   /// Add combined instruction \p New for the bundle \p Operands.
2652   void addCombined(ArrayRef<VPValue *> Operands, VPInstruction *New);
2653 
2654   /// Indicate we hit a bundle we failed to combine. Returns nullptr for now.
2655   VPInstruction *markFailed();
2656 
2657   /// Reorder operands in the multi node to maximize sequential memory access
2658   /// and commutative operations.
2659   SmallVector<MultiNodeOpTy, 4> reorderMultiNodeOps();
2660 
2661   /// Choose the best candidate to use for the lane after \p Last. The set of
2662   /// candidates to choose from are values with an opcode matching \p Last's
2663   /// or loads consecutive to \p Last.
2664   std::pair<OpMode, VPValue *> getBest(OpMode Mode, VPValue *Last,
2665                                        SmallPtrSetImpl<VPValue *> &Candidates,
2666                                        VPInterleavedAccessInfo &IAI);
2667 
2668 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
2669   /// Print bundle \p Values to dbgs().
2670   void dumpBundle(ArrayRef<VPValue *> Values);
2671 #endif
2672 
2673 public:
2674   VPlanSlp(VPInterleavedAccessInfo &IAI, VPBasicBlock &BB) : IAI(IAI), BB(BB) {}
2675 
2676   ~VPlanSlp() = default;
2677 
2678   /// Tries to build an SLP tree rooted at \p Operands and returns a
2679   /// VPInstruction combining \p Operands, if they can be combined.
2680   VPInstruction *buildGraph(ArrayRef<VPValue *> Operands);
2681 
2682   /// Return the width of the widest combined bundle in bits.
2683   unsigned getWidestBundleBits() const { return WidestBundleBits; }
2684 
2685   /// Return true if all visited instruction can be combined.
2686   bool isCompletelySLP() const { return CompletelySLP; }
2687 };
2688 
2689 namespace vputils {
2690 
2691 /// Returns true if only the first lane of \p Def is used.
2692 bool onlyFirstLaneUsed(VPValue *Def);
2693 
2694 /// Get or create a VPValue that corresponds to the expansion of \p Expr. If \p
2695 /// Expr is a SCEVConstant or SCEVUnknown, return a VPValue wrapping the live-in
2696 /// value. Otherwise return a VPExpandSCEVRecipe to expand \p Expr. If \p Plan's
2697 /// pre-header already contains a recipe expanding \p Expr, return it. If not,
2698 /// create a new one.
2699 VPValue *getOrCreateVPValueForSCEVExpr(VPlan &Plan, const SCEV *Expr,
2700                                        ScalarEvolution &SE);
2701 
2702 /// Returns true if \p VPV is uniform after vectorization.
2703 inline bool isUniformAfterVectorization(VPValue *VPV) {
2704   // A value defined outside the vector region must be uniform after
2705   // vectorization inside a vector region.
2706   if (VPV->isDefinedOutsideVectorRegions())
2707     return true;
2708   VPRecipeBase *Def = VPV->getDefiningRecipe();
2709   assert(Def && "Must have definition for value defined inside vector region");
2710   if (auto Rep = dyn_cast<VPReplicateRecipe>(Def))
2711     return Rep->isUniform();
2712   return false;
2713 }
2714 } // end namespace vputils
2715 
2716 } // end namespace llvm
2717 
2718 #endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_H
2719