//===- BasicTTIImpl.h -------------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file provides a helper that implements much of the TTI interface in
/// terms of the target-independent code generator and TargetLowering
/// interfaces.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_CODEGEN_BASICTTIIMPL_H
#define LLVM_CODEGEN_BASICTTIIMPL_H

#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/TargetTransformInfoImpl.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <limits>
#include <utility>

namespace llvm {

class Function;
class GlobalValue;
class LLVMContext;
class ScalarEvolution;
class SCEV;
class TargetMachine;

extern cl::opt<unsigned> PartialUnrollingThreshold;

/// Base class which can be used to help build a TTI implementation.
///
/// This class provides as much implementation of the TTI interface as is
/// possible using the target independent parts of the code generator.
///
/// In order to subclass it, your class must implement a getST() method to
/// return the subtarget, and a getTLI() method to return the target lowering.
/// We need these methods implemented in the derived class so that this class
/// doesn't have to duplicate storage for them.
template <typename T>
class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
private:
  using BaseT = TargetTransformInfoImplCRTPBase<T>;
  using TTI = TargetTransformInfo;

  /// Helper function to access this as a T.
  T *thisT() { return static_cast<T *>(this); }

  /// Estimate a cost of Broadcast as an extract and sequence of insert
  /// operations.
  unsigned getBroadcastShuffleOverhead(FixedVectorType *VTy) {
    unsigned Cost = 0;
    // Broadcast cost is equal to the cost of extracting the zero'th element
    // plus the cost of inserting it into every element of the result vector.
    Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, 0);

    for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
      Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, i);
    }
    return Cost;
  }

  /// Estimate a cost of shuffle as a sequence of extract and insert
  /// operations.
  unsigned getPermuteShuffleOverhead(FixedVectorType *VTy) {
    unsigned Cost = 0;
    // Shuffle cost is equal to the cost of extracting elements from its
    // arguments plus the cost of inserting them into the result vector.

    // e.g. a <4 x float> shuffle with a mask of <0,5,2,7> extracts index 0 of
    // the first vector, index 1 of the second vector, index 2 of the first
    // vector and index 3 of the second vector, and inserts them at indices
    // <0,1,2,3> of the result vector.
    for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
      Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, i);
      Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, i);
    }
    return Cost;
  }

  /// Estimate a cost of subvector extraction as a sequence of extract and
  /// insert operations.
  unsigned getExtractSubvectorOverhead(VectorType *VTy, int Index,
                                       FixedVectorType *SubVTy) {
    assert(VTy && SubVTy &&
           "Can only extract subvectors from vectors");
    int NumSubElts = SubVTy->getNumElements();
    assert((!isa<FixedVectorType>(VTy) ||
            (Index + NumSubElts) <=
                (int)cast<FixedVectorType>(VTy)->getNumElements()) &&
           "SK_ExtractSubvector index out of range");

    unsigned Cost = 0;
    // Subvector extraction cost is equal to the cost of extracting elements
    // from the source type plus the cost of inserting them into the result
    // vector type.
    for (int i = 0; i != NumSubElts; ++i) {
      Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
                                          i + Index);
      Cost +=
          thisT()->getVectorInstrCost(Instruction::InsertElement, SubVTy, i);
    }
    return Cost;
  }

  /// Estimate a cost of subvector insertion as a sequence of extract and
  /// insert operations.
  unsigned getInsertSubvectorOverhead(VectorType *VTy, int Index,
                                      FixedVectorType *SubVTy) {
    assert(VTy && SubVTy &&
           "Can only insert subvectors into vectors");
    int NumSubElts = SubVTy->getNumElements();
    assert((!isa<FixedVectorType>(VTy) ||
            (Index + NumSubElts) <=
                (int)cast<FixedVectorType>(VTy)->getNumElements()) &&
           "SK_InsertSubvector index out of range");

    unsigned Cost = 0;
    // Subvector insertion cost is equal to the cost of extracting elements
    // from the subvector type plus the cost of inserting them into the result
    // vector type.
    for (int i = 0; i != NumSubElts; ++i) {
      Cost +=
          thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVTy, i);
      Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy,
                                          i + Index);
    }
    return Cost;
  }

  /// Local query method delegates up to T which *must* implement this!
  const TargetSubtargetInfo *getST() const {
    return static_cast<const T *>(this)->getST();
  }

  /// Local query method delegates up to T which *must* implement this!
  const TargetLoweringBase *getTLI() const {
    return static_cast<const T *>(this)->getTLI();
  }

  static ISD::MemIndexedMode getISDIndexedMode(TTI::MemIndexedMode M) {
    switch (M) {
      case TTI::MIM_Unindexed:
        return ISD::UNINDEXED;
      case TTI::MIM_PreInc:
        return ISD::PRE_INC;
      case TTI::MIM_PreDec:
        return ISD::PRE_DEC;
      case TTI::MIM_PostInc:
        return ISD::POST_INC;
      case TTI::MIM_PostDec:
        return ISD::POST_DEC;
    }
    llvm_unreachable("Unexpected MemIndexedMode");
  }

protected:
  explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL)
      : BaseT(DL) {}
  virtual ~BasicTTIImplBase() = default;

  using TargetTransformInfoImplBase::DL;

public:
  /// \name Scalar TTI Implementations
  /// @{
  bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth,
                                      unsigned AddressSpace, unsigned Alignment,
                                      bool *Fast) const {
    EVT E = EVT::getIntegerVT(Context, BitWidth);
    return getTLI()->allowsMisalignedMemoryAccesses(
        E, AddressSpace, Alignment, MachineMemOperand::MONone, Fast);
  }

  bool hasBranchDivergence() { return false; }

  bool useGPUDivergenceAnalysis() { return false; }

  bool isSourceOfDivergence(const Value *V) { return false; }

  bool isAlwaysUniform(const Value *V) { return false; }

  unsigned getFlatAddressSpace() {
    // Return an invalid address space.
    return -1;
  }

  bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
                                  Intrinsic::ID IID) const {
    return false;
  }

  bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const {
    return getTLI()->getTargetMachine().isNoopAddrSpaceCast(FromAS, ToAS);
  }

  unsigned getAssumedAddrSpace(const Value *V) const {
    return getTLI()->getTargetMachine().getAssumedAddrSpace(V);
  }

  Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
                                          Value *NewV) const {
    return nullptr;
  }

  bool isLegalAddImmediate(int64_t imm) {
    return getTLI()->isLegalAddImmediate(imm);
  }

  bool isLegalICmpImmediate(int64_t imm) {
    return getTLI()->isLegalICmpImmediate(imm);
  }

  bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
                             bool HasBaseReg, int64_t Scale,
                             unsigned AddrSpace, Instruction *I = nullptr) {
    TargetLoweringBase::AddrMode AM;
    AM.BaseGV = BaseGV;
    AM.BaseOffs = BaseOffset;
    AM.HasBaseReg = HasBaseReg;
    AM.Scale = Scale;
    return getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace, I);
  }

  bool isIndexedLoadLegal(TTI::MemIndexedMode M, Type *Ty,
                          const DataLayout &DL) const {
    EVT VT = getTLI()->getValueType(DL, Ty);
    return getTLI()->isIndexedLoadLegal(getISDIndexedMode(M), VT);
  }

  bool isIndexedStoreLegal(TTI::MemIndexedMode M, Type *Ty,
                           const DataLayout &DL) const {
    EVT VT = getTLI()->getValueType(DL, Ty);
    return getTLI()->isIndexedStoreLegal(getISDIndexedMode(M), VT);
  }

  bool isLSRCostLess(TTI::LSRCost C1, TTI::LSRCost C2) {
    return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
  }

  bool isNumRegsMajorCostOfLSR() {
    return TargetTransformInfoImplBase::isNumRegsMajorCostOfLSR();
  }

  bool isProfitableLSRChainElement(Instruction *I) {
    return TargetTransformInfoImplBase::isProfitableLSRChainElement(I);
  }

  int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
                           bool HasBaseReg, int64_t Scale, unsigned AddrSpace) {
    TargetLoweringBase::AddrMode AM;
    AM.BaseGV = BaseGV;
    AM.BaseOffs = BaseOffset;
    AM.HasBaseReg = HasBaseReg;
    AM.Scale = Scale;
    return getTLI()->getScalingFactorCost(DL, AM, Ty, AddrSpace);
  }

  bool isTruncateFree(Type *Ty1, Type *Ty2) {
    return getTLI()->isTruncateFree(Ty1, Ty2);
  }

  bool isProfitableToHoist(Instruction *I) {
    return getTLI()->isProfitableToHoist(I);
  }

  bool useAA() const { return getST()->useAA(); }

  bool isTypeLegal(Type *Ty) {
    EVT VT = getTLI()->getValueType(DL, Ty);
    return getTLI()->isTypeLegal(VT);
  }

  unsigned getRegUsageForType(Type *Ty) {
    return getTLI()->getTypeLegalizationCost(DL, Ty).first;
  }

  int getGEPCost(Type *PointeeType, const Value *Ptr,
                 ArrayRef<const Value *> Operands) {
    return BaseT::getGEPCost(PointeeType, Ptr, Operands);
  }

  unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
                                            unsigned &JumpTableSize,
                                            ProfileSummaryInfo *PSI,
                                            BlockFrequencyInfo *BFI) {
    /// Try to find the estimated number of clusters. Note that the number of
    /// clusters identified in this function could be different from the actual
    /// numbers found in lowering. This function ignores switches that are
    /// lowered with a mix of jump table / bit test / BTree. It was initially
    /// intended for estimating the cost of a switch in the inline cost
    /// heuristic, but it is a generic cost model that can be used in other
    /// places (e.g., in loop unrolling).
    unsigned N = SI.getNumCases();
    const TargetLoweringBase *TLI = getTLI();
    const DataLayout &DL = this->getDataLayout();

    JumpTableSize = 0;
    bool IsJTAllowed = TLI->areJTsAllowed(SI.getParent()->getParent());

    // Early exit if both a jump table and bit test are not allowed.
    if (N < 1 || (!IsJTAllowed && DL.getIndexSizeInBits(0u) < N))
      return N;

    APInt MaxCaseVal = SI.case_begin()->getCaseValue()->getValue();
    APInt MinCaseVal = MaxCaseVal;
    for (auto CI : SI.cases()) {
      const APInt &CaseVal = CI.getCaseValue()->getValue();
      if (CaseVal.sgt(MaxCaseVal))
        MaxCaseVal = CaseVal;
      if (CaseVal.slt(MinCaseVal))
        MinCaseVal = CaseVal;
    }

    // Check if suitable for a bit test
    if (N <= DL.getIndexSizeInBits(0u)) {
      SmallPtrSet<const BasicBlock *, 4> Dests;
      for (auto I : SI.cases())
        Dests.insert(I.getCaseSuccessor());

      if (TLI->isSuitableForBitTests(Dests.size(), N, MinCaseVal, MaxCaseVal,
                                     DL))
        return 1;
    }

    // Check if suitable for a jump table.
    if (IsJTAllowed) {
      if (N < 2 || N < TLI->getMinimumJumpTableEntries())
        return N;
      uint64_t Range =
          (MaxCaseVal - MinCaseVal)
              .getLimitedValue(std::numeric_limits<uint64_t>::max() - 1) + 1;
      // Check whether a range of clusters is dense enough for a jump table
      if (TLI->isSuitableForJumpTable(&SI, N, Range, PSI, BFI)) {
        JumpTableSize = Range;
        return 1;
      }
    }
    return N;
  }

  bool shouldBuildLookupTables() {
    const TargetLoweringBase *TLI = getTLI();
    return TLI->isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) ||
           TLI->isOperationLegalOrCustom(ISD::BRIND, MVT::Other);
  }

  bool haveFastSqrt(Type *Ty) {
    const TargetLoweringBase *TLI = getTLI();
    EVT VT = TLI->getValueType(DL, Ty);
    return TLI->isTypeLegal(VT) &&
           TLI->isOperationLegalOrCustom(ISD::FSQRT, VT);
  }

  bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
    return true;
  }

  unsigned getFPOpCost(Type *Ty) {
    // Check whether FADD is available, as a proxy for floating-point in
    // general.
    const TargetLoweringBase *TLI = getTLI();
    EVT VT = TLI->getValueType(DL, Ty);
    if (TLI->isOperationLegalOrCustomOrPromote(ISD::FADD, VT))
      return TargetTransformInfo::TCC_Basic;
    return TargetTransformInfo::TCC_Expensive;
  }

  unsigned getInliningThresholdMultiplier() { return 1; }
  unsigned adjustInliningThreshold(const CallBase *CB) { return 0; }

  int getInlinerVectorBonusPercent() { return 150; }

  void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                               TTI::UnrollingPreferences &UP) {
    // This unrolling functionality is target independent, but to provide some
    // motivation for its intended use, for x86:

    // According to the Intel 64 and IA-32 Architectures Optimization Reference
    // Manual, Intel Core models and later have a loop stream detector (and
    // associated uop queue) that can benefit from partial unrolling.
    // The relevant requirements are:
    //  - The loop must have no more than 4 (8 for Nehalem and later) branches
    //    taken, and none of them may be calls.
    //  - The loop can have no more than 18 (28 for Nehalem and later) uops.

    // According to the Software Optimization Guide for AMD Family 15h
    // Processors, models 30h-4fh (Steamroller and later) have a loop predictor
    // and loop buffer which can benefit from partial unrolling.
    // The relevant requirements are:
    //  - The loop must have fewer than 16 branches
    //  - The loop must have less than 40 uops in all executed loop branches

    // The number of taken branches in a loop is hard to estimate here, and
    // benchmarking has revealed that it is better not to be conservative when
    // estimating the branch count. As a result, we'll ignore the branch limits
    // until someone finds a case where it matters in practice.

    unsigned MaxOps;
    const TargetSubtargetInfo *ST = getST();
    if (PartialUnrollingThreshold.getNumOccurrences() > 0)
      MaxOps = PartialUnrollingThreshold;
    else if (ST->getSchedModel().LoopMicroOpBufferSize > 0)
      MaxOps = ST->getSchedModel().LoopMicroOpBufferSize;
    else
      return;

    // Scan the loop: don't unroll loops with calls.
    for (BasicBlock *BB : L->blocks()) {
      for (Instruction &I : *BB) {
        if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
          if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
            if (!thisT()->isLoweredToCall(F))
              continue;
          }

          return;
        }
      }
    }

    // Enable runtime and partial unrolling up to the specified size.
    // Enable using trip count upper bound to unroll loops.
    UP.Partial = UP.Runtime = UP.UpperBound = true;
    UP.PartialThreshold = MaxOps;

    // Avoid unrolling when optimizing for size.
    UP.OptSizeThreshold = 0;
    UP.PartialOptSizeThreshold = 0;

    // Set number of instructions optimized when "back edge"
    // becomes "fall through" to default value of 2.
    UP.BEInsns = 2;
  }

  void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                             TTI::PeelingPreferences &PP) {
    PP.PeelCount = 0;
    PP.AllowPeeling = true;
    PP.AllowLoopNestsPeeling = false;
    PP.PeelProfiledIterations = true;
  }

  bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
                                AssumptionCache &AC,
                                TargetLibraryInfo *LibInfo,
                                HardwareLoopInfo &HWLoopInfo) {
    return BaseT::isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
  }

  bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
                                   AssumptionCache &AC, TargetLibraryInfo *TLI,
                                   DominatorTree *DT,
                                   const LoopAccessInfo *LAI) {
    return BaseT::preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LAI);
  }

  bool emitGetActiveLaneMask() {
    return BaseT::emitGetActiveLaneMask();
  }

  Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
                                               IntrinsicInst &II) {
    return BaseT::instCombineIntrinsic(IC, II);
  }

  Optional<Value *> simplifyDemandedUseBitsIntrinsic(InstCombiner &IC,
                                                     IntrinsicInst &II,
                                                     APInt DemandedMask,
                                                     KnownBits &Known,
                                                     bool &KnownBitsComputed) {
    return BaseT::simplifyDemandedUseBitsIntrinsic(IC, II, DemandedMask, Known,
                                                   KnownBitsComputed);
  }

  Optional<Value *> simplifyDemandedVectorEltsIntrinsic(
      InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
      APInt &UndefElts2, APInt &UndefElts3,
      std::function<void(Instruction *, unsigned, APInt, APInt &)>
          SimplifyAndSetOp) {
    return BaseT::simplifyDemandedVectorEltsIntrinsic(
        IC, II, DemandedElts, UndefElts, UndefElts2, UndefElts3,
        SimplifyAndSetOp);
  }

  int getInstructionLatency(const Instruction *I) {
    if (isa<LoadInst>(I))
      return getST()->getSchedModel().DefaultLoadLatency;

    return BaseT::getInstructionLatency(I);
  }

  virtual Optional<unsigned>
  getCacheSize(TargetTransformInfo::CacheLevel Level) const {
    return Optional<unsigned>(
      getST()->getCacheSize(static_cast<unsigned>(Level)));
  }

  virtual Optional<unsigned>
  getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const {
    Optional<unsigned> TargetResult =
        getST()->getCacheAssociativity(static_cast<unsigned>(Level));

    if (TargetResult)
      return TargetResult;

    return BaseT::getCacheAssociativity(Level);
  }

  virtual unsigned getCacheLineSize() const {
    return getST()->getCacheLineSize();
  }

  virtual unsigned getPrefetchDistance() const {
    return getST()->getPrefetchDistance();
  }

  virtual unsigned getMinPrefetchStride(unsigned NumMemAccesses,
                                        unsigned NumStridedMemAccesses,
                                        unsigned NumPrefetches,
                                        bool HasCall) const {
    return getST()->getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses,
                                         NumPrefetches, HasCall);
  }

  virtual unsigned getMaxPrefetchIterationsAhead() const {
    return getST()->getMaxPrefetchIterationsAhead();
  }

  virtual bool enableWritePrefetching() const {
    return getST()->enableWritePrefetching();
  }

  /// @}

  /// \name Vector TTI Implementations
  /// @{

  unsigned getRegisterBitWidth(bool Vector) const { return 32; }

  Optional<unsigned> getMaxVScale() const { return None; }

  /// Estimate the overhead of scalarizing an instruction. Insert and Extract
  /// are set if the demanded result elements need to be inserted and/or
  /// extracted from vectors.
  unsigned getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts,
                                    bool Insert, bool Extract) {
    /// FIXME: a bitfield is not a reasonable abstraction for talking about
    /// which elements are needed from a scalable vector
    auto *Ty = cast<FixedVectorType>(InTy);

    assert(DemandedElts.getBitWidth() == Ty->getNumElements() &&
           "Vector size mismatch");

    unsigned Cost = 0;

    for (int i = 0, e = Ty->getNumElements(); i < e; ++i) {
      if (!DemandedElts[i])
        continue;
      if (Insert)
        Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, Ty, i);
      if (Extract)
        Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, i);
    }

    return Cost;
  }

  /// Helper wrapper for the DemandedElts variant of getScalarizationOverhead.
  unsigned getScalarizationOverhead(VectorType *InTy, bool Insert,
                                    bool Extract) {
    auto *Ty = cast<FixedVectorType>(InTy);

    APInt DemandedElts = APInt::getAllOnesValue(Ty->getNumElements());
    return thisT()->getScalarizationOverhead(Ty, DemandedElts, Insert, Extract);
  }

  /// Estimate the overhead of scalarizing an instruction's unique
  /// non-constant operands. The types of the arguments are ordinarily
  /// scalar, in which case the costs are multiplied with VF.
  unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
                                            unsigned VF) {
    unsigned Cost = 0;
    SmallPtrSet<const Value*, 4> UniqueOperands;
    for (const Value *A : Args) {
      // Disregard things like metadata arguments.
      Type *Ty = A->getType();
      if (!Ty->isIntOrIntVectorTy() && !Ty->isFPOrFPVectorTy() &&
          !Ty->isPtrOrPtrVectorTy())
        continue;

      if (!isa<Constant>(A) && UniqueOperands.insert(A).second) {
        auto *VecTy = dyn_cast<VectorType>(Ty);
        if (VecTy) {
          // If A is a vector operand, VF should be 1 or correspond to A.
          assert((VF == 1 ||
                  VF == cast<FixedVectorType>(VecTy)->getNumElements()) &&
                 "Vector argument does not match VF");
        }
        else
          VecTy = FixedVectorType::get(Ty, VF);

        Cost += getScalarizationOverhead(VecTy, false, true);
      }
    }

    return Cost;
  }

  unsigned getScalarizationOverhead(VectorType *InTy,
                                    ArrayRef<const Value *> Args) {
    auto *Ty = cast<FixedVectorType>(InTy);

    unsigned Cost = 0;

    Cost += getScalarizationOverhead(Ty, true, false);
    if (!Args.empty())
      Cost += getOperandsScalarizationOverhead(Args, Ty->getNumElements());
    else
      // When no information on arguments is provided, we add the cost
      // associated with one argument as a heuristic.
      Cost += getScalarizationOverhead(Ty, false, true);

    return Cost;
  }

  unsigned getMaxInterleaveFactor(unsigned VF) { return 1; }

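  /// Estimate the cost of an arithmetic instruction of the given opcode on
  /// type \p Ty: legal or custom-lowered operations are charged per legalized
  /// register, while operations the target must expand are costed as a full
  /// scalarization (per-element ops plus insert/extract overhead).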
  unsigned getArithmeticInstrCost(
      unsigned Opcode, Type *Ty,
      TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
      TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
      TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
      TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
      TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
      ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
      const Instruction *CxtI = nullptr) {
    // Check if any of the operands are vector operands.
    const TargetLoweringBase *TLI = getTLI();
    int ISD = TLI->InstructionOpcodeToISD(Opcode);
    assert(ISD && "Invalid opcode");

    // TODO: Handle more cost kinds.
    if (CostKind != TTI::TCK_RecipThroughput)
      return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind,
                                           Opd1Info, Opd2Info,
                                           Opd1PropInfo, Opd2PropInfo,
                                           Args, CxtI);

    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);

    bool IsFloat = Ty->isFPOrFPVectorTy();
    // Assume that floating point arithmetic operations cost twice as much as
    // integer operations.
    unsigned OpCost = (IsFloat ? 2 : 1);

    if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
      // The operation is legal. Assume it costs 1.
      // TODO: Once we have extract/insert subvector cost we need to use them.
      return LT.first * OpCost;
    }

    if (!TLI->isOperationExpand(ISD, LT.second)) {
      // If the operation is custom lowered, then assume that the code is twice
      // as expensive.
      return LT.first * 2 * OpCost;
    }

    // Else, assume that we need to scalarize this op.
    // TODO: If one of the types gets legalized by splitting, handle this
    // similarly to what getCastInstrCost() does.
    if (auto *VTy = dyn_cast<VectorType>(Ty)) {
      unsigned Num = cast<FixedVectorType>(VTy)->getNumElements();
      unsigned Cost = thisT()->getArithmeticInstrCost(
          Opcode, VTy->getScalarType(), CostKind, Opd1Info, Opd2Info,
          Opd1PropInfo, Opd2PropInfo, Args, CxtI);
      // Return the cost of multiple scalar invocations plus the cost of
      // inserting and extracting the values.
      return getScalarizationOverhead(VTy, Args) + Num * Cost;
    }

    // We don't know anything about this scalar instruction.
    return OpCost;
  }

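  /// Estimate shuffle costs by lowering each shuffle kind to the extract/insert
  /// based helpers above.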
  unsigned getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, int Index,
                          VectorType *SubTp) {

    switch (Kind) {
    case TTI::SK_Broadcast:
      return getBroadcastShuffleOverhead(cast<FixedVectorType>(Tp));
    case TTI::SK_Select:
    case TTI::SK_Reverse:
    case TTI::SK_Transpose:
    case TTI::SK_PermuteSingleSrc:
    case TTI::SK_PermuteTwoSrc:
      return getPermuteShuffleOverhead(cast<FixedVectorType>(Tp));
    case TTI::SK_ExtractSubvector:
      return getExtractSubvectorOverhead(Tp, Index,
                                         cast<FixedVectorType>(SubTp));
    case TTI::SK_InsertSubvector:
      return getInsertSubvectorOverhead(Tp, Index,
                                        cast<FixedVectorType>(SubTp));
    }
    llvm_unreachable("Unknown TTI::ShuffleKind");
  }

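  /// Estimate the cost of a cast: no-op and free conversions cost 0, casts
  /// that legalize cleanly are charged per legalized register, and illegal
  /// vector casts fall back to splitting or full scalarization.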
  unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
                            TTI::CastContextHint CCH,
                            TTI::TargetCostKind CostKind,
                            const Instruction *I = nullptr) {
    if (BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I) == 0)
      return 0;

    const TargetLoweringBase *TLI = getTLI();
    int ISD = TLI->InstructionOpcodeToISD(Opcode);
    assert(ISD && "Invalid opcode");
    std::pair<unsigned, MVT> SrcLT = TLI->getTypeLegalizationCost(DL, Src);
    std::pair<unsigned, MVT> DstLT = TLI->getTypeLegalizationCost(DL, Dst);

    TypeSize SrcSize = SrcLT.second.getSizeInBits();
    TypeSize DstSize = DstLT.second.getSizeInBits();
    bool IntOrPtrSrc = Src->isIntegerTy() || Src->isPointerTy();
    bool IntOrPtrDst = Dst->isIntegerTy() || Dst->isPointerTy();

    switch (Opcode) {
    default:
      break;
    case Instruction::Trunc:
      // Check for NOOP conversions.
      if (TLI->isTruncateFree(SrcLT.second, DstLT.second))
        return 0;
      LLVM_FALLTHROUGH;
    case Instruction::BitCast:
      // Bitcasts between types that are legalized to the same type are free,
      // and assume int to/from ptr of the same size is also free.
      if (SrcLT.first == DstLT.first && IntOrPtrSrc == IntOrPtrDst &&
          SrcSize == DstSize)
        return 0;
      break;
    case Instruction::FPExt:
      if (I && getTLI()->isExtFree(I))
        return 0;
      break;
    case Instruction::ZExt:
      if (TLI->isZExtFree(SrcLT.second, DstLT.second))
        return 0;
      LLVM_FALLTHROUGH;
    case Instruction::SExt:
      if (I && getTLI()->isExtFree(I))
        return 0;

      // If this is a zext/sext of a load, return 0 if the corresponding
      // extending load exists on target.
      if (CCH == TTI::CastContextHint::Normal) {
        EVT ExtVT = EVT::getEVT(Dst);
        EVT LoadVT = EVT::getEVT(Src);
        unsigned LType =
          ((Opcode == Instruction::ZExt) ? ISD::ZEXTLOAD : ISD::SEXTLOAD);
        if (TLI->isLoadExtLegal(LType, ExtVT, LoadVT))
          return 0;
      }
      break;
    case Instruction::AddrSpaceCast:
      if (TLI->isFreeAddrSpaceCast(Src->getPointerAddressSpace(),
                                   Dst->getPointerAddressSpace()))
        return 0;
      break;
    }

    auto *SrcVTy = dyn_cast<VectorType>(Src);
    auto *DstVTy = dyn_cast<VectorType>(Dst);

    // If the cast is marked as legal (or promote) then assume low cost.
    if (SrcLT.first == DstLT.first &&
        TLI->isOperationLegalOrPromote(ISD, DstLT.second))
      return SrcLT.first;

    // Handle scalar conversions.
    if (!SrcVTy && !DstVTy) {
      // Just check the op cost. If the operation is legal then assume it costs
      // 1.
      if (!TLI->isOperationExpand(ISD, DstLT.second))
        return 1;

      // Assume that illegal scalar instructions are expensive.
      return 4;
    }

    // Check vector-to-vector casts.
    if (DstVTy && SrcVTy) {
      // If the cast is between same-sized registers, then the check is simple.
      if (SrcLT.first == DstLT.first && SrcSize == DstSize) {

        // Assume that Zext is done using AND.
        if (Opcode == Instruction::ZExt)
          return SrcLT.first;

        // Assume that sext is done using SHL and SRA.
        if (Opcode == Instruction::SExt)
          return SrcLT.first * 2;

        // Just check the op cost. If the operation is legal then assume it
        // costs 1 and multiply by the type-legalization overhead.
        if (!TLI->isOperationExpand(ISD, DstLT.second))
          return SrcLT.first * 1;
      }

      // If we are legalizing by splitting, query the concrete TTI for the cost
      // of casting the original vector twice. We also need to factor in the
      // cost of the split itself. Count that as 1, to be consistent with
      // TLI->getTypeLegalizationCost().
      bool SplitSrc =
          TLI->getTypeAction(Src->getContext(), TLI->getValueType(DL, Src)) ==
          TargetLowering::TypeSplitVector;
      bool SplitDst =
          TLI->getTypeAction(Dst->getContext(), TLI->getValueType(DL, Dst)) ==
          TargetLowering::TypeSplitVector;
      if ((SplitSrc || SplitDst) &&
          cast<FixedVectorType>(SrcVTy)->getNumElements() > 1 &&
          cast<FixedVectorType>(DstVTy)->getNumElements() > 1) {
        Type *SplitDstTy = VectorType::getHalfElementsVectorType(DstVTy);
        Type *SplitSrcTy = VectorType::getHalfElementsVectorType(SrcVTy);
        T *TTI = static_cast<T *>(this);
        // If both types need to be split then the split is free.
        unsigned SplitCost =
            (!SplitSrc || !SplitDst) ? TTI->getVectorSplitCost() : 0;
        return SplitCost +
               (2 * TTI->getCastInstrCost(Opcode, SplitDstTy, SplitSrcTy, CCH,
                                          CostKind, I));
      }

      // In other cases where the source or destination are illegal, assume
      // the operation will get scalarized.
      unsigned Num = cast<FixedVectorType>(DstVTy)->getNumElements();
      unsigned Cost = thisT()->getCastInstrCost(
          Opcode, Dst->getScalarType(), Src->getScalarType(), CCH, CostKind, I);

      // Return the cost of multiple scalar invocations plus the cost of
      // inserting and extracting the values.
      return getScalarizationOverhead(DstVTy, true, true) + Num * Cost;
    }

    // We already handled vector-to-vector and scalar-to-scalar conversions.
    // This is where we handle bitcasts between vectors and scalars. We need to
    // assume that the conversion is scalarized in one way or another.
    if (Opcode == Instruction::BitCast) {
      // Illegal bitcasts are done by storing and loading from a stack slot.
      return (SrcVTy ? getScalarizationOverhead(SrcVTy, false, true) : 0) +
             (DstVTy ? getScalarizationOverhead(DstVTy, true, false) : 0);
    }

    llvm_unreachable("Unhandled cast");
  }

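  /// Cost of extracting a single vector element and extending it to \p Dst.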
  unsigned getExtractWithExtendCost(unsigned Opcode, Type *Dst,
                                    VectorType *VecTy, unsigned Index) {
    return thisT()->getVectorInstrCost(Instruction::ExtractElement, VecTy,
                                       Index) +
           thisT()->getCastInstrCost(Opcode, Dst, VecTy->getElementType(),
                                     TTI::CastContextHint::None,
                                     TTI::TCK_RecipThroughput);
  }

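  /// Cost of a control-flow related instruction; this base version simply
  /// defers to the default implementation.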
  unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind) {
    return BaseT::getCFInstrCost(Opcode, CostKind);
  }

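  /// Estimate the cost of a compare or select. Vector selects are modelled as
  /// VSELECT; vector types the target cannot legalize directly are costed as
  /// scalarized compares/selects plus insertion of the results.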
  unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
                              CmpInst::Predicate VecPred,
                              TTI::TargetCostKind CostKind,
                              const Instruction *I = nullptr) {
    const TargetLoweringBase *TLI = getTLI();
    int ISD = TLI->InstructionOpcodeToISD(Opcode);
    assert(ISD && "Invalid opcode");

    // TODO: Handle other cost kinds.
    if (CostKind != TTI::TCK_RecipThroughput)
      return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                       I);

    // Selects on vectors are actually vector selects.
    if (ISD == ISD::SELECT) {
      assert(CondTy && "CondTy must exist");
      if (CondTy->isVectorTy())
        ISD = ISD::VSELECT;
    }
    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);

    if (!(ValTy->isVectorTy() && !LT.second.isVector()) &&
        !TLI->isOperationExpand(ISD, LT.second)) {
      // The operation is legal. Assume it costs 1. Multiply
      // by the type-legalization overhead.
      return LT.first * 1;
    }

    // Otherwise, assume that the operation is scalarized.
    // TODO: If one of the types gets legalized by splitting, handle this
    // similarly to what getCastInstrCost() does.
    if (auto *ValVTy = dyn_cast<VectorType>(ValTy)) {
      unsigned Num = cast<FixedVectorType>(ValVTy)->getNumElements();
      if (CondTy)
        CondTy = CondTy->getScalarType();
      unsigned Cost = thisT()->getCmpSelInstrCost(
          Opcode, ValVTy->getScalarType(), CondTy, VecPred, CostKind, I);

      // Return the cost of multiple scalar invocations plus the cost of
      // inserting and extracting the values.
      return getScalarizationOverhead(ValVTy, true, false) + Num * Cost;
    }

    // Unknown scalar opcode.
    return 1;
  }

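  /// Cost of a single insertelement/extractelement, approximated by the
  /// type-legalization cost of the scalar element type.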
  unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
    std::pair<unsigned, MVT> LT =
        getTLI()->getTypeLegalizationCost(DL, Val->getScalarType());

    return LT.first;
  }

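  /// Estimate the cost of a load or store. Legal types cost one unit per
  /// legalized register; vector accesses whose extending load / truncating
  /// store is not legal additionally pay the scalarization overhead of
  /// building or decomposing the vector.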
  unsigned getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,
                           unsigned AddressSpace,
                           TTI::TargetCostKind CostKind,
                           const Instruction *I = nullptr) {
    assert(!Src->isVoidTy() && "Invalid type");
    // Assume types, such as structs, are expensive.
    if (getTLI()->getValueType(DL, Src, true) == MVT::Other)
      return 4;
    std::pair<unsigned, MVT> LT = getTLI()->getTypeLegalizationCost(DL, Src);

    // Assuming that all loads of legal types cost 1.
    unsigned Cost = LT.first;
    if (CostKind != TTI::TCK_RecipThroughput)
      return Cost;

    if (Src->isVectorTy() &&
        // In practice it's not currently possible to have a change in lane
        // length for extending loads or truncating stores so both types should
        // have the same scalable property.
        TypeSize::isKnownLT(Src->getPrimitiveSizeInBits(),
                            LT.second.getSizeInBits())) {
      // This is a vector load that legalizes to a larger type than the vector
      // itself. Unless the corresponding extending load or truncating store is
      // legal, then this will scalarize.
      TargetLowering::LegalizeAction LA = TargetLowering::Expand;
      EVT MemVT = getTLI()->getValueType(DL, Src);
      if (Opcode == Instruction::Store)
        LA = getTLI()->getTruncStoreAction(LT.second, MemVT);
      else
        LA = getTLI()->getLoadExtAction(ISD::EXTLOAD, LT.second, MemVT);

      if (LA != TargetLowering::Legal && LA != TargetLowering::Custom) {
        // This is a vector load/store for some illegal type that is scalarized.
        // We must account for the cost of building or decomposing the vector.
        Cost += getScalarizationOverhead(cast<VectorType>(Src),
                                         Opcode != Instruction::Store,
                                         Opcode == Instruction::Store);
      }
    }

    return Cost;
  }

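  /// Rough gather/scatter estimate for targets without native support:
  /// per-element address extraction and scalar memory operations, plus the
  /// cost of packing/unpacking the vector and, for a variable mask, the
  /// per-lane control flow.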
  unsigned getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
                                  const Value *Ptr, bool VariableMask,
                                  Align Alignment, TTI::TargetCostKind CostKind,
                                  const Instruction *I = nullptr) {
    auto *VT = cast<FixedVectorType>(DataTy);
    // Assume the target does not have support for gather/scatter operations
    // and provide a rough estimate.
    //
    // First, compute the cost of extracting the individual addresses and the
    // individual memory operations.
    int LoadCost =
        VT->getNumElements() *
        (getVectorInstrCost(
             Instruction::ExtractElement,
             FixedVectorType::get(PointerType::get(VT->getElementType(), 0),
                                  VT->getNumElements()),
             -1) +
         getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind));

    // Next, compute the cost of packing the result in a vector.
    int PackingCost = getScalarizationOverhead(VT, Opcode != Instruction::Store,
                                               Opcode == Instruction::Store);

    int ConditionalCost = 0;
    if (VariableMask) {
      // Compute the cost of conditionally executing the memory operations with
      // variable masks. This includes extracting the individual conditions,
      // plus the branches and PHIs needed to combine the results.
      // NOTE: Estimating the cost of conditionally executing the memory
      // operations accurately is quite difficult and the current solution
      // provides a very rough estimate only.
      ConditionalCost =
          VT->getNumElements() *
          (getVectorInstrCost(
               Instruction::ExtractElement,
               FixedVectorType::get(Type::getInt1Ty(DataTy->getContext()),
                                    VT->getNumElements()),
               -1) +
           getCFInstrCost(Instruction::Br, CostKind) +
           getCFInstrCost(Instruction::PHI, CostKind));
    }

    return LoadCost + PackingCost + ConditionalCost;
  }

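  /// Estimate an interleaved memory operation as the underlying wide
  /// load/store plus the per-element extracts and inserts needed to
  /// (de)interleave the members, and, when masked, the cost of building the
  /// interleaved mask.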
  unsigned getInterleavedMemoryOpCost(
      unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
      Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
      bool UseMaskForCond = false, bool UseMaskForGaps = false) {
    auto *VT = cast<FixedVectorType>(VecTy);

    unsigned NumElts = VT->getNumElements();
    assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");

    unsigned NumSubElts = NumElts / Factor;
    auto *SubVT = FixedVectorType::get(VT->getElementType(), NumSubElts);

    // First, the cost of the load/store operation.
    unsigned Cost;
    if (UseMaskForCond || UseMaskForGaps)
      Cost = thisT()->getMaskedMemoryOpCost(Opcode, VecTy, Alignment,
                                            AddressSpace, CostKind);
    else
      Cost = thisT()->getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace,
                                      CostKind);

    // Legalize the vector type, and get the legalized and unlegalized type
    // sizes.
    MVT VecTyLT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
    unsigned VecTySize = thisT()->getDataLayout().getTypeStoreSize(VecTy);
    unsigned VecTyLTSize = VecTyLT.getStoreSize();

    // Return the ceiling of dividing A by B.
    auto ceil = [](unsigned A, unsigned B) { return (A + B - 1) / B; };

    // Scale the cost of the memory operation by the fraction of legalized
    // instructions that will actually be used. We shouldn't account for the
    // cost of dead instructions since they will be removed.
    //
    // E.g., An interleaved load of factor 8:
    //       %vec = load <16 x i64>, <16 x i64>* %ptr
    //       %v0 = shufflevector %vec, undef, <0, 8>
    //
    // If <16 x i64> is legalized to 8 v2i64 loads, only 2 of the loads will be
    // used (those corresponding to elements [0:1] and [8:9] of the unlegalized
    // type). The other loads are unused.
    //
    // We only scale the cost of loads since interleaved store groups aren't
    // allowed to have gaps.
    if (Opcode == Instruction::Load && VecTySize > VecTyLTSize) {
      // The number of loads of a legal type it will take to represent a load
      // of the unlegalized vector type.
      unsigned NumLegalInsts = ceil(VecTySize, VecTyLTSize);

      // The number of elements of the unlegalized type that correspond to a
      // single legal instruction.
      unsigned NumEltsPerLegalInst = ceil(NumElts, NumLegalInsts);

      // Determine which legal instructions will be used.
      BitVector UsedInsts(NumLegalInsts, false);
      for (unsigned Index : Indices)
        for (unsigned Elt = 0; Elt < NumSubElts; ++Elt)
          UsedInsts.set((Index + Elt * Factor) / NumEltsPerLegalInst);

      // Scale the cost of the load by the fraction of legal instructions that
      // will be used.
      Cost *= UsedInsts.count() / NumLegalInsts;
    }

    // Then add the cost of the interleave operation.
1111     if (Opcode == Instruction::Load) {
1112       // The interleave cost is similar to extract sub vectors' elements
1113       // from the wide vector, and insert them into sub vectors.
1114       //
1115       // E.g. An interleaved load of factor 2 (with one member of index 0):
1116       //      %vec = load <8 x i32>, <8 x i32>* %ptr
1117       //      %v0 = shuffle %vec, undef, <0, 2, 4, 6>         ; Index 0
1118       // The cost is estimated as extract elements at 0, 2, 4, 6 from the
1119       // <8 x i32> vector and insert them into a <4 x i32> vector.
1120 
1121       assert(Indices.size() <= Factor &&
1122              "Interleaved memory op has too many members");
1123 
1124       for (unsigned Index : Indices) {
1125         assert(Index < Factor && "Invalid index for interleaved memory op");
1126 
1127         // Extract elements from loaded vector for each sub vector.
1128         for (unsigned i = 0; i < NumSubElts; i++)
1129           Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VT,
1130                                               Index + i * Factor);
1131       }
1132 
1133       unsigned InsSubCost = 0;
1134       for (unsigned i = 0; i < NumSubElts; i++)
1135         InsSubCost +=
1136             thisT()->getVectorInstrCost(Instruction::InsertElement, SubVT, i);
1137 
1138       Cost += Indices.size() * InsSubCost;
1139     } else {
1140       // The interleave cost is extract all elements from sub vectors, and
1141       // insert them into the wide vector.
1142       //
1143       // E.g. An interleaved store of factor 2:
1144       //      %v0_v1 = shuffle %v0, %v1, <0, 4, 1, 5, 2, 6, 3, 7>
1145       //      store <8 x i32> %interleaved.vec, <8 x i32>* %ptr
1146       // The cost is estimated as extract all elements from both <4 x i32>
1147       // vectors and insert into the <8 x i32> vector.
1148 
1149       unsigned ExtSubCost = 0;
1150       for (unsigned i = 0; i < NumSubElts; i++)
1151         ExtSubCost +=
1152             thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVT, i);
1153       Cost += ExtSubCost * Factor;
1154 
1155       for (unsigned i = 0; i < NumElts; i++)
1156         Cost += static_cast<T *>(this)
1157                     ->getVectorInstrCost(Instruction::InsertElement, VT, i);
1158     }
1159 
1160     if (!UseMaskForCond)
1161       return Cost;
1162 
1163     Type *I8Type = Type::getInt8Ty(VT->getContext());
1164     auto *MaskVT = FixedVectorType::get(I8Type, NumElts);
1165     SubVT = FixedVectorType::get(I8Type, NumSubElts);
1166 
1167     // The Mask shuffling cost is extract all the elements of the Mask
1168     // and insert each of them Factor times into the wide vector:
1169     //
1170     // E.g. an interleaved group with factor 3:
1171     //    %mask = icmp ult <8 x i32> %vec1, %vec2
1172     //    %interleaved.mask = shufflevector <8 x i1> %mask, <8 x i1> undef,
1173     //        <24 x i32> <0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7>
1174     // The cost is estimated as extract all mask elements from the <8xi1> mask
1175     // vector and insert them factor times into the <24xi1> shuffled mask
1176     // vector.
1177     for (unsigned i = 0; i < NumSubElts; i++)
1178       Cost +=
1179           thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVT, i);
1180 
1181     for (unsigned i = 0; i < NumElts; i++)
1182       Cost +=
1183           thisT()->getVectorInstrCost(Instruction::InsertElement, MaskVT, i);
1184 
1185     // The Gaps mask is invariant and created outside the loop, therefore the
1186     // cost of creating it is not accounted for here. However if we have both
1187     // a MaskForGaps and some other mask that guards the execution of the
1188     // memory access, we need to account for the cost of And-ing the two masks
1189     // inside the loop.
1190     if (UseMaskForGaps)
1191       Cost += thisT()->getArithmeticInstrCost(BinaryOperator::And, MaskVT,
1192                                               CostKind);
1193 
1194     return Cost;
1195   }
1196 
1197   /// Get intrinsic cost based on arguments.
getIntrinsicInstrCost(const IntrinsicCostAttributes & ICA,TTI::TargetCostKind CostKind)1198   unsigned getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1199                                  TTI::TargetCostKind CostKind) {
1200     // Check for generically free intrinsics.
1201     if (BaseT::getIntrinsicInstrCost(ICA, CostKind) == 0)
1202       return 0;
1203 
1204     // Assume that target intrinsics are cheap.
1205     Intrinsic::ID IID = ICA.getID();
1206     if (Function::isTargetIntrinsic(IID))
1207       return TargetTransformInfo::TCC_Basic;
1208 
1209     if (ICA.isTypeBasedOnly())
1210       return getTypeBasedIntrinsicInstrCost(ICA, CostKind);
1211 
1212     Type *RetTy = ICA.getReturnType();
1213 
1214     ElementCount VF = ICA.getVectorFactor();
1215     ElementCount RetVF =
1216         (RetTy->isVectorTy() ? cast<VectorType>(RetTy)->getElementCount()
1217                              : ElementCount::getFixed(1));
1218     assert((RetVF.isScalar() || VF.isScalar()) &&
1219            "VF > 1 and RetVF is a vector type");
1220     const IntrinsicInst *I = ICA.getInst();
1221     const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
1222     FastMathFlags FMF = ICA.getFlags();
1223     switch (IID) {
1224     default:
1225       break;
1226 
1227     case Intrinsic::cttz:
1228       // FIXME: If necessary, this should go in target-specific overrides.
1229       if (VF.isScalar() && RetVF.isScalar() &&
1230           getTLI()->isCheapToSpeculateCttz())
1231         return TargetTransformInfo::TCC_Basic;
1232       break;
1233 
1234     case Intrinsic::ctlz:
1235       // FIXME: If necessary, this should go in target-specific overrides.
1236       if (VF.isScalar() && RetVF.isScalar() &&
1237           getTLI()->isCheapToSpeculateCtlz())
1238         return TargetTransformInfo::TCC_Basic;
1239       break;
1240 
1241     case Intrinsic::memcpy:
1242       return thisT()->getMemcpyCost(ICA.getInst());
1243 
1244     case Intrinsic::masked_scatter: {
1245       assert(VF.isScalar() && "Can't vectorize types here.");
1246       const Value *Mask = Args[3];
1247       bool VarMask = !isa<Constant>(Mask);
1248       Align Alignment = cast<ConstantInt>(Args[2])->getAlignValue();
1249       return thisT()->getGatherScatterOpCost(Instruction::Store,
1250                                              Args[0]->getType(), Args[1],
1251                                              VarMask, Alignment, CostKind, I);
1252     }
1253     case Intrinsic::masked_gather: {
1254       assert(VF.isScalar() && "Can't vectorize types here.");
1255       const Value *Mask = Args[2];
1256       bool VarMask = !isa<Constant>(Mask);
1257       Align Alignment = cast<ConstantInt>(Args[1])->getAlignValue();
1258       return thisT()->getGatherScatterOpCost(Instruction::Load, RetTy, Args[0],
1259                                              VarMask, Alignment, CostKind, I);
1260     }
1261     case Intrinsic::experimental_vector_extract: {
1262       // FIXME: Handle the case where a scalable vector is extracted from a
1263       // scalable vector
1264       if (isa<ScalableVectorType>(RetTy))
1265         return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1266       unsigned Index = cast<ConstantInt>(Args[1])->getZExtValue();
1267       return thisT()->getShuffleCost(TTI::SK_ExtractSubvector,
1268                                      cast<VectorType>(Args[0]->getType()),
1269                                      Index, cast<VectorType>(RetTy));
1270     }
1271     case Intrinsic::experimental_vector_insert: {
1272       // FIXME: Handle the case where a scalable vector is inserted into a
1273       // scalable vector
1274       if (isa<ScalableVectorType>(Args[1]->getType()))
1275         return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1276       unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue();
1277       return thisT()->getShuffleCost(
1278           TTI::SK_InsertSubvector, cast<VectorType>(Args[0]->getType()), Index,
1279           cast<VectorType>(Args[1]->getType()));
1280     }
1281     case Intrinsic::vector_reduce_add:
1282     case Intrinsic::vector_reduce_mul:
1283     case Intrinsic::vector_reduce_and:
1284     case Intrinsic::vector_reduce_or:
1285     case Intrinsic::vector_reduce_xor:
1286     case Intrinsic::vector_reduce_smax:
1287     case Intrinsic::vector_reduce_smin:
1288     case Intrinsic::vector_reduce_fmax:
1289     case Intrinsic::vector_reduce_fmin:
1290     case Intrinsic::vector_reduce_umax:
1291     case Intrinsic::vector_reduce_umin: {
1292       IntrinsicCostAttributes Attrs(IID, RetTy, Args[0]->getType(), FMF, 1, I);
1293       return getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1294     }
1295     case Intrinsic::vector_reduce_fadd:
1296     case Intrinsic::vector_reduce_fmul: {
1297       IntrinsicCostAttributes Attrs(
1298           IID, RetTy, {Args[0]->getType(), Args[1]->getType()}, FMF, 1, I);
1299       return getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1300     }
1301     case Intrinsic::fshl:
1302     case Intrinsic::fshr: {
1303       if (isa<ScalableVectorType>(RetTy))
1304         return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1305       const Value *X = Args[0];
1306       const Value *Y = Args[1];
1307       const Value *Z = Args[2];
1308       TTI::OperandValueProperties OpPropsX, OpPropsY, OpPropsZ, OpPropsBW;
1309       TTI::OperandValueKind OpKindX = TTI::getOperandInfo(X, OpPropsX);
1310       TTI::OperandValueKind OpKindY = TTI::getOperandInfo(Y, OpPropsY);
1311       TTI::OperandValueKind OpKindZ = TTI::getOperandInfo(Z, OpPropsZ);
1312       TTI::OperandValueKind OpKindBW = TTI::OK_UniformConstantValue;
1313       OpPropsBW = isPowerOf2_32(RetTy->getScalarSizeInBits()) ? TTI::OP_PowerOf2
1314                                                               : TTI::OP_None;
1315       // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
1316       // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
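           // A rough sketch of the expansion whose pieces are costed below,
           // assuming a non-constant shift amount and X != Y (the i32/32 types
           // are illustrative only):
           //   %amt = urem i32 %Z, 32
           //   %hi  = shl  i32 %X, %amt
           //   %lo  = lshr i32 %Y, (sub i32 32, %amt)
           //   %or  = or   i32 %hi, %lo
           //   %cmp = icmp eq i32 %amt, 0      ; shift-by-zero guard
           //   %res = select i1 %cmp, i32 %X, i32 %or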
1317       unsigned Cost = 0;
1318       Cost +=
1319           thisT()->getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind);
1320       Cost +=
1321           thisT()->getArithmeticInstrCost(BinaryOperator::Sub, RetTy, CostKind);
1322       Cost += thisT()->getArithmeticInstrCost(
1323           BinaryOperator::Shl, RetTy, CostKind, OpKindX, OpKindZ, OpPropsX);
1324       Cost += thisT()->getArithmeticInstrCost(
1325           BinaryOperator::LShr, RetTy, CostKind, OpKindY, OpKindZ, OpPropsY);
1326       // Non-constant shift amounts require a modulo.
1327       if (OpKindZ != TTI::OK_UniformConstantValue &&
1328           OpKindZ != TTI::OK_NonUniformConstantValue)
1329         Cost += thisT()->getArithmeticInstrCost(BinaryOperator::URem, RetTy,
1330                                                 CostKind, OpKindZ, OpKindBW,
1331                                                 OpPropsZ, OpPropsBW);
1332       // For non-rotates (X != Y) we must add shift-by-zero handling costs.
1333       if (X != Y) {
1334         Type *CondTy = RetTy->getWithNewBitWidth(1);
1335         Cost +=
1336             thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
1337                                         CmpInst::BAD_ICMP_PREDICATE, CostKind);
1338         Cost +=
1339             thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1340                                         CmpInst::BAD_ICMP_PREDICATE, CostKind);
1341       }
1342       return Cost;
1343     }
1344     }
1345     // TODO: Handle the remaining intrinsics with scalable vector types
1346     if (isa<ScalableVectorType>(RetTy))
1347       return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1348 
1349     // Assume that we need to scalarize this intrinsic.
1350     SmallVector<Type *, 4> Types;
1351     for (const Value *Op : Args) {
1352       Type *OpTy = Op->getType();
1353       assert(VF.isScalar() || !OpTy->isVectorTy());
1354       Types.push_back(VF.isScalar()
1355                           ? OpTy
1356                           : FixedVectorType::get(OpTy, VF.getKnownMinValue()));
1357     }
1358 
1359     if (VF.isVector() && !RetTy->isVoidTy())
1360       RetTy = FixedVectorType::get(RetTy, VF.getKnownMinValue());
1361 
1362     // Compute the scalarization overhead based on Args for a vector
1363     // intrinsic. A vectorizer will pass a scalar RetTy and VF > 1, while
1364     // the CostModel will pass a vector RetTy and a VF of 1.
1365     unsigned ScalarizationCost = std::numeric_limits<unsigned>::max();
1366     if (RetVF.isVector() || VF.isVector()) {
1367       ScalarizationCost = 0;
1368       if (!RetTy->isVoidTy())
1369         ScalarizationCost +=
1370             getScalarizationOverhead(cast<VectorType>(RetTy), true, false);
1371       ScalarizationCost +=
1372           getOperandsScalarizationOverhead(Args, VF.getKnownMinValue());
1373     }
1374 
1375     IntrinsicCostAttributes Attrs(IID, RetTy, Types, FMF, ScalarizationCost, I);
1376     return thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1377   }
1378 
1379   /// Get intrinsic cost based on argument types.
1380   /// If ScalarizationCostPassed is std::numeric_limits<unsigned>::max(), the
1381   /// cost of scalarizing the arguments and the return value will be computed
1382   /// based on types.
1383   unsigned getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1384                                           TTI::TargetCostKind CostKind) {
1385     Intrinsic::ID IID = ICA.getID();
1386     Type *RetTy = ICA.getReturnType();
1387     const SmallVectorImpl<Type *> &Tys = ICA.getArgTypes();
1388     FastMathFlags FMF = ICA.getFlags();
1389     unsigned ScalarizationCostPassed = ICA.getScalarizationCost();
1390     bool SkipScalarizationCost = ICA.skipScalarizationCost();
1391 
1392     VectorType *VecOpTy = nullptr;
1393     if (!Tys.empty()) {
1394       // The vector reduction operand is operand 0 except for fadd/fmul.
1395       // Their operand 0 is a scalar start value, so the vector op is operand 1.
1396       unsigned VecTyIndex = 0;
1397       if (IID == Intrinsic::vector_reduce_fadd ||
1398           IID == Intrinsic::vector_reduce_fmul)
1399         VecTyIndex = 1;
1400       assert(Tys.size() > VecTyIndex && "Unexpected IntrinsicCostAttributes");
1401       VecOpTy = dyn_cast<VectorType>(Tys[VecTyIndex]);
1402     }
1403 
1404     // Library call cost - other than size, make it expensive.
1405     unsigned SingleCallCost = CostKind == TTI::TCK_CodeSize ? 1 : 10;
1406     SmallVector<unsigned, 2> ISDs;
1407     switch (IID) {
1408     default: {
1409       // Assume that we need to scalarize this intrinsic.
1410       unsigned ScalarizationCost = ScalarizationCostPassed;
1411       unsigned ScalarCalls = 1;
1412       Type *ScalarRetTy = RetTy;
1413       if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
1414         if (!SkipScalarizationCost)
1415           ScalarizationCost = getScalarizationOverhead(RetVTy, true, false);
1416         ScalarCalls = std::max(ScalarCalls,
1417                                cast<FixedVectorType>(RetVTy)->getNumElements());
1418         ScalarRetTy = RetTy->getScalarType();
1419       }
1420       SmallVector<Type *, 4> ScalarTys;
1421       for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
1422         Type *Ty = Tys[i];
1423         if (auto *VTy = dyn_cast<VectorType>(Ty)) {
1424           if (!SkipScalarizationCost)
1425             ScalarizationCost += getScalarizationOverhead(VTy, false, true);
1426           ScalarCalls = std::max(ScalarCalls,
1427                                  cast<FixedVectorType>(VTy)->getNumElements());
1428           Ty = Ty->getScalarType();
1429         }
1430         ScalarTys.push_back(Ty);
1431       }
1432       if (ScalarCalls == 1)
1433         return 1; // Return cost of a scalar intrinsic. Assume it to be cheap.
1434 
1435       IntrinsicCostAttributes ScalarAttrs(IID, ScalarRetTy, ScalarTys, FMF);
1436       unsigned ScalarCost =
1437           thisT()->getIntrinsicInstrCost(ScalarAttrs, CostKind);
1438 
1439       return ScalarCalls * ScalarCost + ScalarizationCost;
1440     }
1441     // Look for intrinsics that can be lowered directly or turned into a scalar
1442     // intrinsic call.
1443     case Intrinsic::sqrt:
1444       ISDs.push_back(ISD::FSQRT);
1445       break;
1446     case Intrinsic::sin:
1447       ISDs.push_back(ISD::FSIN);
1448       break;
1449     case Intrinsic::cos:
1450       ISDs.push_back(ISD::FCOS);
1451       break;
1452     case Intrinsic::exp:
1453       ISDs.push_back(ISD::FEXP);
1454       break;
1455     case Intrinsic::exp2:
1456       ISDs.push_back(ISD::FEXP2);
1457       break;
1458     case Intrinsic::log:
1459       ISDs.push_back(ISD::FLOG);
1460       break;
1461     case Intrinsic::log10:
1462       ISDs.push_back(ISD::FLOG10);
1463       break;
1464     case Intrinsic::log2:
1465       ISDs.push_back(ISD::FLOG2);
1466       break;
1467     case Intrinsic::fabs:
1468       ISDs.push_back(ISD::FABS);
1469       break;
1470     case Intrinsic::canonicalize:
1471       ISDs.push_back(ISD::FCANONICALIZE);
1472       break;
1473     case Intrinsic::minnum:
1474       ISDs.push_back(ISD::FMINNUM);
1475       break;
1476     case Intrinsic::maxnum:
1477       ISDs.push_back(ISD::FMAXNUM);
1478       break;
1479     case Intrinsic::minimum:
1480       ISDs.push_back(ISD::FMINIMUM);
1481       break;
1482     case Intrinsic::maximum:
1483       ISDs.push_back(ISD::FMAXIMUM);
1484       break;
1485     case Intrinsic::copysign:
1486       ISDs.push_back(ISD::FCOPYSIGN);
1487       break;
1488     case Intrinsic::floor:
1489       ISDs.push_back(ISD::FFLOOR);
1490       break;
1491     case Intrinsic::ceil:
1492       ISDs.push_back(ISD::FCEIL);
1493       break;
1494     case Intrinsic::trunc:
1495       ISDs.push_back(ISD::FTRUNC);
1496       break;
1497     case Intrinsic::nearbyint:
1498       ISDs.push_back(ISD::FNEARBYINT);
1499       break;
1500     case Intrinsic::rint:
1501       ISDs.push_back(ISD::FRINT);
1502       break;
1503     case Intrinsic::round:
1504       ISDs.push_back(ISD::FROUND);
1505       break;
1506     case Intrinsic::roundeven:
1507       ISDs.push_back(ISD::FROUNDEVEN);
1508       break;
1509     case Intrinsic::pow:
1510       ISDs.push_back(ISD::FPOW);
1511       break;
1512     case Intrinsic::fma:
1513       ISDs.push_back(ISD::FMA);
1514       break;
1515     case Intrinsic::fmuladd:
1516       ISDs.push_back(ISD::FMA);
1517       break;
1518     case Intrinsic::experimental_constrained_fmuladd:
1519       ISDs.push_back(ISD::STRICT_FMA);
1520       break;
1521     // FIXME: We should return 0 whenever getIntrinsicCost == TCC_Free.
1522     case Intrinsic::lifetime_start:
1523     case Intrinsic::lifetime_end:
1524     case Intrinsic::sideeffect:
1525     case Intrinsic::pseudoprobe:
1526       return 0;
1527     case Intrinsic::masked_store: {
1528       Type *Ty = Tys[0];
1529       Align TyAlign = thisT()->DL.getABITypeAlign(Ty);
1530       return thisT()->getMaskedMemoryOpCost(Instruction::Store, Ty, TyAlign, 0,
1531                                             CostKind);
1532     }
1533     case Intrinsic::masked_load: {
1534       Type *Ty = RetTy;
1535       Align TyAlign = thisT()->DL.getABITypeAlign(Ty);
1536       return thisT()->getMaskedMemoryOpCost(Instruction::Load, Ty, TyAlign, 0,
1537                                             CostKind);
1538     }
1539     case Intrinsic::vector_reduce_add:
1540       return thisT()->getArithmeticReductionCost(Instruction::Add, VecOpTy,
1541                                                  /*IsPairwiseForm=*/false,
1542                                                  CostKind);
1543     case Intrinsic::vector_reduce_mul:
1544       return thisT()->getArithmeticReductionCost(Instruction::Mul, VecOpTy,
1545                                                  /*IsPairwiseForm=*/false,
1546                                                  CostKind);
1547     case Intrinsic::vector_reduce_and:
1548       return thisT()->getArithmeticReductionCost(Instruction::And, VecOpTy,
1549                                                  /*IsPairwiseForm=*/false,
1550                                                  CostKind);
1551     case Intrinsic::vector_reduce_or:
1552       return thisT()->getArithmeticReductionCost(Instruction::Or, VecOpTy,
1553                                                  /*IsPairwiseForm=*/false,
1554                                                  CostKind);
1555     case Intrinsic::vector_reduce_xor:
1556       return thisT()->getArithmeticReductionCost(Instruction::Xor, VecOpTy,
1557                                                  /*IsPairwiseForm=*/false,
1558                                                  CostKind);
1559     case Intrinsic::vector_reduce_fadd:
1560       // FIXME: Add new flag for cost of strict reductions.
1561       return thisT()->getArithmeticReductionCost(Instruction::FAdd, VecOpTy,
1562                                                  /*IsPairwiseForm=*/false,
1563                                                  CostKind);
1564     case Intrinsic::vector_reduce_fmul:
1565       // FIXME: Add new flag for cost of strict reductions.
1566       return thisT()->getArithmeticReductionCost(Instruction::FMul, VecOpTy,
1567                                                  /*IsPairwiseForm=*/false,
1568                                                  CostKind);
1569     case Intrinsic::vector_reduce_smax:
1570     case Intrinsic::vector_reduce_smin:
1571     case Intrinsic::vector_reduce_fmax:
1572     case Intrinsic::vector_reduce_fmin:
1573       return thisT()->getMinMaxReductionCost(
1574           VecOpTy, cast<VectorType>(CmpInst::makeCmpResultType(VecOpTy)),
1575           /*IsPairwiseForm=*/false,
1576           /*IsUnsigned=*/false, CostKind);
1577     case Intrinsic::vector_reduce_umax:
1578     case Intrinsic::vector_reduce_umin:
1579       return thisT()->getMinMaxReductionCost(
1580           VecOpTy, cast<VectorType>(CmpInst::makeCmpResultType(VecOpTy)),
1581           /*IsPairwiseForm=*/false,
1582           /*IsUnsigned=*/true, CostKind);
1583     case Intrinsic::abs:
1584     case Intrinsic::smax:
1585     case Intrinsic::smin:
1586     case Intrinsic::umax:
1587     case Intrinsic::umin: {
1588       // abs(X) = select(icmp(X,0),X,sub(0,X))
1589       // minmax(X,Y) = select(icmp(X,Y),X,Y)
1590       Type *CondTy = RetTy->getWithNewBitWidth(1);
1591       unsigned Cost = 0;
1592       // TODO: Ideally getCmpSelInstrCost would accept an icmp condition code.
1593       Cost +=
1594           thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
1595                                       CmpInst::BAD_ICMP_PREDICATE, CostKind);
1596       Cost +=
1597           thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1598                                       CmpInst::BAD_ICMP_PREDICATE, CostKind);
1599       // TODO: Should we add an OperandValueProperties::OP_Zero property?
1600       if (IID == Intrinsic::abs)
1601         Cost += thisT()->getArithmeticInstrCost(
1602             BinaryOperator::Sub, RetTy, CostKind, TTI::OK_UniformConstantValue);
1603       return Cost;
1604     }
1605     case Intrinsic::sadd_sat:
1606     case Intrinsic::ssub_sat: {
1607       Type *CondTy = RetTy->getWithNewBitWidth(1);
1608 
1609       Type *OpTy = StructType::create({RetTy, CondTy});
1610       Intrinsic::ID OverflowOp = IID == Intrinsic::sadd_sat
1611                                      ? Intrinsic::sadd_with_overflow
1612                                      : Intrinsic::ssub_with_overflow;
1613 
1614       // SatMax -> Overflow && SumDiff < 0
1615       // SatMin -> Overflow && SumDiff >= 0
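           // Illustrative sketch of the expansion being costed (shown for
           // sadd.sat; ssub.sat is analogous, and the exact lowering is
           // target-dependent):
           //   {%sum, %ovf} = @llvm.sadd.with.overflow(%x, %y)
           //   %isneg = icmp slt %sum, 0
           //   %sat   = select %isneg, SINT_MAX, SINT_MIN
           //   %res   = select %ovf, %sat, %sum
           // i.e. one overflow intrinsic, one compare and two selects, matching
           // the terms added below.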
1616       unsigned Cost = 0;
1617       IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
1618                                     ScalarizationCostPassed);
1619       Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
1620       Cost +=
1621           thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
1622                                       CmpInst::BAD_ICMP_PREDICATE, CostKind);
1623       Cost += 2 * thisT()->getCmpSelInstrCost(
1624                       BinaryOperator::Select, RetTy, CondTy,
1625                       CmpInst::BAD_ICMP_PREDICATE, CostKind);
1626       return Cost;
1627     }
1628     case Intrinsic::uadd_sat:
1629     case Intrinsic::usub_sat: {
1630       Type *CondTy = RetTy->getWithNewBitWidth(1);
1631 
1632       Type *OpTy = StructType::create({RetTy, CondTy});
1633       Intrinsic::ID OverflowOp = IID == Intrinsic::uadd_sat
1634                                      ? Intrinsic::uadd_with_overflow
1635                                      : Intrinsic::usub_with_overflow;
1636 
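           // Sketch of the costed expansion (shown for uadd.sat; usub.sat is
           // analogous with 0 as the saturation value):
           //   {%sum, %ovf} = @llvm.uadd.with.overflow(%x, %y)
           //   %res         = select %ovf, UINT_MAX, %sum
           // i.e. one overflow intrinsic plus a single select.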
1637       unsigned Cost = 0;
1638       IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
1639                                     ScalarizationCostPassed);
1640       Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
1641       Cost +=
1642           thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1643                                       CmpInst::BAD_ICMP_PREDICATE, CostKind);
1644       return Cost;
1645     }
1646     case Intrinsic::smul_fix:
1647     case Intrinsic::umul_fix: {
1648       unsigned ExtSize = RetTy->getScalarSizeInBits() * 2;
1649       Type *ExtTy = RetTy->getWithNewBitWidth(ExtSize);
1650 
1651       unsigned ExtOp =
1652           IID == Intrinsic::smul_fix ? Instruction::SExt : Instruction::ZExt;
1653       TTI::CastContextHint CCH = TTI::CastContextHint::None;
1654 
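           // The terms below cost a rough expansion: sign/zero-extend both
           // operands, multiply in the double-width type, truncate the high and
           // low halves back to the result width, shift them by the scale
           // (lshr/shl) and or the pieces together. This is only a cost sketch,
           // not the exact target lowering.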
1655       unsigned Cost = 0;
1656       Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, RetTy, CCH, CostKind);
1657       Cost +=
1658           thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
1659       Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, RetTy, ExtTy,
1660                                             CCH, CostKind);
1661       Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, RetTy,
1662                                               CostKind, TTI::OK_AnyValue,
1663                                               TTI::OK_UniformConstantValue);
1664       Cost += thisT()->getArithmeticInstrCost(Instruction::Shl, RetTy, CostKind,
1665                                               TTI::OK_AnyValue,
1666                                               TTI::OK_UniformConstantValue);
1667       Cost += thisT()->getArithmeticInstrCost(Instruction::Or, RetTy, CostKind);
1668       return Cost;
1669     }
1670     case Intrinsic::sadd_with_overflow:
1671     case Intrinsic::ssub_with_overflow: {
1672       Type *SumTy = RetTy->getContainedType(0);
1673       Type *OverflowTy = RetTy->getContainedType(1);
1674       unsigned Opcode = IID == Intrinsic::sadd_with_overflow
1675                             ? BinaryOperator::Add
1676                             : BinaryOperator::Sub;
1677 
1678       //   LHSSign -> LHS >= 0
1679       //   RHSSign -> RHS >= 0
1680       //   SumSign -> Sum >= 0
1681       //
1682       //   Add:
1683       //   Overflow -> (LHSSign == RHSSign) && (LHSSign != SumSign)
1684       //   Sub:
1685       //   Overflow -> (LHSSign != RHSSign) && (LHSSign != SumSign)
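           // Costed below as the add/sub itself plus three sign compares, two
           // selects and an and of the two conditions, i.e. a rough
           // target-independent estimate of the expansion above.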
1686       unsigned Cost = 0;
1687       Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind);
1688       Cost += 3 * thisT()->getCmpSelInstrCost(
1689                       Instruction::ICmp, SumTy, OverflowTy,
1690                       CmpInst::BAD_ICMP_PREDICATE, CostKind);
1691       Cost += 2 * thisT()->getCmpSelInstrCost(
1692                       Instruction::Select, OverflowTy, OverflowTy,
1693                       CmpInst::BAD_ICMP_PREDICATE, CostKind);
1694       Cost += thisT()->getArithmeticInstrCost(BinaryOperator::And, OverflowTy,
1695                                               CostKind);
1696       return Cost;
1697     }
1698     case Intrinsic::uadd_with_overflow:
1699     case Intrinsic::usub_with_overflow: {
1700       Type *SumTy = RetTy->getContainedType(0);
1701       Type *OverflowTy = RetTy->getContainedType(1);
1702       unsigned Opcode = IID == Intrinsic::uadd_with_overflow
1703                             ? BinaryOperator::Add
1704                             : BinaryOperator::Sub;
1705 
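           // For the unsigned forms the overflow bit is a single compare of the
           // result against an operand (e.g. uadd overflows iff %sum ult %x), so
           // the sketch is one add/sub plus one icmp.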
1706       unsigned Cost = 0;
1707       Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind);
1708       Cost +=
1709           thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SumTy, OverflowTy,
1710                                       CmpInst::BAD_ICMP_PREDICATE, CostKind);
1711       return Cost;
1712     }
1713     case Intrinsic::smul_with_overflow:
1714     case Intrinsic::umul_with_overflow: {
1715       Type *MulTy = RetTy->getContainedType(0);
1716       Type *OverflowTy = RetTy->getContainedType(1);
1717       unsigned ExtSize = MulTy->getScalarSizeInBits() * 2;
1718       Type *ExtTy = MulTy->getWithNewBitWidth(ExtSize);
1719 
1720       unsigned ExtOp = IID == Intrinsic::smul_with_overflow
1721                            ? Instruction::SExt : Instruction::ZExt;
1722       TTI::CastContextHint CCH = TTI::CastContextHint::None;
1723 
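           // Sketch of the costed expansion: extend both operands, multiply in
           // the double-width type, truncate back, shift out the high half and
           // compare it against the expected extension of the low half (plus an
           // extra ashr for the signed form). Again, only a cost approximation.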
1724       unsigned Cost = 0;
1725       Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, MulTy, CCH, CostKind);
1726       Cost +=
1727           thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
1728       Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, MulTy, ExtTy,
1729                                             CCH, CostKind);
1730       Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, MulTy,
1731                                               CostKind, TTI::OK_AnyValue,
1732                                               TTI::OK_UniformConstantValue);
1733 
1734       if (IID == Intrinsic::smul_with_overflow)
1735         Cost += thisT()->getArithmeticInstrCost(Instruction::AShr, MulTy,
1736                                                 CostKind, TTI::OK_AnyValue,
1737                                                 TTI::OK_UniformConstantValue);
1738 
1739       Cost +=
1740           thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, MulTy, OverflowTy,
1741                                       CmpInst::BAD_ICMP_PREDICATE, CostKind);
1742       return Cost;
1743     }
1744     case Intrinsic::ctpop:
1745       ISDs.push_back(ISD::CTPOP);
1746       // In case of legalization use TCC_Expensive. This is cheaper than a
1747       // library call but still not a cheap instruction.
1748       SingleCallCost = TargetTransformInfo::TCC_Expensive;
1749       break;
1750     case Intrinsic::ctlz:
1751       ISDs.push_back(ISD::CTLZ);
1752       break;
1753     case Intrinsic::cttz:
1754       ISDs.push_back(ISD::CTTZ);
1755       break;
1756     case Intrinsic::bswap:
1757       ISDs.push_back(ISD::BSWAP);
1758       break;
1759     case Intrinsic::bitreverse:
1760       ISDs.push_back(ISD::BITREVERSE);
1761       break;
1762     }
1763 
1764     const TargetLoweringBase *TLI = getTLI();
1765     std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy);
1766 
1767     SmallVector<unsigned, 2> LegalCost;
1768     SmallVector<unsigned, 2> CustomCost;
1769     for (unsigned ISD : ISDs) {
1770       if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
1771         if (IID == Intrinsic::fabs && LT.second.isFloatingPoint() &&
1772             TLI->isFAbsFree(LT.second)) {
1773           return 0;
1774         }
1775 
1776         // The operation is legal. Assume it costs 1.
1777         // If the type is split to multiple registers, assume that there is some
1778         // overhead to this.
1779         // TODO: Once we have extract/insert subvector cost we need to use them.
1780         if (LT.first > 1)
1781           LegalCost.push_back(LT.first * 2);
1782         else
1783           LegalCost.push_back(LT.first * 1);
1784       } else if (!TLI->isOperationExpand(ISD, LT.second)) {
1785         // If the operation is custom lowered then assume
1786         // that the code is twice as expensive.
1787         CustomCost.push_back(LT.first * 2);
1788       }
1789     }
1790 
1791     auto *MinLegalCostI = std::min_element(LegalCost.begin(), LegalCost.end());
1792     if (MinLegalCostI != LegalCost.end())
1793       return *MinLegalCostI;
1794 
1795     auto MinCustomCostI =
1796         std::min_element(CustomCost.begin(), CustomCost.end());
1797     if (MinCustomCostI != CustomCost.end())
1798       return *MinCustomCostI;
1799 
1800     // If we can't lower fmuladd into an FMA, estimate the cost as a
1801     // floating-point mul followed by an add.
1802     if (IID == Intrinsic::fmuladd)
1803       return thisT()->getArithmeticInstrCost(BinaryOperator::FMul, RetTy,
1804                                              CostKind) +
1805              thisT()->getArithmeticInstrCost(BinaryOperator::FAdd, RetTy,
1806                                              CostKind);
1807     if (IID == Intrinsic::experimental_constrained_fmuladd) {
1808       IntrinsicCostAttributes FMulAttrs(
1809         Intrinsic::experimental_constrained_fmul, RetTy, Tys);
1810       IntrinsicCostAttributes FAddAttrs(
1811         Intrinsic::experimental_constrained_fadd, RetTy, Tys);
1812       return thisT()->getIntrinsicInstrCost(FMulAttrs, CostKind) +
1813              thisT()->getIntrinsicInstrCost(FAddAttrs, CostKind);
1814     }
1815 
1816     // Else, assume that we need to scalarize this intrinsic. For math builtins
1817     // this will emit a costly libcall, adding call overhead and spills. Make it
1818     // very expensive.
1819     if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
1820       unsigned ScalarizationCost = SkipScalarizationCost ?
1821         ScalarizationCostPassed : getScalarizationOverhead(RetVTy, true, false);
1822 
1823       unsigned ScalarCalls = cast<FixedVectorType>(RetVTy)->getNumElements();
1824       SmallVector<Type *, 4> ScalarTys;
1825       for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
1826         Type *Ty = Tys[i];
1827         if (Ty->isVectorTy())
1828           Ty = Ty->getScalarType();
1829         ScalarTys.push_back(Ty);
1830       }
1831       IntrinsicCostAttributes Attrs(IID, RetTy->getScalarType(), ScalarTys, FMF);
1832       unsigned ScalarCost = thisT()->getIntrinsicInstrCost(Attrs, CostKind);
1833       for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
1834         if (auto *VTy = dyn_cast<VectorType>(Tys[i])) {
1835           if (!ICA.skipScalarizationCost())
1836             ScalarizationCost += getScalarizationOverhead(VTy, false, true);
1837           ScalarCalls = std::max(ScalarCalls,
1838                                  cast<FixedVectorType>(VTy)->getNumElements());
1839         }
1840       }
1841       return ScalarCalls * ScalarCost + ScalarizationCost;
1842     }
1843 
1844     // This is going to be turned into a library call, make it expensive.
1845     return SingleCallCost;
1846   }
1847 
1848   /// Compute a cost of the given call instruction.
1849   ///
1850   /// Compute the cost of calling function F with return type RetTy and
1851   /// argument types Tys. F might be nullptr, in this case the cost of an
1852   /// arbitrary call with the specified signature will be returned.
1853   /// This is used, for instance, when we estimate the cost of calling a
1854   /// vector counterpart of the given function.
1855   /// \param F Called function, might be nullptr.
1856   /// \param RetTy Return value types.
1857   /// \param Tys Argument types.
1858   /// \returns The cost of Call instruction.
1859   unsigned getCallInstrCost(Function *F, Type *RetTy, ArrayRef<Type *> Tys,
1860                      TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency) {
1861     return 10;
1862   }
1863 
1864   unsigned getNumberOfParts(Type *Tp) {
1865     std::pair<unsigned, MVT> LT = getTLI()->getTypeLegalizationCost(DL, Tp);
1866     return LT.first;
1867   }
1868 
1869   unsigned getAddressComputationCost(Type *Ty, ScalarEvolution *,
1870                                      const SCEV *) {
1871     return 0;
1872   }
1873 
1874   /// Try to calculate arithmetic and shuffle op costs for reduction operations.
1875   /// We're assuming that reduction operations are performed as follows:
1876   /// 1. Non-pairwise reduction
1877   /// %val1 = shufflevector<n x t> %val, <n x t> %undef,
1878   /// <n x i32> <i32 n/2, i32 n/2 + 1, ..., i32 n, i32 undef, ..., i32 undef>
1879   ///            \----------------v-------------/  \----------v------------/
1880   ///                            n/2 elements               n/2 elements
1881   /// %red1 = op <n x t> %val, <n x t> %val1
1882   /// After this operation we have a vector %red1 where only the first n/2
1883   /// elements are meaningful, the second n/2 elements are undefined and can be
1884   /// dropped. All other operations are actually working with the vector of
1885   /// length n/2, not n, though the real vector length is still n.
1886   /// %val2 = shufflevector<n x t> %red1, <n x t> %undef,
1887   /// <n x i32> <i32 n/4, i32 n/4 + 1, ..., i32 n/2, i32 undef, ..., i32 undef>
1888   ///            \----------------v-------------/  \----------v------------/
1889   ///                            n/4 elements               3*n/4 elements
1890   /// %red2 = op <n x t> %red1, <n x t> %val2  - working with the vector of
1891   /// length n/2, the resulting vector has length n/4, etc.
1892   /// 2. Pairwise reduction:
1893   /// Everything is the same except for an additional shuffle operation which
1894   /// is used to produce operands for pairwise kind of reductions.
1895   /// %val1 = shufflevector<n x t> %val, <n x t> %undef,
1896   /// <n x i32> <i32 0, i32 2, ..., i32 n-2, i32 undef, ..., i32 undef>
1897   ///            \-------------v----------/  \----------v------------/
1898   ///                   n/2 elements               n/2 elements
1899   /// %val2 = shufflevector<n x t> %val, <n x t> %undef,
1900   /// <n x i32> <i32 1, i32 3, ..., i32 n-1, i32 undef, ..., i32 undef>
1901   ///            \-------------v----------/  \----------v------------/
1902   ///                   n/2 elements               n/2 elements
1903   /// %red1 = op <n x t> %val1, <n x t> %val2
1904   /// Again, the operation is performed on an <n x t> vector, but the
1905   /// resulting vector %red1 is effectively an <n/2 x t> vector.
1906   ///
1907   /// The cost model should take into account that the actual length of the
1908   /// vector is reduced on each iteration.
1909   unsigned getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
1910                                       bool IsPairwise,
1911                                       TTI::TargetCostKind CostKind) {
1912     Type *ScalarTy = Ty->getElementType();
1913     unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
1914     unsigned NumReduxLevels = Log2_32(NumVecElts);
1915     unsigned ArithCost = 0;
1916     unsigned ShuffleCost = 0;
1917     std::pair<unsigned, MVT> LT =
1918         thisT()->getTLI()->getTypeLegalizationCost(DL, Ty);
1919     unsigned LongVectorCount = 0;
1920     unsigned MVTLen =
1921         LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
1922     while (NumVecElts > MVTLen) {
1923       NumVecElts /= 2;
1924       VectorType *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
1925       // Assume the pairwise shuffles add a cost.
1926       ShuffleCost +=
1927           (IsPairwise + 1) * thisT()->getShuffleCost(TTI::SK_ExtractSubvector,
1928                                                      Ty, NumVecElts, SubTy);
1929       ArithCost += thisT()->getArithmeticInstrCost(Opcode, SubTy, CostKind);
1930       Ty = SubTy;
1931       ++LongVectorCount;
1932     }
1933 
1934     NumReduxLevels -= LongVectorCount;
1935 
1936     // The minimal length of the vector is limited by the real length of vector
1937     // operations performed on the current platform. That's why several final
1938     // reduction operations are performed on the vectors with the same
1939     // architecture-dependent length.
1940 
1941     // Non-pairwise reductions need one shuffle per reduction level. Pairwise
1942     // reductions need two shuffles on every level but the last one; on that
1943     // level one of the shuffles is <0, u, u, ...>, which is an identity.
1944     unsigned NumShuffles = NumReduxLevels;
1945     if (IsPairwise && NumReduxLevels >= 1)
1946       NumShuffles += NumReduxLevels - 1;
1947     ShuffleCost += NumShuffles *
1948                    thisT()->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, 0, Ty);
1949     ArithCost += NumReduxLevels * thisT()->getArithmeticInstrCost(Opcode, Ty);
1950     return ShuffleCost + ArithCost +
1951            thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
1952   }
1953 
1954   /// Try to calculate op costs for min/max reduction operations.
1955   /// \param CondTy Conditional type for the Select instruction.
1956   unsigned getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
1957                                   bool IsPairwise, bool IsUnsigned,
1958                                   TTI::TargetCostKind CostKind) {
1959     Type *ScalarTy = Ty->getElementType();
1960     Type *ScalarCondTy = CondTy->getElementType();
1961     unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
1962     unsigned NumReduxLevels = Log2_32(NumVecElts);
1963     unsigned CmpOpcode;
1964     if (Ty->isFPOrFPVectorTy()) {
1965       CmpOpcode = Instruction::FCmp;
1966     } else {
1967       assert(Ty->isIntOrIntVectorTy() &&
1968              "expecting floating point or integer type for min/max reduction");
1969       CmpOpcode = Instruction::ICmp;
1970     }
1971     unsigned MinMaxCost = 0;
1972     unsigned ShuffleCost = 0;
1973     std::pair<unsigned, MVT> LT =
1974         thisT()->getTLI()->getTypeLegalizationCost(DL, Ty);
1975     unsigned LongVectorCount = 0;
1976     unsigned MVTLen =
1977         LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
1978     while (NumVecElts > MVTLen) {
1979       NumVecElts /= 2;
1980       auto *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
1981       CondTy = FixedVectorType::get(ScalarCondTy, NumVecElts);
1982 
1983       // Assume the pairwise shuffles add a cost.
1984       ShuffleCost +=
1985           (IsPairwise + 1) * thisT()->getShuffleCost(TTI::SK_ExtractSubvector,
1986                                                      Ty, NumVecElts, SubTy);
1987       MinMaxCost +=
1988           thisT()->getCmpSelInstrCost(CmpOpcode, SubTy, CondTy,
1989                                       CmpInst::BAD_ICMP_PREDICATE, CostKind) +
1990           thisT()->getCmpSelInstrCost(Instruction::Select, SubTy, CondTy,
1991                                       CmpInst::BAD_ICMP_PREDICATE, CostKind);
1992       Ty = SubTy;
1993       ++LongVectorCount;
1994     }
1995 
1996     NumReduxLevels -= LongVectorCount;
1997 
1998     // The minimal length of the vector is limited by the real length of vector
1999     // operations performed on the current platform. That's why several final
2000     // reduction operations are performed on the vectors with the same
2001     // architecture-dependent length.
2002 
2003     // Non-pairwise reductions need one shuffle per reduction level. Pairwise
2004     // reductions need two shuffles on every level but the last one; on that
2005     // level one of the shuffles is <0, u, u, ...>, which is an identity.
2006     unsigned NumShuffles = NumReduxLevels;
2007     if (IsPairwise && NumReduxLevels >= 1)
2008       NumShuffles += NumReduxLevels - 1;
2009     ShuffleCost += NumShuffles *
2010                    thisT()->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, 0, Ty);
2011     MinMaxCost +=
2012         NumReduxLevels *
2013         (thisT()->getCmpSelInstrCost(CmpOpcode, Ty, CondTy,
2014                                      CmpInst::BAD_ICMP_PREDICATE, CostKind) +
2015          thisT()->getCmpSelInstrCost(Instruction::Select, Ty, CondTy,
2016                                      CmpInst::BAD_ICMP_PREDICATE, CostKind));
2017     // The last min/max should be in vector registers and we counted it above,
2018     // so we just need a single extractelement.
2019     return ShuffleCost + MinMaxCost +
2020            thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
2021   }
2022 
2023   InstructionCost getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned,
2024                                               Type *ResTy, VectorType *Ty,
2025                                               TTI::TargetCostKind CostKind) {
2026     // Without any native support, this is equivalent to the cost of
2027     // vecreduce.add(ext), or, if IsMLA, vecreduce.add(mul(ext, ext)).
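         // E.g. a sketch for the IsMLA case with an i32 result and <8 x i16>
         // inputs (the concrete types here are illustrative only):
         //   %aa  = sext <8 x i16> %a to <8 x i32>
         //   %bb  = sext <8 x i16> %b to <8 x i32>
         //   %m   = mul <8 x i32> %aa, %bb
         //   %red = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %m)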
2028     VectorType *ExtTy = VectorType::get(ResTy, Ty);
2029     unsigned RedCost = thisT()->getArithmeticReductionCost(
2030         Instruction::Add, ExtTy, false, CostKind);
2031     unsigned MulCost = 0;
2032     unsigned ExtCost = thisT()->getCastInstrCost(
2033         IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty,
2034         TTI::CastContextHint::None, CostKind);
2035     if (IsMLA) {
2036       MulCost =
2037           thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
2038       ExtCost *= 2;
2039     }
2040 
2041     return RedCost + MulCost + ExtCost;
2042   }
2043 
2044   unsigned getVectorSplitCost() { return 1; }
2045 
2046   /// @}
2047 };
2048 
2049 /// Concrete BasicTTIImpl that can be used if no further customization
2050 /// is needed.
2051 class BasicTTIImpl : public BasicTTIImplBase<BasicTTIImpl> {
2052   using BaseT = BasicTTIImplBase<BasicTTIImpl>;
2053 
2054   friend class BasicTTIImplBase<BasicTTIImpl>;
2055 
2056   const TargetSubtargetInfo *ST;
2057   const TargetLoweringBase *TLI;
2058 
2059   const TargetSubtargetInfo *getST() const { return ST; }
2060   const TargetLoweringBase *getTLI() const { return TLI; }
2061 
2062 public:
2063   explicit BasicTTIImpl(const TargetMachine *TM, const Function &F);
2064 };
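
     // A target-specific TTI implementation usually follows the same pattern as
     // the concrete class above. Hypothetical sketch (FooTTIImpl, FooSubtarget
     // and FooTargetLowering are illustrative names, not real LLVM classes):
     //
     //   class FooTTIImpl : public BasicTTIImplBase<FooTTIImpl> {
     //     using BaseT = BasicTTIImplBase<FooTTIImpl>;
     //     friend BaseT;
     //
     //     const FooSubtarget *ST;
     //     const FooTargetLowering *TLI;
     //
     //     const FooSubtarget *getST() const { return ST; }
     //     const FooTargetLowering *getTLI() const { return TLI; }
     //
     //   public:
     //     explicit FooTTIImpl(const TargetMachine *TM, const Function &F);
     //   };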
2065 
2066 } // end namespace llvm
2067 
2068 #endif // LLVM_CODEGEN_BASICTTIIMPL_H
2069