//===- BasicTTIImpl.h -------------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file provides a helper that implements much of the TTI interface in
/// terms of the target-independent code generator and TargetLowering
/// interfaces.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_CODEGEN_BASICTTIIMPL_H
#define LLVM_CODEGEN_BASICTTIIMPL_H

#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/TargetTransformInfoImpl.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetMachine.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <limits>
#include <utility>

namespace llvm {

class Function;
class GlobalValue;
class LLVMContext;
class ScalarEvolution;
class SCEV;
class TargetMachine;

extern cl::opt<unsigned> PartialUnrollingThreshold;

/// Base class which can be used to help build a TTI implementation.
///
/// This class provides as much implementation of the TTI interface as is
/// possible using the target independent parts of the code generator.
///
/// In order to subclass it, your class must implement a getST() method to
/// return the subtarget, and a getTLI() method to return the target lowering.
/// We need these methods implemented in the derived class so that this class
/// doesn't have to duplicate storage for them.
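///
/// A minimal sketch of the expected CRTP pattern, using hypothetical
/// FooSubtarget/FooTargetLowering names that are not part of this file:
/// \code
///   class FooTTIImpl : public BasicTTIImplBase<FooTTIImpl> {
///     const FooSubtarget *ST;
///     const FooTargetLowering *TLI;
///     const FooSubtarget *getST() const { return ST; }
///     const FooTargetLowering *getTLI() const { return TLI; }
///   };
/// \endcode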
template <typename T>
class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
private:
  using BaseT = TargetTransformInfoImplCRTPBase<T>;
  using TTI = TargetTransformInfo;

  /// Helper function to access this as a T.
  T *thisT() { return static_cast<T *>(this); }

  /// Estimate a cost of Broadcast as an extract and sequence of insert
  /// operations.
  InstructionCost getBroadcastShuffleOverhead(FixedVectorType *VTy) {
    InstructionCost Cost = 0;
    // Broadcast cost is equal to the cost of extracting the zero'th element
    // plus the cost of inserting it into every element of the result vector.
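    // For example, broadcasting a <4 x float> is modelled as one extract of
    // element 0 plus four inserts, each priced via the target hooks below.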
    Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, 0);

    for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
      Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, i);
    }
    return Cost;
  }

  /// Estimate a cost of shuffle as a sequence of extract and insert
  /// operations.
  InstructionCost getPermuteShuffleOverhead(FixedVectorType *VTy) {
    InstructionCost Cost = 0;
    // Shuffle cost is equal to the cost of extracting the elements from the
    // source vectors plus the cost of inserting them into the result vector.

    // e.g. a <4 x float> shuffle with mask <0,5,2,7> extracts element 0 of the
    // first vector, element 1 of the second vector, element 2 of the first
    // vector and element 3 of the second vector, and inserts them at indices
    // <0,1,2,3> of the result vector.
    for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
      Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, i);
      Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, i);
    }
    return Cost;
  }

  /// Estimate a cost of subvector extraction as a sequence of extract and
  /// insert operations.
  InstructionCost getExtractSubvectorOverhead(VectorType *VTy, int Index,
                                              FixedVectorType *SubVTy) {
    assert(VTy && SubVTy &&
           "Can only extract subvectors from vectors");
    int NumSubElts = SubVTy->getNumElements();
    assert((!isa<FixedVectorType>(VTy) ||
            (Index + NumSubElts) <=
                (int)cast<FixedVectorType>(VTy)->getNumElements()) &&
           "SK_ExtractSubvector index out of range");

    InstructionCost Cost = 0;
    // Subvector extraction cost is equal to the cost of extracting the
    // elements from the source type plus the cost of inserting them into the
    // result vector type.
    for (int i = 0; i != NumSubElts; ++i) {
      Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
                                          i + Index);
      Cost +=
          thisT()->getVectorInstrCost(Instruction::InsertElement, SubVTy, i);
    }
    return Cost;
  }

  /// Estimate a cost of subvector insertion as a sequence of extract and
  /// insert operations.
  InstructionCost getInsertSubvectorOverhead(VectorType *VTy, int Index,
                                             FixedVectorType *SubVTy) {
    assert(VTy && SubVTy &&
           "Can only insert subvectors into vectors");
    int NumSubElts = SubVTy->getNumElements();
    assert((!isa<FixedVectorType>(VTy) ||
            (Index + NumSubElts) <=
                (int)cast<FixedVectorType>(VTy)->getNumElements()) &&
           "SK_InsertSubvector index out of range");

    InstructionCost Cost = 0;
    // Subvector insertion cost is equal to the cost of extracting the elements
    // from the source type plus the cost of inserting them into the result
    // vector type.
    for (int i = 0; i != NumSubElts; ++i) {
      Cost +=
          thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVTy, i);
      Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy,
                                          i + Index);
    }
    return Cost;
  }

  /// Local query method delegates up to T which *must* implement this!
  const TargetSubtargetInfo *getST() const {
    return static_cast<const T *>(this)->getST();
  }

  /// Local query method delegates up to T which *must* implement this!
  const TargetLoweringBase *getTLI() const {
    return static_cast<const T *>(this)->getTLI();
  }

  static ISD::MemIndexedMode getISDIndexedMode(TTI::MemIndexedMode M) {
    switch (M) {
      case TTI::MIM_Unindexed:
        return ISD::UNINDEXED;
      case TTI::MIM_PreInc:
        return ISD::PRE_INC;
      case TTI::MIM_PreDec:
        return ISD::PRE_DEC;
      case TTI::MIM_PostInc:
        return ISD::POST_INC;
      case TTI::MIM_PostDec:
        return ISD::POST_DEC;
    }
    llvm_unreachable("Unexpected MemIndexedMode");
  }

  InstructionCost getCommonMaskedMemoryOpCost(unsigned Opcode, Type *DataTy,
                                              Align Alignment,
                                              bool VariableMask,
                                              bool IsGatherScatter,
                                              TTI::TargetCostKind CostKind) {
    auto *VT = cast<FixedVectorType>(DataTy);
    // Assume the target does not have support for gather/scatter operations
    // and provide a rough estimate.
    //
    // First, compute the cost of the individual memory operations.
    InstructionCost AddrExtractCost =
        IsGatherScatter
            ? getVectorInstrCost(Instruction::ExtractElement,
                                 FixedVectorType::get(
                                     PointerType::get(VT->getElementType(), 0),
                                     VT->getNumElements()),
                                 -1)
            : 0;
    InstructionCost LoadCost =
        VT->getNumElements() *
        (AddrExtractCost +
         getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind));

    // Next, compute the cost of packing the result in a vector.
    InstructionCost PackingCost = getScalarizationOverhead(
        VT, Opcode != Instruction::Store, Opcode == Instruction::Store);

    InstructionCost ConditionalCost = 0;
    if (VariableMask) {
      // Compute the cost of conditionally executing the memory operations with
      // variable masks. This includes extracting the individual conditions,
      // the branches and PHIs to combine the results.
      // NOTE: Estimating the cost of conditionally executing the memory
      // operations accurately is quite difficult and the current solution
      // provides a very rough estimate only.
      ConditionalCost =
          VT->getNumElements() *
          (getVectorInstrCost(
               Instruction::ExtractElement,
               FixedVectorType::get(Type::getInt1Ty(DataTy->getContext()),
                                    VT->getNumElements()),
               -1) +
           getCFInstrCost(Instruction::Br, CostKind) +
           getCFInstrCost(Instruction::PHI, CostKind));
    }

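    // Summing up, the rough estimate returned below is:
    //   NumElts * (address extract + scalar memory op)   -> LoadCost
    //   + scalarization packing overhead                 -> PackingCost
    //   + NumElts * (i1 extract + br + phi)              -> ConditionalCost
    // with the last term only present for variable masks.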
    return LoadCost + PackingCost + ConditionalCost;
  }

protected:
  explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL)
      : BaseT(DL) {}
  virtual ~BasicTTIImplBase() = default;

  using TargetTransformInfoImplBase::DL;

public:
  /// \name Scalar TTI Implementations
  /// @{
  bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth,
                                      unsigned AddressSpace, Align Alignment,
                                      bool *Fast) const {
    EVT E = EVT::getIntegerVT(Context, BitWidth);
    return getTLI()->allowsMisalignedMemoryAccesses(
        E, AddressSpace, Alignment, MachineMemOperand::MONone, Fast);
  }

  bool hasBranchDivergence() { return false; }

  bool useGPUDivergenceAnalysis() { return false; }

  bool isSourceOfDivergence(const Value *V) { return false; }

  bool isAlwaysUniform(const Value *V) { return false; }

  unsigned getFlatAddressSpace() {
    // Return an invalid address space.
    return -1;
  }

  bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
                                  Intrinsic::ID IID) const {
    return false;
  }

  bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const {
    return getTLI()->getTargetMachine().isNoopAddrSpaceCast(FromAS, ToAS);
  }

  unsigned getAssumedAddrSpace(const Value *V) const {
    return getTLI()->getTargetMachine().getAssumedAddrSpace(V);
  }

  Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
                                          Value *NewV) const {
    return nullptr;
  }

  bool isLegalAddImmediate(int64_t imm) {
    return getTLI()->isLegalAddImmediate(imm);
  }

  bool isLegalICmpImmediate(int64_t imm) {
    return getTLI()->isLegalICmpImmediate(imm);
  }

  bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
                             bool HasBaseReg, int64_t Scale,
                             unsigned AddrSpace, Instruction *I = nullptr) {
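    // TargetLoweringBase::AddrMode describes an address of the form
    //   BaseGV + BaseOffs + BaseReg + Scale*ScaleReg,
    // so this simply packages the candidate mode and asks the target.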
    TargetLoweringBase::AddrMode AM;
    AM.BaseGV = BaseGV;
    AM.BaseOffs = BaseOffset;
    AM.HasBaseReg = HasBaseReg;
    AM.Scale = Scale;
    return getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace, I);
  }

  bool isIndexedLoadLegal(TTI::MemIndexedMode M, Type *Ty,
                          const DataLayout &DL) const {
    EVT VT = getTLI()->getValueType(DL, Ty);
    return getTLI()->isIndexedLoadLegal(getISDIndexedMode(M), VT);
  }

  bool isIndexedStoreLegal(TTI::MemIndexedMode M, Type *Ty,
                           const DataLayout &DL) const {
    EVT VT = getTLI()->getValueType(DL, Ty);
    return getTLI()->isIndexedStoreLegal(getISDIndexedMode(M), VT);
  }

  bool isLSRCostLess(TTI::LSRCost C1, TTI::LSRCost C2) {
    return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
  }

  bool isNumRegsMajorCostOfLSR() {
    return TargetTransformInfoImplBase::isNumRegsMajorCostOfLSR();
  }

  bool isProfitableLSRChainElement(Instruction *I) {
    return TargetTransformInfoImplBase::isProfitableLSRChainElement(I);
  }

  InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
                                       int64_t BaseOffset, bool HasBaseReg,
                                       int64_t Scale, unsigned AddrSpace) {
    TargetLoweringBase::AddrMode AM;
    AM.BaseGV = BaseGV;
    AM.BaseOffs = BaseOffset;
    AM.HasBaseReg = HasBaseReg;
    AM.Scale = Scale;
    return getTLI()->getScalingFactorCost(DL, AM, Ty, AddrSpace);
  }

  bool isTruncateFree(Type *Ty1, Type *Ty2) {
    return getTLI()->isTruncateFree(Ty1, Ty2);
  }

  bool isProfitableToHoist(Instruction *I) {
    return getTLI()->isProfitableToHoist(I);
  }

  bool useAA() const { return getST()->useAA(); }

  bool isTypeLegal(Type *Ty) {
    EVT VT = getTLI()->getValueType(DL, Ty);
    return getTLI()->isTypeLegal(VT);
  }

  InstructionCost getRegUsageForType(Type *Ty) {
    InstructionCost Val = getTLI()->getTypeLegalizationCost(DL, Ty).first;
    assert(Val >= 0 && "Negative cost!");
    return Val;
  }

  InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr,
                             ArrayRef<const Value *> Operands) {
    return BaseT::getGEPCost(PointeeType, Ptr, Operands);
  }

  unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
                                            unsigned &JumpTableSize,
                                            ProfileSummaryInfo *PSI,
                                            BlockFrequencyInfo *BFI) {
    /// Try to find the estimated number of clusters. Note that the number of
    /// clusters identified in this function could be different from the actual
    /// numbers found in lowering. This function ignores switches that are
    /// lowered with a mix of jump table / bit test / BTree. This function was
    /// initially intended to be used when estimating the cost of a switch in
    /// the inline cost heuristic, but it's a generic cost model to be used in
    /// other places (e.g., in loop unrolling).
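    ///
    /// For example, a switch whose cases form a single dense range that the
    /// target accepts as a jump table counts as one cluster (and JumpTableSize
    /// is set to that range), while a sparse switch that suits neither a jump
    /// table nor a bit test simply reports its number of cases.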
    unsigned N = SI.getNumCases();
    const TargetLoweringBase *TLI = getTLI();
    const DataLayout &DL = this->getDataLayout();

    JumpTableSize = 0;
    bool IsJTAllowed = TLI->areJTsAllowed(SI.getParent()->getParent());

    // Early exit if neither a jump table nor a bit test is allowed.
    if (N < 1 || (!IsJTAllowed && DL.getIndexSizeInBits(0u) < N))
      return N;

    APInt MaxCaseVal = SI.case_begin()->getCaseValue()->getValue();
    APInt MinCaseVal = MaxCaseVal;
    for (auto CI : SI.cases()) {
      const APInt &CaseVal = CI.getCaseValue()->getValue();
      if (CaseVal.sgt(MaxCaseVal))
        MaxCaseVal = CaseVal;
      if (CaseVal.slt(MinCaseVal))
        MinCaseVal = CaseVal;
    }

    // Check if suitable for a bit test
    if (N <= DL.getIndexSizeInBits(0u)) {
      SmallPtrSet<const BasicBlock *, 4> Dests;
      for (auto I : SI.cases())
        Dests.insert(I.getCaseSuccessor());

      if (TLI->isSuitableForBitTests(Dests.size(), N, MinCaseVal, MaxCaseVal,
                                     DL))
        return 1;
    }

    // Check if suitable for a jump table.
    if (IsJTAllowed) {
      if (N < 2 || N < TLI->getMinimumJumpTableEntries())
        return N;
      uint64_t Range =
          (MaxCaseVal - MinCaseVal)
              .getLimitedValue(std::numeric_limits<uint64_t>::max() - 1) + 1;
      // Check whether a range of clusters is dense enough for a jump table
      if (TLI->isSuitableForJumpTable(&SI, N, Range, PSI, BFI)) {
        JumpTableSize = Range;
        return 1;
      }
    }
    return N;
  }

  bool shouldBuildLookupTables() {
    const TargetLoweringBase *TLI = getTLI();
    return TLI->isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) ||
           TLI->isOperationLegalOrCustom(ISD::BRIND, MVT::Other);
  }

  bool shouldBuildRelLookupTables() const {
    const TargetMachine &TM = getTLI()->getTargetMachine();
    // If non-PIC mode, do not generate a relative lookup table.
    if (!TM.isPositionIndependent())
      return false;

    /// Relative lookup table entries consist of 32-bit offsets.
    /// Do not generate relative lookup tables for large code models
    /// in 64-bit architectures where 32-bit offsets might not be enough.
    if (TM.getCodeModel() == CodeModel::Medium ||
        TM.getCodeModel() == CodeModel::Large)
      return false;

    Triple TargetTriple = TM.getTargetTriple();
    if (!TargetTriple.isArch64Bit())
      return false;

    // TODO: Triggers issues on aarch64 on darwin, so temporarily disable it
    // there.
    if (TargetTriple.getArch() == Triple::aarch64 && TargetTriple.isOSDarwin())
      return false;

    return true;
  }

  bool haveFastSqrt(Type *Ty) {
    const TargetLoweringBase *TLI = getTLI();
    EVT VT = TLI->getValueType(DL, Ty);
    return TLI->isTypeLegal(VT) &&
           TLI->isOperationLegalOrCustom(ISD::FSQRT, VT);
  }

  bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
    return true;
  }

  InstructionCost getFPOpCost(Type *Ty) {
    // Check whether FADD is available, as a proxy for floating-point in
    // general.
    const TargetLoweringBase *TLI = getTLI();
    EVT VT = TLI->getValueType(DL, Ty);
    if (TLI->isOperationLegalOrCustomOrPromote(ISD::FADD, VT))
      return TargetTransformInfo::TCC_Basic;
    return TargetTransformInfo::TCC_Expensive;
  }

  unsigned getInliningThresholdMultiplier() { return 1; }
  unsigned adjustInliningThreshold(const CallBase *CB) { return 0; }

  int getInlinerVectorBonusPercent() { return 150; }

  void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                               TTI::UnrollingPreferences &UP) {
    // This unrolling functionality is target independent, but to provide some
    // motivation for its intended use, for x86:

    // According to the Intel 64 and IA-32 Architectures Optimization Reference
    // Manual, Intel Core models and later have a loop stream detector (and
    // associated uop queue) that can benefit from partial unrolling.
    // The relevant requirements are:
    //  - The loop must have no more than 4 (8 for Nehalem and later) branches
    //    taken, and none of them may be calls.
    //  - The loop can have no more than 18 (28 for Nehalem and later) uops.

    // According to the Software Optimization Guide for AMD Family 15h
    // Processors, models 30h-4fh (Steamroller and later) have a loop predictor
    // and loop buffer which can benefit from partial unrolling.
    // The relevant requirements are:
    //  - The loop must have fewer than 16 branches
    //  - The loop must have less than 40 uops in all executed loop branches

    // The number of taken branches in a loop is hard to estimate here, and
    // benchmarking has revealed that it is better not to be conservative when
    // estimating the branch count. As a result, we'll ignore the branch limits
    // until someone finds a case where it matters in practice.

    unsigned MaxOps;
    const TargetSubtargetInfo *ST = getST();
    if (PartialUnrollingThreshold.getNumOccurrences() > 0)
      MaxOps = PartialUnrollingThreshold;
    else if (ST->getSchedModel().LoopMicroOpBufferSize > 0)
      MaxOps = ST->getSchedModel().LoopMicroOpBufferSize;
    else
      return;

    // Scan the loop: don't unroll loops with calls.
    for (BasicBlock *BB : L->blocks()) {
      for (Instruction &I : *BB) {
        if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
          if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
            if (!thisT()->isLoweredToCall(F))
              continue;
          }

          return;
        }
      }
    }

    // Enable runtime and partial unrolling up to the specified size.
    // Enable using trip count upper bound to unroll loops.
    UP.Partial = UP.Runtime = UP.UpperBound = true;
    UP.PartialThreshold = MaxOps;

    // Avoid unrolling when optimizing for size.
    UP.OptSizeThreshold = 0;
    UP.PartialOptSizeThreshold = 0;

    // Set number of instructions optimized when "back edge"
    // becomes "fall through" to default value of 2.
    UP.BEInsns = 2;
  }

  void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                             TTI::PeelingPreferences &PP) {
    PP.PeelCount = 0;
    PP.AllowPeeling = true;
    PP.AllowLoopNestsPeeling = false;
    PP.PeelProfiledIterations = true;
  }

  bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
                                AssumptionCache &AC,
                                TargetLibraryInfo *LibInfo,
                                HardwareLoopInfo &HWLoopInfo) {
    return BaseT::isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
  }

  bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
                                   AssumptionCache &AC, TargetLibraryInfo *TLI,
                                   DominatorTree *DT,
                                   const LoopAccessInfo *LAI) {
    return BaseT::preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LAI);
  }

  bool emitGetActiveLaneMask() {
    return BaseT::emitGetActiveLaneMask();
  }

  Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
                                               IntrinsicInst &II) {
    return BaseT::instCombineIntrinsic(IC, II);
  }

  Optional<Value *> simplifyDemandedUseBitsIntrinsic(InstCombiner &IC,
                                                     IntrinsicInst &II,
                                                     APInt DemandedMask,
                                                     KnownBits &Known,
                                                     bool &KnownBitsComputed) {
    return BaseT::simplifyDemandedUseBitsIntrinsic(IC, II, DemandedMask, Known,
                                                   KnownBitsComputed);
  }

  Optional<Value *> simplifyDemandedVectorEltsIntrinsic(
      InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
      APInt &UndefElts2, APInt &UndefElts3,
      std::function<void(Instruction *, unsigned, APInt, APInt &)>
          SimplifyAndSetOp) {
    return BaseT::simplifyDemandedVectorEltsIntrinsic(
        IC, II, DemandedElts, UndefElts, UndefElts2, UndefElts3,
        SimplifyAndSetOp);
  }

  InstructionCost getInstructionLatency(const Instruction *I) {
    if (isa<LoadInst>(I))
      return getST()->getSchedModel().DefaultLoadLatency;

    return BaseT::getInstructionLatency(I);
  }

  virtual Optional<unsigned>
  getCacheSize(TargetTransformInfo::CacheLevel Level) const {
    return Optional<unsigned>(
      getST()->getCacheSize(static_cast<unsigned>(Level)));
  }

  virtual Optional<unsigned>
  getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const {
    Optional<unsigned> TargetResult =
        getST()->getCacheAssociativity(static_cast<unsigned>(Level));

    if (TargetResult)
      return TargetResult;

    return BaseT::getCacheAssociativity(Level);
  }

  virtual unsigned getCacheLineSize() const {
    return getST()->getCacheLineSize();
  }

  virtual unsigned getPrefetchDistance() const {
    return getST()->getPrefetchDistance();
  }

  virtual unsigned getMinPrefetchStride(unsigned NumMemAccesses,
                                        unsigned NumStridedMemAccesses,
                                        unsigned NumPrefetches,
                                        bool HasCall) const {
    return getST()->getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses,
                                         NumPrefetches, HasCall);
  }

  virtual unsigned getMaxPrefetchIterationsAhead() const {
    return getST()->getMaxPrefetchIterationsAhead();
  }

  virtual bool enableWritePrefetching() const {
    return getST()->enableWritePrefetching();
  }

  /// @}

  /// \name Vector TTI Implementations
  /// @{

  TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
    return TypeSize::getFixed(32);
  }

  Optional<unsigned> getMaxVScale() const { return None; }

  /// Estimate the overhead of scalarizing an instruction. Insert and Extract
  /// are set if the demanded result elements need to be inserted and/or
  /// extracted from vectors.
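  /// For example, for a <4 x i32> value with all elements demanded, Insert
  /// accounts for four insertelement operations and Extract for four
  /// extractelement operations.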
  InstructionCost getScalarizationOverhead(VectorType *InTy,
                                           const APInt &DemandedElts,
                                           bool Insert, bool Extract) {
    /// FIXME: a bitfield is not a reasonable abstraction for talking about
    /// which elements are needed from a scalable vector
    auto *Ty = cast<FixedVectorType>(InTy);

    assert(DemandedElts.getBitWidth() == Ty->getNumElements() &&
           "Vector size mismatch");

    InstructionCost Cost = 0;

    for (int i = 0, e = Ty->getNumElements(); i < e; ++i) {
      if (!DemandedElts[i])
        continue;
      if (Insert)
        Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, Ty, i);
      if (Extract)
        Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, i);
    }

    return Cost;
  }

  /// Helper wrapper for the DemandedElts variant of getScalarizationOverhead.
  InstructionCost getScalarizationOverhead(VectorType *InTy, bool Insert,
                                           bool Extract) {
    auto *Ty = cast<FixedVectorType>(InTy);

    APInt DemandedElts = APInt::getAllOnesValue(Ty->getNumElements());
    return thisT()->getScalarizationOverhead(Ty, DemandedElts, Insert, Extract);
  }

  /// Estimate the overhead of scalarizing an instruction's unique
  /// non-constant operands. The (potentially vector) types to use for each
  /// argument are passed via Tys.
  InstructionCost getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
                                                   ArrayRef<Type *> Tys) {
    assert(Args.size() == Tys.size() && "Expected matching Args and Tys");

    InstructionCost Cost = 0;
    SmallPtrSet<const Value*, 4> UniqueOperands;
    for (int I = 0, E = Args.size(); I != E; I++) {
      // Disregard things like metadata arguments.
      const Value *A = Args[I];
      Type *Ty = Tys[I];
      if (!Ty->isIntOrIntVectorTy() && !Ty->isFPOrFPVectorTy() &&
          !Ty->isPtrOrPtrVectorTy())
        continue;

      if (!isa<Constant>(A) && UniqueOperands.insert(A).second) {
        if (auto *VecTy = dyn_cast<VectorType>(Ty))
          Cost += getScalarizationOverhead(VecTy, false, true);
      }
    }

    return Cost;
  }

  /// Estimate the overhead of scalarizing the inputs and outputs of an
  /// instruction, with return type RetTy and arguments Args of type Tys. If
  /// Args are unknown (empty), then the cost associated with one argument is
  /// added as a heuristic.
  InstructionCost getScalarizationOverhead(VectorType *RetTy,
                                           ArrayRef<const Value *> Args,
                                           ArrayRef<Type *> Tys) {
    InstructionCost Cost = getScalarizationOverhead(RetTy, true, false);
    if (!Args.empty())
      Cost += getOperandsScalarizationOverhead(Args, Tys);
    else
      // When no information on arguments is provided, we add the cost
      // associated with one argument as a heuristic.
      Cost += getScalarizationOverhead(RetTy, false, true);

    return Cost;
  }

  unsigned getMaxInterleaveFactor(unsigned VF) { return 1; }

  InstructionCost getArithmeticInstrCost(
      unsigned Opcode, Type *Ty,
      TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
      TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
      TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
      TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
      TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
      ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
      const Instruction *CxtI = nullptr) {
    // Check if any of the operands are vector operands.
    const TargetLoweringBase *TLI = getTLI();
    int ISD = TLI->InstructionOpcodeToISD(Opcode);
    assert(ISD && "Invalid opcode");

    // TODO: Handle more cost kinds.
    if (CostKind != TTI::TCK_RecipThroughput)
      return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind,
                                           Opd1Info, Opd2Info,
                                           Opd1PropInfo, Opd2PropInfo,
                                           Args, CxtI);

    std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);

    bool IsFloat = Ty->isFPOrFPVectorTy();
    // Assume that floating point arithmetic operations cost twice as much as
    // integer operations.
    InstructionCost OpCost = (IsFloat ? 2 : 1);

    if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
      // The operation is legal. Assume it costs 1.
      // TODO: Once we have extract/insert subvector cost we need to use them.
      return LT.first * OpCost;
    }

    if (!TLI->isOperationExpand(ISD, LT.second)) {
      // If the operation is custom lowered, then assume that the code is twice
      // as expensive.
      return LT.first * 2 * OpCost;
    }

    // An 'Expand' of URem and SRem is special because it may default
    // to expanding the operation into a sequence of sub-operations
    // i.e. X % Y -> X-(X/Y)*Y.
    if (ISD == ISD::UREM || ISD == ISD::SREM) {
      bool IsSigned = ISD == ISD::SREM;
      if (TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIVREM : ISD::UDIVREM,
                                        LT.second) ||
          TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIV : ISD::UDIV,
                                        LT.second)) {
        unsigned DivOpc = IsSigned ? Instruction::SDiv : Instruction::UDiv;
        InstructionCost DivCost = thisT()->getArithmeticInstrCost(
            DivOpc, Ty, CostKind, Opd1Info, Opd2Info, Opd1PropInfo,
            Opd2PropInfo);
        InstructionCost MulCost =
            thisT()->getArithmeticInstrCost(Instruction::Mul, Ty, CostKind);
        InstructionCost SubCost =
            thisT()->getArithmeticInstrCost(Instruction::Sub, Ty, CostKind);
        return DivCost + MulCost + SubCost;
      }
    }

    // We cannot scalarize scalable vectors, so return Invalid.
    if (isa<ScalableVectorType>(Ty))
      return InstructionCost::getInvalid();

    // Else, assume that we need to scalarize this op.
    // TODO: If one of the types get legalized by splitting, handle this
    // similarly to what getCastInstrCost() does.
    if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
      InstructionCost Cost = thisT()->getArithmeticInstrCost(
          Opcode, VTy->getScalarType(), CostKind, Opd1Info, Opd2Info,
          Opd1PropInfo, Opd2PropInfo, Args, CxtI);
      // Return the cost of multiple scalar invocations plus the cost of
      // inserting and extracting the values.
      SmallVector<Type *> Tys(Args.size(), Ty);
      return getScalarizationOverhead(VTy, Args, Tys) +
             VTy->getNumElements() * Cost;
    }

    // We don't know anything about this scalar instruction.
    return OpCost;
  }

  TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind,
                                              ArrayRef<int> Mask) const {
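    // A few refinements this performs, assuming 4-element sources: a
    // single-source mask <3,2,1,0> becomes SK_Reverse, a single-source mask
    // <0,0,0,0> becomes SK_Broadcast, and a two-source mask such as <0,5,2,7>
    // becomes SK_Select.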
    int Limit = Mask.size() * 2;
    if (Mask.empty() ||
        // Extra check required by isSingleSourceMaskImpl function (called by
        // ShuffleVectorInst::isSingleSourceMask).
        any_of(Mask, [Limit](int I) { return I >= Limit; }))
      return Kind;
    switch (Kind) {
    case TTI::SK_PermuteSingleSrc:
      if (ShuffleVectorInst::isReverseMask(Mask))
        return TTI::SK_Reverse;
      if (ShuffleVectorInst::isZeroEltSplatMask(Mask))
        return TTI::SK_Broadcast;
      break;
    case TTI::SK_PermuteTwoSrc:
      if (ShuffleVectorInst::isSelectMask(Mask))
        return TTI::SK_Select;
      if (ShuffleVectorInst::isTransposeMask(Mask))
        return TTI::SK_Transpose;
      break;
    case TTI::SK_Select:
    case TTI::SK_Reverse:
    case TTI::SK_Broadcast:
    case TTI::SK_Transpose:
    case TTI::SK_InsertSubvector:
    case TTI::SK_ExtractSubvector:
    case TTI::SK_Splice:
      break;
    }
    return Kind;
  }

  InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
                                 ArrayRef<int> Mask, int Index,
                                 VectorType *SubTp) {

    switch (improveShuffleKindFromMask(Kind, Mask)) {
    case TTI::SK_Broadcast:
      return getBroadcastShuffleOverhead(cast<FixedVectorType>(Tp));
    case TTI::SK_Select:
    case TTI::SK_Splice:
    case TTI::SK_Reverse:
    case TTI::SK_Transpose:
    case TTI::SK_PermuteSingleSrc:
    case TTI::SK_PermuteTwoSrc:
      return getPermuteShuffleOverhead(cast<FixedVectorType>(Tp));
    case TTI::SK_ExtractSubvector:
      return getExtractSubvectorOverhead(Tp, Index,
                                         cast<FixedVectorType>(SubTp));
    case TTI::SK_InsertSubvector:
      return getInsertSubvectorOverhead(Tp, Index,
                                        cast<FixedVectorType>(SubTp));
    }
    llvm_unreachable("Unknown TTI::ShuffleKind");
  }

  InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
                                   TTI::CastContextHint CCH,
                                   TTI::TargetCostKind CostKind,
                                   const Instruction *I = nullptr) {
    if (BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I) == 0)
      return 0;

    const TargetLoweringBase *TLI = getTLI();
    int ISD = TLI->InstructionOpcodeToISD(Opcode);
    assert(ISD && "Invalid opcode");
    std::pair<InstructionCost, MVT> SrcLT =
        TLI->getTypeLegalizationCost(DL, Src);
    std::pair<InstructionCost, MVT> DstLT =
        TLI->getTypeLegalizationCost(DL, Dst);

    TypeSize SrcSize = SrcLT.second.getSizeInBits();
    TypeSize DstSize = DstLT.second.getSizeInBits();
    bool IntOrPtrSrc = Src->isIntegerTy() || Src->isPointerTy();
    bool IntOrPtrDst = Dst->isIntegerTy() || Dst->isPointerTy();

    switch (Opcode) {
    default:
      break;
    case Instruction::Trunc:
      // Check for NOOP conversions.
      if (TLI->isTruncateFree(SrcLT.second, DstLT.second))
        return 0;
      LLVM_FALLTHROUGH;
    case Instruction::BitCast:
      // Bitcast between types that are legalized to the same type are free and
      // assume int to/from ptr of the same size is also free.
      if (SrcLT.first == DstLT.first && IntOrPtrSrc == IntOrPtrDst &&
          SrcSize == DstSize)
        return 0;
      break;
    case Instruction::FPExt:
      if (I && getTLI()->isExtFree(I))
        return 0;
      break;
    case Instruction::ZExt:
      if (TLI->isZExtFree(SrcLT.second, DstLT.second))
        return 0;
      LLVM_FALLTHROUGH;
    case Instruction::SExt:
      if (I && getTLI()->isExtFree(I))
        return 0;

      // If this is a zext/sext of a load, return 0 if the corresponding
      // extending load exists on target and the result type is legal.
      if (CCH == TTI::CastContextHint::Normal) {
        EVT ExtVT = EVT::getEVT(Dst);
        EVT LoadVT = EVT::getEVT(Src);
        unsigned LType =
          ((Opcode == Instruction::ZExt) ? ISD::ZEXTLOAD : ISD::SEXTLOAD);
        if (DstLT.first == SrcLT.first &&
            TLI->isLoadExtLegal(LType, ExtVT, LoadVT))
          return 0;
      }
      break;
    case Instruction::AddrSpaceCast:
      if (TLI->isFreeAddrSpaceCast(Src->getPointerAddressSpace(),
                                   Dst->getPointerAddressSpace()))
        return 0;
      break;
    }

    auto *SrcVTy = dyn_cast<VectorType>(Src);
    auto *DstVTy = dyn_cast<VectorType>(Dst);

    // If the cast is marked as legal (or promote) then assume low cost.
    if (SrcLT.first == DstLT.first &&
        TLI->isOperationLegalOrPromote(ISD, DstLT.second))
      return SrcLT.first;

    // Handle scalar conversions.
    if (!SrcVTy && !DstVTy) {
      // Just check the op cost. If the operation is legal then assume it costs
      // 1.
      if (!TLI->isOperationExpand(ISD, DstLT.second))
        return 1;

      // Assume that illegal scalar instructions are expensive.
      return 4;
    }

    // Check vector-to-vector casts.
    if (DstVTy && SrcVTy) {
      // If the cast is between same-sized registers, then the check is simple.
      if (SrcLT.first == DstLT.first && SrcSize == DstSize) {

        // Assume that Zext is done using AND.
        if (Opcode == Instruction::ZExt)
          return SrcLT.first;

        // Assume that sext is done using SHL and SRA.
        if (Opcode == Instruction::SExt)
          return SrcLT.first * 2;

        // Just check the op cost. If the operation is legal then assume it
        // costs 1 and multiply by the type-legalization overhead.
        if (!TLI->isOperationExpand(ISD, DstLT.second))
          return SrcLT.first * 1;
      }

      // If we are legalizing by splitting, query the concrete TTI for the cost
      // of casting the original vector twice. We also need to factor in the
      // cost of the split itself. Count that as 1, to be consistent with
      // TLI->getTypeLegalizationCost().
      bool SplitSrc =
          TLI->getTypeAction(Src->getContext(), TLI->getValueType(DL, Src)) ==
          TargetLowering::TypeSplitVector;
      bool SplitDst =
          TLI->getTypeAction(Dst->getContext(), TLI->getValueType(DL, Dst)) ==
          TargetLowering::TypeSplitVector;
      if ((SplitSrc || SplitDst) && SrcVTy->getElementCount().isVector() &&
          DstVTy->getElementCount().isVector()) {
        Type *SplitDstTy = VectorType::getHalfElementsVectorType(DstVTy);
        Type *SplitSrcTy = VectorType::getHalfElementsVectorType(SrcVTy);
        T *TTI = static_cast<T *>(this);
        // If both types need to be split then the split is free.
        InstructionCost SplitCost =
            (!SplitSrc || !SplitDst) ? TTI->getVectorSplitCost() : 0;
        return SplitCost +
               (2 * TTI->getCastInstrCost(Opcode, SplitDstTy, SplitSrcTy, CCH,
                                          CostKind, I));
      }

      // Scalarization cost is Invalid, can't assume any num elements.
      if (isa<ScalableVectorType>(DstVTy))
        return InstructionCost::getInvalid();

      // In other cases where the source or destination are illegal, assume
      // the operation will get scalarized.
      unsigned Num = cast<FixedVectorType>(DstVTy)->getNumElements();
      InstructionCost Cost = thisT()->getCastInstrCost(
          Opcode, Dst->getScalarType(), Src->getScalarType(), CCH, CostKind, I);

      // Return the cost of multiple scalar invocations plus the cost of
      // inserting and extracting the values.
      return getScalarizationOverhead(DstVTy, true, true) + Num * Cost;
    }

    // We already handled vector-to-vector and scalar-to-scalar conversions.
    // This is where we handle bitcast between vectors and scalars. We need to
    // assume that the conversion is scalarized in one way or another.
    if (Opcode == Instruction::BitCast) {
      // Illegal bitcasts are done by storing and loading from a stack slot.
      return (SrcVTy ? getScalarizationOverhead(SrcVTy, false, true) : 0) +
             (DstVTy ? getScalarizationOverhead(DstVTy, true, false) : 0);
    }

    llvm_unreachable("Unhandled cast");
  }

  InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst,
                                           VectorType *VecTy, unsigned Index) {
    return thisT()->getVectorInstrCost(Instruction::ExtractElement, VecTy,
                                       Index) +
           thisT()->getCastInstrCost(Opcode, Dst, VecTy->getElementType(),
                                     TTI::CastContextHint::None,
                                     TTI::TCK_RecipThroughput);
  }

  InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
                                 const Instruction *I = nullptr) {
    return BaseT::getCFInstrCost(Opcode, CostKind, I);
  }

  InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
                                     CmpInst::Predicate VecPred,
                                     TTI::TargetCostKind CostKind,
                                     const Instruction *I = nullptr) {
    const TargetLoweringBase *TLI = getTLI();
    int ISD = TLI->InstructionOpcodeToISD(Opcode);
    assert(ISD && "Invalid opcode");

    // TODO: Handle other cost kinds.
    if (CostKind != TTI::TCK_RecipThroughput)
      return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                       I);

    // Selects on vectors are actually vector selects.
    if (ISD == ISD::SELECT) {
      assert(CondTy && "CondTy must exist");
      if (CondTy->isVectorTy())
        ISD = ISD::VSELECT;
    }
    std::pair<InstructionCost, MVT> LT =
        TLI->getTypeLegalizationCost(DL, ValTy);

    if (!(ValTy->isVectorTy() && !LT.second.isVector()) &&
        !TLI->isOperationExpand(ISD, LT.second)) {
      // The operation is legal. Assume it costs 1. Multiply
      // by the type-legalization overhead.
      return LT.first * 1;
    }

    // Otherwise, assume that the operation is scalarized.
    // TODO: If one of the types get legalized by splitting, handle this
    // similarly to what getCastInstrCost() does.
    if (auto *ValVTy = dyn_cast<VectorType>(ValTy)) {
      unsigned Num = cast<FixedVectorType>(ValVTy)->getNumElements();
      if (CondTy)
        CondTy = CondTy->getScalarType();
      InstructionCost Cost = thisT()->getCmpSelInstrCost(
          Opcode, ValVTy->getScalarType(), CondTy, VecPred, CostKind, I);

      // Return the cost of multiple scalar invocations plus the cost of
      // inserting and extracting the values.
      return getScalarizationOverhead(ValVTy, true, false) + Num * Cost;
    }

    // Unknown scalar opcode.
    return 1;
  }

  InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
                                     unsigned Index) {
    std::pair<InstructionCost, MVT> LT =
        getTLI()->getTypeLegalizationCost(DL, Val->getScalarType());

    return LT.first;
  }

  InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src,
                                  MaybeAlign Alignment, unsigned AddressSpace,
                                  TTI::TargetCostKind CostKind,
                                  const Instruction *I = nullptr) {
    assert(!Src->isVoidTy() && "Invalid type");
    // Assume types, such as structs, are expensive.
    if (getTLI()->getValueType(DL, Src, true) == MVT::Other)
      return 4;
    std::pair<InstructionCost, MVT> LT =
        getTLI()->getTypeLegalizationCost(DL, Src);

    // Assuming that all loads of legal types cost 1.
    InstructionCost Cost = LT.first;
    if (CostKind != TTI::TCK_RecipThroughput)
      return Cost;

    if (Src->isVectorTy() &&
        // In practice it's not currently possible to have a change in lane
        // length for extending loads or truncating stores so both types should
        // have the same scalable property.
        TypeSize::isKnownLT(Src->getPrimitiveSizeInBits(),
                            LT.second.getSizeInBits())) {
      // This is a vector load that legalizes to a larger type than the vector
      // itself. Unless the corresponding extending load or truncating store is
      // legal, then this will scalarize.
      TargetLowering::LegalizeAction LA = TargetLowering::Expand;
      EVT MemVT = getTLI()->getValueType(DL, Src);
      if (Opcode == Instruction::Store)
        LA = getTLI()->getTruncStoreAction(LT.second, MemVT);
      else
        LA = getTLI()->getLoadExtAction(ISD::EXTLOAD, LT.second, MemVT);

      if (LA != TargetLowering::Legal && LA != TargetLowering::Custom) {
        // This is a vector load/store for some illegal type that is scalarized.
        // We must account for the cost of building or decomposing the vector.
        Cost += getScalarizationOverhead(cast<VectorType>(Src),
                                         Opcode != Instruction::Store,
                                         Opcode == Instruction::Store);
      }
    }

    return Cost;
  }

  InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy,
                                        Align Alignment, unsigned AddressSpace,
                                        TTI::TargetCostKind CostKind) {
    return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, true, false,
                                       CostKind);
  }

  InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
                                         const Value *Ptr, bool VariableMask,
                                         Align Alignment,
                                         TTI::TargetCostKind CostKind,
                                         const Instruction *I = nullptr) {
    return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, VariableMask,
                                       true, CostKind);
  }

  InstructionCost getInterleavedMemoryOpCost(
      unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
      Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
      bool UseMaskForCond = false, bool UseMaskForGaps = false) {
    auto *VT = cast<FixedVectorType>(VecTy);

    unsigned NumElts = VT->getNumElements();
    assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");

    unsigned NumSubElts = NumElts / Factor;
    auto *SubVT = FixedVectorType::get(VT->getElementType(), NumSubElts);

    // First, the cost of the load/store operation.
1178     InstructionCost Cost;
1179     if (UseMaskForCond || UseMaskForGaps)
1180       Cost = thisT()->getMaskedMemoryOpCost(Opcode, VecTy, Alignment,
1181                                             AddressSpace, CostKind);
1182     else
1183       Cost = thisT()->getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace,
1184                                       CostKind);
1185 
1186     // Legalize the vector type, and get the legalized and unlegalized type
1187     // sizes.
1188     MVT VecTyLT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
1189     unsigned VecTySize = thisT()->getDataLayout().getTypeStoreSize(VecTy);
1190     unsigned VecTyLTSize = VecTyLT.getStoreSize();
1191 
1192     // Scale the cost of the memory operation by the fraction of legalized
1193     // instructions that will actually be used. We shouldn't account for the
1194     // cost of dead instructions since they will be removed.
1195     //
1196     // E.g., An interleaved load of factor 8:
1197     //       %vec = load <16 x i64>, <16 x i64>* %ptr
1198     //       %v0 = shufflevector %vec, undef, <0, 8>
1199     //
1200     // If <16 x i64> is legalized to 8 v2i64 loads, only 2 of the loads will be
1201     // used (those corresponding to elements [0:1] and [8:9] of the unlegalized
1202     // type). The other loads are unused.
1203     //
1204     // We only scale the cost of loads since interleaved store groups aren't
1205     // allowed to have gaps.
1206     if (Opcode == Instruction::Load && VecTySize > VecTyLTSize) {
1207       // The number of loads of a legal type it will take to represent a load
1208       // of the unlegalized vector type.
1209       unsigned NumLegalInsts = divideCeil(VecTySize, VecTyLTSize);
1210 
1211       // The number of elements of the unlegalized type that correspond to a
1212       // single legal instruction.
1213       unsigned NumEltsPerLegalInst = divideCeil(NumElts, NumLegalInsts);
1214 
1215       // Determine which legal instructions will be used.
1216       BitVector UsedInsts(NumLegalInsts, false);
1217       for (unsigned Index : Indices)
1218         for (unsigned Elt = 0; Elt < NumSubElts; ++Elt)
1219           UsedInsts.set((Index + Elt * Factor) / NumEltsPerLegalInst);
1220 
1221       // Scale the cost of the load by the fraction of legal instructions that
1222       // will be used.
1223       Cost = (Cost * UsedInsts.count()) / NumLegalInsts;
1224     }
1225 
1226     // Next, add the cost of the interleave operation.
1227     if (Opcode == Instruction::Load) {
1228       // The interleave cost is modelled as extracting each sub vector's
1229       // elements from the wide vector and inserting them into the sub vectors.
1230       //
1231       // E.g. An interleaved load of factor 2 (with one member of index 0):
1232       //      %vec = load <8 x i32>, <8 x i32>* %ptr
1233       //      %v0 = shuffle %vec, undef, <0, 2, 4, 6>         ; Index 0
1234       // The cost is estimated as extracting elements at 0, 2, 4, 6 from the
1235       // <8 x i32> vector and inserting them into a <4 x i32> vector.
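      // For that example the loops below add 4 ExtractElement costs on the
      // <8 x i32> type and, since there is a single index, 4 InsertElement
      // costs on the <4 x i32> sub vector type.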
1236 
1237       assert(Indices.size() <= Factor &&
1238              "Interleaved memory op has too many members");
1239 
1240       for (unsigned Index : Indices) {
1241         assert(Index < Factor && "Invalid index for interleaved memory op");
1242 
1243         // Extract elements from loaded vector for each sub vector.
1244         for (unsigned i = 0; i < NumSubElts; i++)
1245           Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VT,
1246                                               Index + i * Factor);
1247       }
1248 
1249       InstructionCost InsSubCost = 0;
1250       for (unsigned i = 0; i < NumSubElts; i++)
1251         InsSubCost +=
1252             thisT()->getVectorInstrCost(Instruction::InsertElement, SubVT, i);
1253 
1254       Cost += Indices.size() * InsSubCost;
1255     } else {
1256       // The interleave cost is modelled as extracting all elements from the
1257       // sub vectors and inserting them into the wide vector.
1258       //
1259       // E.g. An interleaved store of factor 2:
1260       //      %v0_v1 = shuffle %v0, %v1, <0, 4, 1, 5, 2, 6, 3, 7>
1261       //      store <8 x i32> %interleaved.vec, <8 x i32>* %ptr
1262       // The cost is estimated as extracting all elements from both <4 x i32>
1263       // vectors and inserting them into the <8 x i32> vector.
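      // For that example this amounts to 8 ExtractElement costs on the
      // <4 x i32> sub vector type (4 per sub vector, Factor == 2) and 8
      // InsertElement costs on the <8 x i32> wide vector type.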
1264 
1265       InstructionCost ExtSubCost = 0;
1266       for (unsigned i = 0; i < NumSubElts; i++)
1267         ExtSubCost +=
1268             thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVT, i);
1269       Cost += ExtSubCost * Factor;
1270 
1271       for (unsigned i = 0; i < NumElts; i++)
1272         Cost +=
1273             thisT()->getVectorInstrCost(Instruction::InsertElement, VT, i);
1274     }
1275 
1276     if (!UseMaskForCond)
1277       return Cost;
1278 
1279     Type *I8Type = Type::getInt8Ty(VT->getContext());
1280     auto *MaskVT = FixedVectorType::get(I8Type, NumElts);
1281     SubVT = FixedVectorType::get(I8Type, NumSubElts);
1282 
1283     // The mask shuffling cost is modelled as extracting all the elements of
1284     // the mask and inserting each of them Factor times into the wide vector:
1285     //
1286     // E.g. an interleaved group with factor 3:
1287     //    %mask = icmp ult <8 x i32> %vec1, %vec2
1288     //    %interleaved.mask = shufflevector <8 x i1> %mask, <8 x i1> undef,
1289     //        <24 x i32> <0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7>
1290     // The cost is estimated as extracting all mask elements from the <8 x i1>
1291     // mask vector and inserting them Factor times into the <24 x i1> shuffled
1292     // mask vector.
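    // For that factor-3 example this adds NumSubElts == 8 ExtractElement costs
    // and NumElts == 24 InsertElement costs; note that the i1 mask elements
    // are modelled with i8 vector types here.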
1293     for (unsigned i = 0; i < NumSubElts; i++)
1294       Cost +=
1295           thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVT, i);
1296 
1297     for (unsigned i = 0; i < NumElts; i++)
1298       Cost +=
1299           thisT()->getVectorInstrCost(Instruction::InsertElement, MaskVT, i);
1300 
1301     // The Gaps mask is invariant and created outside the loop; therefore, the
1302     // cost of creating it is not accounted for here. However, if we have both
1303     // a MaskForGaps and some other mask that guards the execution of the
1304     // memory access, we need to account for the cost of And-ing the two masks
1305     // inside the loop.
1306     if (UseMaskForGaps)
1307       Cost += thisT()->getArithmeticInstrCost(BinaryOperator::And, MaskVT,
1308                                               CostKind);
1309 
1310     return Cost;
1311   }
1312 
1313   /// Get intrinsic cost based on arguments.
1314   InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1315                                         TTI::TargetCostKind CostKind) {
1316     // Check for generically free intrinsics.
1317     if (BaseT::getIntrinsicInstrCost(ICA, CostKind) == 0)
1318       return 0;
1319 
1320     // Assume that target intrinsics are cheap.
1321     Intrinsic::ID IID = ICA.getID();
1322     if (Function::isTargetIntrinsic(IID))
1323       return TargetTransformInfo::TCC_Basic;
1324 
1325     if (ICA.isTypeBasedOnly())
1326       return getTypeBasedIntrinsicInstrCost(ICA, CostKind);
1327 
1328     Type *RetTy = ICA.getReturnType();
1329 
1330     ElementCount RetVF =
1331         (RetTy->isVectorTy() ? cast<VectorType>(RetTy)->getElementCount()
1332                              : ElementCount::getFixed(1));
1333     const IntrinsicInst *I = ICA.getInst();
1334     const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
1335     FastMathFlags FMF = ICA.getFlags();
1336     switch (IID) {
1337     default:
1338       break;
1339 
1340     case Intrinsic::cttz:
1341       // FIXME: If necessary, this should go in target-specific overrides.
1342       if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCttz())
1343         return TargetTransformInfo::TCC_Basic;
1344       break;
1345 
1346     case Intrinsic::ctlz:
1347       // FIXME: If necessary, this should go in target-specific overrides.
1348       if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCtlz())
1349         return TargetTransformInfo::TCC_Basic;
1350       break;
1351 
1352     case Intrinsic::memcpy:
1353       return thisT()->getMemcpyCost(ICA.getInst());
1354 
1355     case Intrinsic::masked_scatter: {
1356       const Value *Mask = Args[3];
1357       bool VarMask = !isa<Constant>(Mask);
1358       Align Alignment = cast<ConstantInt>(Args[2])->getAlignValue();
1359       return thisT()->getGatherScatterOpCost(Instruction::Store,
1360                                              ICA.getArgTypes()[0], Args[1],
1361                                              VarMask, Alignment, CostKind, I);
1362     }
1363     case Intrinsic::masked_gather: {
1364       const Value *Mask = Args[2];
1365       bool VarMask = !isa<Constant>(Mask);
1366       Align Alignment = cast<ConstantInt>(Args[1])->getAlignValue();
1367       return thisT()->getGatherScatterOpCost(Instruction::Load, RetTy, Args[0],
1368                                              VarMask, Alignment, CostKind, I);
1369     }
1370     case Intrinsic::experimental_stepvector: {
1371       if (isa<ScalableVectorType>(RetTy))
1372         return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1373       // The cost of materialising a constant integer vector.
1374       return TargetTransformInfo::TCC_Basic;
1375     }
1376     case Intrinsic::experimental_vector_extract: {
1377       // FIXME: Handle case where a scalable vector is extracted from a scalable
1378       // vector
1379       if (isa<ScalableVectorType>(RetTy))
1380         return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1381       unsigned Index = cast<ConstantInt>(Args[1])->getZExtValue();
1382       return thisT()->getShuffleCost(TTI::SK_ExtractSubvector,
1383                                      cast<VectorType>(Args[0]->getType()), None,
1384                                      Index, cast<VectorType>(RetTy));
1385     }
1386     case Intrinsic::experimental_vector_insert: {
1387       // FIXME: Handle case where a scalable vector is inserted into a scalable
1388       // vector
1389       if (isa<ScalableVectorType>(Args[1]->getType()))
1390         return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1391       unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue();
1392       return thisT()->getShuffleCost(
1393           TTI::SK_InsertSubvector, cast<VectorType>(Args[0]->getType()), None,
1394           Index, cast<VectorType>(Args[1]->getType()));
1395     }
1396     case Intrinsic::experimental_vector_reverse: {
1397       return thisT()->getShuffleCost(TTI::SK_Reverse,
1398                                      cast<VectorType>(Args[0]->getType()), None,
1399                                      0, cast<VectorType>(RetTy));
1400     }
1401     case Intrinsic::experimental_vector_splice: {
1402       unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue();
1403       return thisT()->getShuffleCost(TTI::SK_Splice,
1404                                      cast<VectorType>(Args[0]->getType()), None,
1405                                      Index, cast<VectorType>(RetTy));
1406     }
1407     case Intrinsic::vector_reduce_add:
1408     case Intrinsic::vector_reduce_mul:
1409     case Intrinsic::vector_reduce_and:
1410     case Intrinsic::vector_reduce_or:
1411     case Intrinsic::vector_reduce_xor:
1412     case Intrinsic::vector_reduce_smax:
1413     case Intrinsic::vector_reduce_smin:
1414     case Intrinsic::vector_reduce_fmax:
1415     case Intrinsic::vector_reduce_fmin:
1416     case Intrinsic::vector_reduce_umax:
1417     case Intrinsic::vector_reduce_umin: {
1418       IntrinsicCostAttributes Attrs(IID, RetTy, Args[0]->getType(), FMF, I, 1);
1419       return getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1420     }
1421     case Intrinsic::vector_reduce_fadd:
1422     case Intrinsic::vector_reduce_fmul: {
1423       IntrinsicCostAttributes Attrs(
1424           IID, RetTy, {Args[0]->getType(), Args[1]->getType()}, FMF, I, 1);
1425       return getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1426     }
1427     case Intrinsic::fshl:
1428     case Intrinsic::fshr: {
1429       if (isa<ScalableVectorType>(RetTy))
1430         return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1431       const Value *X = Args[0];
1432       const Value *Y = Args[1];
1433       const Value *Z = Args[2];
1434       TTI::OperandValueProperties OpPropsX, OpPropsY, OpPropsZ, OpPropsBW;
1435       TTI::OperandValueKind OpKindX = TTI::getOperandInfo(X, OpPropsX);
1436       TTI::OperandValueKind OpKindY = TTI::getOperandInfo(Y, OpPropsY);
1437       TTI::OperandValueKind OpKindZ = TTI::getOperandInfo(Z, OpPropsZ);
1438       TTI::OperandValueKind OpKindBW = TTI::OK_UniformConstantValue;
1439       OpPropsBW = isPowerOf2_32(RetTy->getScalarSizeInBits()) ? TTI::OP_PowerOf2
1440                                                               : TTI::OP_None;
1441       // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
1442       // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
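      // The expansion above maps onto the costs added below: one Or, one Sub
      // (for BW - (Z % BW)), one Shl and one LShr; a URem is only needed when
      // the shift amount is not a constant, and for non-rotates (X != Y) an
      // extra icmp+select pair guards the shift-by-zero case.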
1443       InstructionCost Cost = 0;
1444       Cost +=
1445           thisT()->getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind);
1446       Cost +=
1447           thisT()->getArithmeticInstrCost(BinaryOperator::Sub, RetTy, CostKind);
1448       Cost += thisT()->getArithmeticInstrCost(
1449           BinaryOperator::Shl, RetTy, CostKind, OpKindX, OpKindZ, OpPropsX);
1450       Cost += thisT()->getArithmeticInstrCost(
1451           BinaryOperator::LShr, RetTy, CostKind, OpKindY, OpKindZ, OpPropsY);
1452       // Non-constant shift amounts require a modulo operation.
1453       if (OpKindZ != TTI::OK_UniformConstantValue &&
1454           OpKindZ != TTI::OK_NonUniformConstantValue)
1455         Cost += thisT()->getArithmeticInstrCost(BinaryOperator::URem, RetTy,
1456                                                 CostKind, OpKindZ, OpKindBW,
1457                                                 OpPropsZ, OpPropsBW);
1458       // For non-rotates (X != Y) we must add shift-by-zero handling costs.
1459       if (X != Y) {
1460         Type *CondTy = RetTy->getWithNewBitWidth(1);
1461         Cost +=
1462             thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
1463                                         CmpInst::BAD_ICMP_PREDICATE, CostKind);
1464         Cost +=
1465             thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1466                                         CmpInst::BAD_ICMP_PREDICATE, CostKind);
1467       }
1468       return Cost;
1469     }
1470     }
1471 
1472     // Assume that we need to scalarize this intrinsic.
1473     // Compute the scalarization overhead based on Args for a vector
1474     // intrinsic.
1475     InstructionCost ScalarizationCost = InstructionCost::getInvalid();
1476     if (RetVF.isVector() && !RetVF.isScalable()) {
1477       ScalarizationCost = 0;
1478       if (!RetTy->isVoidTy())
1479         ScalarizationCost +=
1480             getScalarizationOverhead(cast<VectorType>(RetTy), true, false);
1481       ScalarizationCost +=
1482           getOperandsScalarizationOverhead(Args, ICA.getArgTypes());
1483     }
1484 
1485     IntrinsicCostAttributes Attrs(IID, RetTy, ICA.getArgTypes(), FMF, I,
1486                                   ScalarizationCost);
1487     return thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1488   }
1489 
1490   /// Get intrinsic cost based on argument types.
1491   /// If the scalarization cost passed in \p ICA is invalid, the cost of
1492   /// scalarizing the arguments and the return value will be computed based
1493   /// on types.
1494   InstructionCost
1495   getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1496                                  TTI::TargetCostKind CostKind) {
1497     Intrinsic::ID IID = ICA.getID();
1498     Type *RetTy = ICA.getReturnType();
1499     const SmallVectorImpl<Type *> &Tys = ICA.getArgTypes();
1500     FastMathFlags FMF = ICA.getFlags();
1501     InstructionCost ScalarizationCostPassed = ICA.getScalarizationCost();
1502     bool SkipScalarizationCost = ICA.skipScalarizationCost();
1503 
1504     VectorType *VecOpTy = nullptr;
1505     if (!Tys.empty()) {
1506       // The vector reduction operand is operand 0 except for fadd/fmul.
1507       // Their operand 0 is a scalar start value, so the vector op is operand 1.
1508       unsigned VecTyIndex = 0;
1509       if (IID == Intrinsic::vector_reduce_fadd ||
1510           IID == Intrinsic::vector_reduce_fmul)
1511         VecTyIndex = 1;
1512       assert(Tys.size() > VecTyIndex && "Unexpected IntrinsicCostAttributes");
1513       VecOpTy = dyn_cast<VectorType>(Tys[VecTyIndex]);
1514     }
1515 
1516     // Library call cost - other than size, make it expensive.
1517     unsigned SingleCallCost = CostKind == TTI::TCK_CodeSize ? 1 : 10;
1518     SmallVector<unsigned, 2> ISDs;
1519     switch (IID) {
1520     default: {
1521       // Scalable vectors cannot be scalarized, so return Invalid.
1522       if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) {
1523             return isa<ScalableVectorType>(Ty);
1524           }))
1525         return InstructionCost::getInvalid();
1526 
1527       // Assume that we need to scalarize this intrinsic.
1528       InstructionCost ScalarizationCost =
1529           SkipScalarizationCost ? ScalarizationCostPassed : 0;
1530       unsigned ScalarCalls = 1;
1531       Type *ScalarRetTy = RetTy;
1532       if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
1533         if (!SkipScalarizationCost)
1534           ScalarizationCost = getScalarizationOverhead(RetVTy, true, false);
1535         ScalarCalls = std::max(ScalarCalls,
1536                                cast<FixedVectorType>(RetVTy)->getNumElements());
1537         ScalarRetTy = RetTy->getScalarType();
1538       }
1539       SmallVector<Type *, 4> ScalarTys;
1540       for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
1541         Type *Ty = Tys[i];
1542         if (auto *VTy = dyn_cast<VectorType>(Ty)) {
1543           if (!SkipScalarizationCost)
1544             ScalarizationCost += getScalarizationOverhead(VTy, false, true);
1545           ScalarCalls = std::max(ScalarCalls,
1546                                  cast<FixedVectorType>(VTy)->getNumElements());
1547           Ty = Ty->getScalarType();
1548         }
1549         ScalarTys.push_back(Ty);
1550       }
1551       if (ScalarCalls == 1)
1552         return 1; // Return cost of a scalar intrinsic. Assume it to be cheap.
1553 
1554       IntrinsicCostAttributes ScalarAttrs(IID, ScalarRetTy, ScalarTys, FMF);
1555       InstructionCost ScalarCost =
1556           thisT()->getIntrinsicInstrCost(ScalarAttrs, CostKind);
1557 
1558       return ScalarCalls * ScalarCost + ScalarizationCost;
1559     }
1560     // Look for intrinsics that can be lowered directly or turned into a scalar
1561     // intrinsic call.
1562     case Intrinsic::sqrt:
1563       ISDs.push_back(ISD::FSQRT);
1564       break;
1565     case Intrinsic::sin:
1566       ISDs.push_back(ISD::FSIN);
1567       break;
1568     case Intrinsic::cos:
1569       ISDs.push_back(ISD::FCOS);
1570       break;
1571     case Intrinsic::exp:
1572       ISDs.push_back(ISD::FEXP);
1573       break;
1574     case Intrinsic::exp2:
1575       ISDs.push_back(ISD::FEXP2);
1576       break;
1577     case Intrinsic::log:
1578       ISDs.push_back(ISD::FLOG);
1579       break;
1580     case Intrinsic::log10:
1581       ISDs.push_back(ISD::FLOG10);
1582       break;
1583     case Intrinsic::log2:
1584       ISDs.push_back(ISD::FLOG2);
1585       break;
1586     case Intrinsic::fabs:
1587       ISDs.push_back(ISD::FABS);
1588       break;
1589     case Intrinsic::canonicalize:
1590       ISDs.push_back(ISD::FCANONICALIZE);
1591       break;
1592     case Intrinsic::minnum:
1593       ISDs.push_back(ISD::FMINNUM);
1594       break;
1595     case Intrinsic::maxnum:
1596       ISDs.push_back(ISD::FMAXNUM);
1597       break;
1598     case Intrinsic::minimum:
1599       ISDs.push_back(ISD::FMINIMUM);
1600       break;
1601     case Intrinsic::maximum:
1602       ISDs.push_back(ISD::FMAXIMUM);
1603       break;
1604     case Intrinsic::copysign:
1605       ISDs.push_back(ISD::FCOPYSIGN);
1606       break;
1607     case Intrinsic::floor:
1608       ISDs.push_back(ISD::FFLOOR);
1609       break;
1610     case Intrinsic::ceil:
1611       ISDs.push_back(ISD::FCEIL);
1612       break;
1613     case Intrinsic::trunc:
1614       ISDs.push_back(ISD::FTRUNC);
1615       break;
1616     case Intrinsic::nearbyint:
1617       ISDs.push_back(ISD::FNEARBYINT);
1618       break;
1619     case Intrinsic::rint:
1620       ISDs.push_back(ISD::FRINT);
1621       break;
1622     case Intrinsic::round:
1623       ISDs.push_back(ISD::FROUND);
1624       break;
1625     case Intrinsic::roundeven:
1626       ISDs.push_back(ISD::FROUNDEVEN);
1627       break;
1628     case Intrinsic::pow:
1629       ISDs.push_back(ISD::FPOW);
1630       break;
1631     case Intrinsic::fma:
1632       ISDs.push_back(ISD::FMA);
1633       break;
1634     case Intrinsic::fmuladd:
1635       ISDs.push_back(ISD::FMA);
1636       break;
1637     case Intrinsic::experimental_constrained_fmuladd:
1638       ISDs.push_back(ISD::STRICT_FMA);
1639       break;
1640     // FIXME: We should return 0 whenever getIntrinsicCost == TCC_Free.
1641     case Intrinsic::lifetime_start:
1642     case Intrinsic::lifetime_end:
1643     case Intrinsic::sideeffect:
1644     case Intrinsic::pseudoprobe:
1645     case Intrinsic::arithmetic_fence:
1646       return 0;
1647     case Intrinsic::masked_store: {
1648       Type *Ty = Tys[0];
1649       Align TyAlign = thisT()->DL.getABITypeAlign(Ty);
1650       return thisT()->getMaskedMemoryOpCost(Instruction::Store, Ty, TyAlign, 0,
1651                                             CostKind);
1652     }
1653     case Intrinsic::masked_load: {
1654       Type *Ty = RetTy;
1655       Align TyAlign = thisT()->DL.getABITypeAlign(Ty);
1656       return thisT()->getMaskedMemoryOpCost(Instruction::Load, Ty, TyAlign, 0,
1657                                             CostKind);
1658     }
1659     case Intrinsic::vector_reduce_add:
1660       return thisT()->getArithmeticReductionCost(Instruction::Add, VecOpTy,
1661                                                  None, CostKind);
1662     case Intrinsic::vector_reduce_mul:
1663       return thisT()->getArithmeticReductionCost(Instruction::Mul, VecOpTy,
1664                                                  None, CostKind);
1665     case Intrinsic::vector_reduce_and:
1666       return thisT()->getArithmeticReductionCost(Instruction::And, VecOpTy,
1667                                                  None, CostKind);
1668     case Intrinsic::vector_reduce_or:
1669       return thisT()->getArithmeticReductionCost(Instruction::Or, VecOpTy, None,
1670                                                  CostKind);
1671     case Intrinsic::vector_reduce_xor:
1672       return thisT()->getArithmeticReductionCost(Instruction::Xor, VecOpTy,
1673                                                  None, CostKind);
1674     case Intrinsic::vector_reduce_fadd:
1675       return thisT()->getArithmeticReductionCost(Instruction::FAdd, VecOpTy,
1676                                                  FMF, CostKind);
1677     case Intrinsic::vector_reduce_fmul:
1678       return thisT()->getArithmeticReductionCost(Instruction::FMul, VecOpTy,
1679                                                  FMF, CostKind);
1680     case Intrinsic::vector_reduce_smax:
1681     case Intrinsic::vector_reduce_smin:
1682     case Intrinsic::vector_reduce_fmax:
1683     case Intrinsic::vector_reduce_fmin:
1684       return thisT()->getMinMaxReductionCost(
1685           VecOpTy, cast<VectorType>(CmpInst::makeCmpResultType(VecOpTy)),
1686           /*IsUnsigned=*/false, CostKind);
1687     case Intrinsic::vector_reduce_umax:
1688     case Intrinsic::vector_reduce_umin:
1689       return thisT()->getMinMaxReductionCost(
1690           VecOpTy, cast<VectorType>(CmpInst::makeCmpResultType(VecOpTy)),
1691           /*IsUnsigned=*/true, CostKind);
1692     case Intrinsic::abs:
1693     case Intrinsic::smax:
1694     case Intrinsic::smin:
1695     case Intrinsic::umax:
1696     case Intrinsic::umin: {
1697       // abs(X) = select(icmp(X,0),X,sub(0,X))
1698       // minmax(X,Y) = select(icmp(X,Y),X,Y)
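      // E.g. a scalar i32 smax is costed as one icmp plus one select; abs
      // additionally pays for the sub(0,X) operand as a Sub with a uniform
      // constant operand.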
1699       Type *CondTy = RetTy->getWithNewBitWidth(1);
1700       InstructionCost Cost = 0;
1701       // TODO: Ideally getCmpSelInstrCost would accept an icmp condition code.
1702       Cost +=
1703           thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
1704                                       CmpInst::BAD_ICMP_PREDICATE, CostKind);
1705       Cost +=
1706           thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1707                                       CmpInst::BAD_ICMP_PREDICATE, CostKind);
1708       // TODO: Should we add an OperandValueProperties::OP_Zero property?
1709       if (IID == Intrinsic::abs)
1710         Cost += thisT()->getArithmeticInstrCost(
1711             BinaryOperator::Sub, RetTy, CostKind, TTI::OK_UniformConstantValue);
1712       return Cost;
1713     }
1714     case Intrinsic::sadd_sat:
1715     case Intrinsic::ssub_sat: {
1716       Type *CondTy = RetTy->getWithNewBitWidth(1);
1717 
1718       Type *OpTy = StructType::create({RetTy, CondTy});
1719       Intrinsic::ID OverflowOp = IID == Intrinsic::sadd_sat
1720                                      ? Intrinsic::sadd_with_overflow
1721                                      : Intrinsic::ssub_with_overflow;
1722 
1723       // SatMax -> Overflow && SumDiff < 0
1724       // SatMin -> Overflow && SumDiff >= 0
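      // The estimate below is the cost of the matching *_with_overflow
      // intrinsic plus one icmp (the sign test on SumDiff) and two selects
      // that pick between SatMax, SatMin and the plain sum/difference.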
1725       InstructionCost Cost = 0;
1726       IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
1727                                     nullptr, ScalarizationCostPassed);
1728       Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
1729       Cost +=
1730           thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
1731                                       CmpInst::BAD_ICMP_PREDICATE, CostKind);
1732       Cost += 2 * thisT()->getCmpSelInstrCost(
1733                       BinaryOperator::Select, RetTy, CondTy,
1734                       CmpInst::BAD_ICMP_PREDICATE, CostKind);
1735       return Cost;
1736     }
1737     case Intrinsic::uadd_sat:
1738     case Intrinsic::usub_sat: {
1739       Type *CondTy = RetTy->getWithNewBitWidth(1);
1740 
1741       Type *OpTy = StructType::create({RetTy, CondTy});
1742       Intrinsic::ID OverflowOp = IID == Intrinsic::uadd_sat
1743                                      ? Intrinsic::uadd_with_overflow
1744                                      : Intrinsic::usub_with_overflow;
1745 
1746       InstructionCost Cost = 0;
1747       IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
1748                                     nullptr, ScalarizationCostPassed);
1749       Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
1750       Cost +=
1751           thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1752                                       CmpInst::BAD_ICMP_PREDICATE, CostKind);
1753       return Cost;
1754     }
1755     case Intrinsic::smul_fix:
1756     case Intrinsic::umul_fix: {
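      // Fixed-point multiply is modelled by widening: extend both operands to
      // twice the bit width, multiply, truncate the low and high halves back,
      // and recombine them with an lshr/shl/or sequence according to the
      // scale operand.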
1757       unsigned ExtSize = RetTy->getScalarSizeInBits() * 2;
1758       Type *ExtTy = RetTy->getWithNewBitWidth(ExtSize);
1759 
1760       unsigned ExtOp =
1761           IID == Intrinsic::smul_fix ? Instruction::SExt : Instruction::ZExt;
1762       TTI::CastContextHint CCH = TTI::CastContextHint::None;
1763 
1764       InstructionCost Cost = 0;
1765       Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, RetTy, CCH, CostKind);
1766       Cost +=
1767           thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
1768       Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, RetTy, ExtTy,
1769                                             CCH, CostKind);
1770       Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, RetTy,
1771                                               CostKind, TTI::OK_AnyValue,
1772                                               TTI::OK_UniformConstantValue);
1773       Cost += thisT()->getArithmeticInstrCost(Instruction::Shl, RetTy, CostKind,
1774                                               TTI::OK_AnyValue,
1775                                               TTI::OK_UniformConstantValue);
1776       Cost += thisT()->getArithmeticInstrCost(Instruction::Or, RetTy, CostKind);
1777       return Cost;
1778     }
1779     case Intrinsic::sadd_with_overflow:
1780     case Intrinsic::ssub_with_overflow: {
1781       Type *SumTy = RetTy->getContainedType(0);
1782       Type *OverflowTy = RetTy->getContainedType(1);
1783       unsigned Opcode = IID == Intrinsic::sadd_with_overflow
1784                             ? BinaryOperator::Add
1785                             : BinaryOperator::Sub;
1786 
1787       //   LHSSign -> LHS >= 0
1788       //   RHSSign -> RHS >= 0
1789       //   SumSign -> Sum >= 0
1790       //
1791       //   Add:
1792       //   Overflow -> (LHSSign == RHSSign) && (LHSSign != SumSign)
1793       //   Sub:
1794       //   Overflow -> (LHSSign != RHSSign) && (LHSSign != SumSign)
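      // This translates into the costs below: the add/sub itself, three sign
      // comparisons, two selects combining them, and a final And on the
      // overflow type.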
1795       InstructionCost Cost = 0;
1796       Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind);
1797       Cost += 3 * thisT()->getCmpSelInstrCost(
1798                       Instruction::ICmp, SumTy, OverflowTy,
1799                       CmpInst::BAD_ICMP_PREDICATE, CostKind);
1800       Cost += 2 * thisT()->getCmpSelInstrCost(
1801                       Instruction::Select, OverflowTy, OverflowTy,
1802                       CmpInst::BAD_ICMP_PREDICATE, CostKind);
1803       Cost += thisT()->getArithmeticInstrCost(BinaryOperator::And, OverflowTy,
1804                                               CostKind);
1805       return Cost;
1806     }
1807     case Intrinsic::uadd_with_overflow:
1808     case Intrinsic::usub_with_overflow: {
1809       Type *SumTy = RetTy->getContainedType(0);
1810       Type *OverflowTy = RetTy->getContainedType(1);
1811       unsigned Opcode = IID == Intrinsic::uadd_with_overflow
1812                             ? BinaryOperator::Add
1813                             : BinaryOperator::Sub;
1814 
1815       InstructionCost Cost = 0;
1816       Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind);
1817       Cost +=
1818           thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SumTy, OverflowTy,
1819                                       CmpInst::BAD_ICMP_PREDICATE, CostKind);
1820       return Cost;
1821     }
1822     case Intrinsic::smul_with_overflow:
1823     case Intrinsic::umul_with_overflow: {
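      // The overflow check is modelled like the fixed-point multiply above:
      // extend, multiply in the wider type, truncate, and compare the high
      // half of the result against the sign/zero bits implied by the low
      // half (smul_with_overflow needs an extra arithmetic shift to form
      // that reference value).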
1824       Type *MulTy = RetTy->getContainedType(0);
1825       Type *OverflowTy = RetTy->getContainedType(1);
1826       unsigned ExtSize = MulTy->getScalarSizeInBits() * 2;
1827       Type *ExtTy = MulTy->getWithNewBitWidth(ExtSize);
1828 
1829       unsigned ExtOp = IID == Intrinsic::smul_with_overflow
1830                            ? Instruction::SExt : Instruction::ZExt;
1831       TTI::CastContextHint CCH = TTI::CastContextHint::None;
1832 
1833       InstructionCost Cost = 0;
1834       Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, MulTy, CCH, CostKind);
1835       Cost +=
1836           thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
1837       Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, MulTy, ExtTy,
1838                                             CCH, CostKind);
1839       Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, MulTy,
1840                                               CostKind, TTI::OK_AnyValue,
1841                                               TTI::OK_UniformConstantValue);
1842 
1843       if (IID == Intrinsic::smul_with_overflow)
1844         Cost += thisT()->getArithmeticInstrCost(Instruction::AShr, MulTy,
1845                                                 CostKind, TTI::OK_AnyValue,
1846                                                 TTI::OK_UniformConstantValue);
1847 
1848       Cost +=
1849           thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, MulTy, OverflowTy,
1850                                       CmpInst::BAD_ICMP_PREDICATE, CostKind);
1851       return Cost;
1852     }
1853     case Intrinsic::ctpop:
1854       ISDs.push_back(ISD::CTPOP);
1855       // In case of legalization use TCC_Expensive. This is cheaper than a
1856       // library call but still not a cheap instruction.
1857       SingleCallCost = TargetTransformInfo::TCC_Expensive;
1858       break;
1859     case Intrinsic::ctlz:
1860       ISDs.push_back(ISD::CTLZ);
1861       break;
1862     case Intrinsic::cttz:
1863       ISDs.push_back(ISD::CTTZ);
1864       break;
1865     case Intrinsic::bswap:
1866       ISDs.push_back(ISD::BSWAP);
1867       break;
1868     case Intrinsic::bitreverse:
1869       ISDs.push_back(ISD::BITREVERSE);
1870       break;
1871     }
1872 
1873     const TargetLoweringBase *TLI = getTLI();
1874     std::pair<InstructionCost, MVT> LT =
1875         TLI->getTypeLegalizationCost(DL, RetTy);
1876 
1877     SmallVector<InstructionCost, 2> LegalCost;
1878     SmallVector<InstructionCost, 2> CustomCost;
1879     for (unsigned ISD : ISDs) {
1880       if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
1881         if (IID == Intrinsic::fabs && LT.second.isFloatingPoint() &&
1882             TLI->isFAbsFree(LT.second)) {
1883           return 0;
1884         }
1885 
1886         // The operation is legal. Assume it costs 1.
1887         // If the type is split into multiple registers, assume that there is some
1888         // overhead to this.
1889         // TODO: Once we have extract/insert subvector cost we need to use them.
1890         if (LT.first > 1)
1891           LegalCost.push_back(LT.first * 2);
1892         else
1893           LegalCost.push_back(LT.first * 1);
1894       } else if (!TLI->isOperationExpand(ISD, LT.second)) {
1895         // If the operation is custom lowered then assume
1896         // that the code is twice as expensive.
1897         CustomCost.push_back(LT.first * 2);
1898       }
1899     }
1900 
1901     auto *MinLegalCostI = std::min_element(LegalCost.begin(), LegalCost.end());
1902     if (MinLegalCostI != LegalCost.end())
1903       return *MinLegalCostI;
1904 
1905     auto MinCustomCostI =
1906         std::min_element(CustomCost.begin(), CustomCost.end());
1907     if (MinCustomCostI != CustomCost.end())
1908       return *MinCustomCostI;
1909 
1910     // If we can't lower fmuladd into an FMA, estimate the cost as a floating
1911     // point mul followed by an add.
1912     if (IID == Intrinsic::fmuladd)
1913       return thisT()->getArithmeticInstrCost(BinaryOperator::FMul, RetTy,
1914                                              CostKind) +
1915              thisT()->getArithmeticInstrCost(BinaryOperator::FAdd, RetTy,
1916                                              CostKind);
1917     if (IID == Intrinsic::experimental_constrained_fmuladd) {
1918       IntrinsicCostAttributes FMulAttrs(
1919         Intrinsic::experimental_constrained_fmul, RetTy, Tys);
1920       IntrinsicCostAttributes FAddAttrs(
1921         Intrinsic::experimental_constrained_fadd, RetTy, Tys);
1922       return thisT()->getIntrinsicInstrCost(FMulAttrs, CostKind) +
1923              thisT()->getIntrinsicInstrCost(FAddAttrs, CostKind);
1924     }
1925 
1926     // Else, assume that we need to scalarize this intrinsic. For math builtins
1927     // this will emit a costly libcall, adding call overhead and spills. Make it
1928     // very expensive.
1929     if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
1930       // Scalable vectors cannot be scalarized, so return Invalid.
1931       if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) {
1932             return isa<ScalableVectorType>(Ty);
1933           }))
1934         return InstructionCost::getInvalid();
1935 
1936       InstructionCost ScalarizationCost =
1937           SkipScalarizationCost ? ScalarizationCostPassed
1938                                 : getScalarizationOverhead(RetVTy, true, false);
1939 
1940       unsigned ScalarCalls = cast<FixedVectorType>(RetVTy)->getNumElements();
1941       SmallVector<Type *, 4> ScalarTys;
1942       for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
1943         Type *Ty = Tys[i];
1944         if (Ty->isVectorTy())
1945           Ty = Ty->getScalarType();
1946         ScalarTys.push_back(Ty);
1947       }
1948       IntrinsicCostAttributes Attrs(IID, RetTy->getScalarType(), ScalarTys, FMF);
1949       InstructionCost ScalarCost =
1950           thisT()->getIntrinsicInstrCost(Attrs, CostKind);
1951       for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
1952         if (auto *VTy = dyn_cast<VectorType>(Tys[i])) {
1953           if (!ICA.skipScalarizationCost())
1954             ScalarizationCost += getScalarizationOverhead(VTy, false, true);
1955           ScalarCalls = std::max(ScalarCalls,
1956                                  cast<FixedVectorType>(VTy)->getNumElements());
1957         }
1958       }
1959       return ScalarCalls * ScalarCost + ScalarizationCost;
1960     }
1961 
1962     // This is going to be turned into a library call; make it expensive.
1963     return SingleCallCost;
1964   }
1965 
1966   /// Compute a cost of the given call instruction.
1967   ///
1968   /// Compute the cost of calling function F with return type RetTy and
1969   /// argument types Tys. F might be nullptr; in this case the cost of an
1970   /// arbitrary call with the specified signature will be returned.
1971   /// This is used, for instance, when we estimate the cost of calling a
1972   /// vector counterpart of the given function.
1973   /// \param F Called function, might be nullptr.
1974   /// \param RetTy Return value type.
1975   /// \param Tys Argument types.
1976   /// \returns The cost of Call instruction.
1977   InstructionCost
1978   getCallInstrCost(Function *F, Type *RetTy, ArrayRef<Type *> Tys,
1979                    TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency) {
1980     return 10;
1981   }
1982 
1983   unsigned getNumberOfParts(Type *Tp) {
1984     std::pair<InstructionCost, MVT> LT =
1985         getTLI()->getTypeLegalizationCost(DL, Tp);
1986     return *LT.first.getValue();
1987   }
1988 
1989   InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *,
1990                                             const SCEV *) {
1991     return 0;
1992   }
1993 
1994   /// Try to calculate arithmetic and shuffle op costs for reduction intrinsics.
1995   /// We assume that the reduction operation is performed in the following way:
1996   ///
1997   /// %val1 = shufflevector<n x t> %val, <n x t> %undef,
1998   /// <n x i32> <i32 n/2, i32 n/2 + 1, ..., i32 n, i32 undef, ..., i32 undef>
1999   ///            \----------------v-------------/  \----------v------------/
2000   ///                            n/2 elements               n/2 elements
2001   /// %red1 = op <n x t> %val, <n x t> %val1
2002   /// After this operation we have a vector %red1 where only the first n/2
2003   /// elements are meaningful; the second n/2 elements are undefined and can be
2004   /// dropped. All other operations are actually working with the vector of
2005   /// length n/2, not n, though the real vector length is still n.
2006   /// %val2 = shufflevector<n x t> %red1, <n x t> %undef,
2007   /// <n x i32> <i32 n/4, i32 n/4 + 1, ..., i32 n/2, i32 undef, ..., i32 undef>
2008   ///            \----------------v-------------/  \----------v------------/
2009   ///                            n/4 elements               3*n/4 elements
2010   /// %red2 = op <n x t> %red1, <n x t> %val2  - working with the vector of
2011   /// length n/2, the resulting vector has length n/4, etc.
2012   ///
2013   /// The cost model should take into account that the actual length of the
2014   /// vector is reduced on each iteration.
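  ///
  /// For example, assuming the whole vector is legal on the target, a
  /// <8 x i32> add reduction is modelled as 3 shuffle + 3 add levels followed
  /// by an extraction of element 0.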
2015   InstructionCost getTreeReductionCost(unsigned Opcode, VectorType *Ty,
2016                                        TTI::TargetCostKind CostKind) {
2017     Type *ScalarTy = Ty->getElementType();
2018     unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
2019     if ((Opcode == Instruction::Or || Opcode == Instruction::And) &&
2020         ScalarTy == IntegerType::getInt1Ty(Ty->getContext()) &&
2021         NumVecElts >= 2) {
2022       // Or reduction for i1 is represented as:
2023       // %val = bitcast <ReduxWidth x i1> to iReduxWidth
2024       // %res = cmp ne iReduxWidth %val, 0
2025       // And reduction for i1 is represented as:
2026       // %val = bitcast <ReduxWidth x i1> to iReduxWidth
2027       // %res = cmp eq iReduxWidth %val, -1 (i.e. all ones)
2028       Type *ValTy = IntegerType::get(Ty->getContext(), NumVecElts);
2029       return thisT()->getCastInstrCost(Instruction::BitCast, ValTy, Ty,
2030                                        TTI::CastContextHint::None, CostKind) +
2031              thisT()->getCmpSelInstrCost(Instruction::ICmp, ValTy,
2032                                          CmpInst::makeCmpResultType(ValTy),
2033                                          CmpInst::BAD_ICMP_PREDICATE, CostKind);
2034     }
2035     unsigned NumReduxLevels = Log2_32(NumVecElts);
2036     InstructionCost ArithCost = 0;
2037     InstructionCost ShuffleCost = 0;
2038     std::pair<InstructionCost, MVT> LT =
2039         thisT()->getTLI()->getTypeLegalizationCost(DL, Ty);
2040     unsigned LongVectorCount = 0;
2041     unsigned MVTLen =
2042         LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
2043     while (NumVecElts > MVTLen) {
2044       NumVecElts /= 2;
2045       VectorType *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
2046       ShuffleCost += thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, None,
2047                                              NumVecElts, SubTy);
2048       ArithCost += thisT()->getArithmeticInstrCost(Opcode, SubTy, CostKind);
2049       Ty = SubTy;
2050       ++LongVectorCount;
2051     }
2052 
2053     NumReduxLevels -= LongVectorCount;
2054 
2055     // The minimal length of the vector is limited by the real length of vector
2056     // operations performed on the current platform. That's why several final
2057     // reduction operations are performed on the vectors with the same
2058     // architecture-dependent length.
2059 
2060     // By default reductions need one shuffle per reduction level.
2061     ShuffleCost += NumReduxLevels * thisT()->getShuffleCost(
2062                                      TTI::SK_PermuteSingleSrc, Ty, None, 0, Ty);
2063     ArithCost += NumReduxLevels * thisT()->getArithmeticInstrCost(Opcode, Ty);
2064     return ShuffleCost + ArithCost +
2065            thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
2066   }
2067 
2068   /// Try to calculate the cost of performing strict (in-order) reductions,
2069   /// which involves doing a sequence of floating point additions in lane
2070   /// order, starting with an initial value. For example, consider a scalar
2071   /// initial value 'InitVal' of type float and a vector of type <4 x float>:
2072   ///
2073   ///   Vector = <float %v0, float %v1, float %v2, float %v3>
2074   ///
2075   ///   %add1 = %InitVal + %v0
2076   ///   %add2 = %add1 + %v1
2077   ///   %add3 = %add2 + %v2
2078   ///   %add4 = %add3 + %v3
2079   ///
2080   /// As a simple estimate we can say the cost of such a reduction is 4 times
2081   /// the cost of a scalar FP addition. We can only estimate the costs for
2082   /// fixed-width vectors here because for scalable vectors we do not know the
2083   /// runtime number of operations.
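  ///
  /// For the <4 x float> example above this amounts to four scalar fadd costs
  /// plus the scalarization overhead of extracting the four lanes.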
2084   InstructionCost getOrderedReductionCost(unsigned Opcode, VectorType *Ty,
2085                                           TTI::TargetCostKind CostKind) {
2086     // Targets must implement a default value for the scalable case, since
2087     // we don't know how many lanes the vector has.
2088     if (isa<ScalableVectorType>(Ty))
2089       return InstructionCost::getInvalid();
2090 
2091     auto *VTy = cast<FixedVectorType>(Ty);
2092     InstructionCost ExtractCost =
2093         getScalarizationOverhead(VTy, /*Insert=*/false, /*Extract=*/true);
2094     InstructionCost ArithCost = thisT()->getArithmeticInstrCost(
2095         Opcode, VTy->getElementType(), CostKind);
2096     ArithCost *= VTy->getNumElements();
2097 
2098     return ExtractCost + ArithCost;
2099   }
2100 
2101   InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
2102                                              Optional<FastMathFlags> FMF,
2103                                              TTI::TargetCostKind CostKind) {
2104     if (TTI::requiresOrderedReduction(FMF))
2105       return getOrderedReductionCost(Opcode, Ty, CostKind);
2106     return getTreeReductionCost(Opcode, Ty, CostKind);
2107   }
2108 
2109   /// Try to calculate op costs for min/max reduction operations.
2110   /// \param CondTy Conditional type for the Select instruction.
2111   InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
2112                                          bool IsUnsigned,
2113                                          TTI::TargetCostKind CostKind) {
2114     Type *ScalarTy = Ty->getElementType();
2115     Type *ScalarCondTy = CondTy->getElementType();
2116     unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
2117     unsigned NumReduxLevels = Log2_32(NumVecElts);
2118     unsigned CmpOpcode;
2119     if (Ty->isFPOrFPVectorTy()) {
2120       CmpOpcode = Instruction::FCmp;
2121     } else {
2122       assert(Ty->isIntOrIntVectorTy() &&
2123              "expecting floating point or integer type for min/max reduction");
2124       CmpOpcode = Instruction::ICmp;
2125     }
2126     InstructionCost MinMaxCost = 0;
2127     InstructionCost ShuffleCost = 0;
2128     std::pair<InstructionCost, MVT> LT =
2129         thisT()->getTLI()->getTypeLegalizationCost(DL, Ty);
2130     unsigned LongVectorCount = 0;
2131     unsigned MVTLen =
2132         LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
2133     while (NumVecElts > MVTLen) {
2134       NumVecElts /= 2;
2135       auto *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
2136       CondTy = FixedVectorType::get(ScalarCondTy, NumVecElts);
2137 
2138       ShuffleCost += thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, None,
2139                                              NumVecElts, SubTy);
2140       MinMaxCost +=
2141           thisT()->getCmpSelInstrCost(CmpOpcode, SubTy, CondTy,
2142                                       CmpInst::BAD_ICMP_PREDICATE, CostKind) +
2143           thisT()->getCmpSelInstrCost(Instruction::Select, SubTy, CondTy,
2144                                       CmpInst::BAD_ICMP_PREDICATE, CostKind);
2145       Ty = SubTy;
2146       ++LongVectorCount;
2147     }
2148 
2149     NumReduxLevels -= LongVectorCount;
2150 
2151     // The minimal length of the vector is limited by the real length of vector
2152     // operations performed on the current platform. That's why several final
2153     // reduction operations are performed on the vectors with the same
2154     // architecture-dependent length.
2155     ShuffleCost += NumReduxLevels * thisT()->getShuffleCost(
2156                                      TTI::SK_PermuteSingleSrc, Ty, None, 0, Ty);
2157     MinMaxCost +=
2158         NumReduxLevels *
2159         (thisT()->getCmpSelInstrCost(CmpOpcode, Ty, CondTy,
2160                                      CmpInst::BAD_ICMP_PREDICATE, CostKind) +
2161          thisT()->getCmpSelInstrCost(Instruction::Select, Ty, CondTy,
2162                                      CmpInst::BAD_ICMP_PREDICATE, CostKind));
2163     // The last min/max should be in vector registers and we counted it above.
2164     // So just need a single extractelement.
2165     return ShuffleCost + MinMaxCost +
2166            thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
2167   }
2168 
2169   InstructionCost getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned,
2170                                               Type *ResTy, VectorType *Ty,
2171                                               TTI::TargetCostKind CostKind) {
2172     // Without any native support, this is equivalent to the cost of
2173     // vecreduce.add(ext) or, if IsMLA, vecreduce.add(mul(ext, ext)).
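    // E.g. an unsigned reduction of <8 x i8> into i32 is costed as a zext to
    // <8 x i32> plus an <8 x i32> add reduction; the MLA form additionally
    // pays for an <8 x i32> mul and a second extend.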
2174     VectorType *ExtTy = VectorType::get(ResTy, Ty);
2175     InstructionCost RedCost = thisT()->getArithmeticReductionCost(
2176         Instruction::Add, ExtTy, None, CostKind);
2177     InstructionCost MulCost = 0;
2178     InstructionCost ExtCost = thisT()->getCastInstrCost(
2179         IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty,
2180         TTI::CastContextHint::None, CostKind);
2181     if (IsMLA) {
2182       MulCost =
2183           thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
2184       ExtCost *= 2;
2185     }
2186 
2187     return RedCost + MulCost + ExtCost;
2188   }
2189 
2190   InstructionCost getVectorSplitCost() { return 1; }
2191 
2192   /// @}
2193 };
2194 
2195 /// Concrete BasicTTIImpl that can be used if no further customization
2196 /// is needed.
2197 class BasicTTIImpl : public BasicTTIImplBase<BasicTTIImpl> {
2198   using BaseT = BasicTTIImplBase<BasicTTIImpl>;
2199 
2200   friend class BasicTTIImplBase<BasicTTIImpl>;
2201 
2202   const TargetSubtargetInfo *ST;
2203   const TargetLoweringBase *TLI;
2204 
2205   const TargetSubtargetInfo *getST() const { return ST; }
2206   const TargetLoweringBase *getTLI() const { return TLI; }
2207 
2208 public:
2209   explicit BasicTTIImpl(const TargetMachine *TM, const Function &F);
2210 };
2211 
2212 } // end namespace llvm
2213 
2214 #endif // LLVM_CODEGEN_BASICTTIIMPL_H
2215