1 //===- BasicTTIImpl.h -------------------------------------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// This file provides a helper that implements much of the TTI interface in
11 /// terms of the target-independent code generator and TargetLowering
12 /// interfaces.
13 //
14 //===----------------------------------------------------------------------===//
15 
16 #ifndef LLVM_CODEGEN_BASICTTIIMPL_H
17 #define LLVM_CODEGEN_BASICTTIIMPL_H
18 
19 #include "llvm/ADT/APInt.h"
20 #include "llvm/ADT/ArrayRef.h"
21 #include "llvm/ADT/BitVector.h"
22 #include "llvm/ADT/SmallPtrSet.h"
23 #include "llvm/ADT/SmallVector.h"
24 #include "llvm/Analysis/LoopInfo.h"
25 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
26 #include "llvm/Analysis/TargetTransformInfo.h"
27 #include "llvm/Analysis/TargetTransformInfoImpl.h"
28 #include "llvm/CodeGen/ISDOpcodes.h"
29 #include "llvm/CodeGen/TargetLowering.h"
30 #include "llvm/CodeGen/TargetSubtargetInfo.h"
31 #include "llvm/CodeGen/ValueTypes.h"
32 #include "llvm/IR/BasicBlock.h"
33 #include "llvm/IR/Constant.h"
34 #include "llvm/IR/Constants.h"
35 #include "llvm/IR/DataLayout.h"
36 #include "llvm/IR/DerivedTypes.h"
37 #include "llvm/IR/InstrTypes.h"
38 #include "llvm/IR/Instruction.h"
39 #include "llvm/IR/Instructions.h"
40 #include "llvm/IR/Intrinsics.h"
41 #include "llvm/IR/Operator.h"
42 #include "llvm/IR/Type.h"
43 #include "llvm/IR/Value.h"
44 #include "llvm/Support/Casting.h"
45 #include "llvm/Support/CommandLine.h"
46 #include "llvm/Support/ErrorHandling.h"
47 #include "llvm/Support/MachineValueType.h"
48 #include "llvm/Support/MathExtras.h"
49 #include "llvm/Target/TargetMachine.h"
50 #include <algorithm>
51 #include <cassert>
52 #include <cstdint>
53 #include <limits>
54 #include <utility>
55 
56 namespace llvm {
57 
58 class Function;
59 class GlobalValue;
60 class LLVMContext;
61 class ScalarEvolution;
62 class SCEV;
63 class TargetMachine;
64 
65 extern cl::opt<unsigned> PartialUnrollingThreshold;
66 
67 /// Base class which can be used to help build a TTI implementation.
68 ///
69 /// This class provides as much implementation of the TTI interface as is
70 /// possible using the target independent parts of the code generator.
71 ///
72 /// In order to subclass it, your class must implement a getST() method to
73 /// return the subtarget, and a getTLI() method to return the target lowering.
74 /// We need these methods implemented in the derived class so that this class
75 /// doesn't have to duplicate storage for them.
76 template <typename T>
77 class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
78 private:
79   using BaseT = TargetTransformInfoImplCRTPBase<T>;
80   using TTI = TargetTransformInfo;
81 
82   /// Helper function to access this as a T.
83   T *thisT() { return static_cast<T *>(this); }
84 
85   /// Estimate the cost of a broadcast shuffle as an extract plus a sequence
86   /// of insert operations.
87   InstructionCost getBroadcastShuffleOverhead(FixedVectorType *VTy) {
88     InstructionCost Cost = 0;
89     // Broadcast cost is equal to the cost of extracting the zero'th element
90     // plus the cost of inserting it into every element of the result vector.
91     Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, 0);
92 
93     for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
94       Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, i);
95     }
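    // For example, broadcasting a <4 x float> is modeled as one extract of
    // element 0 plus four inserts; assuming a unit cost for each, that is 5.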
96     return Cost;
97   }
98 
99   /// Estimate the cost of a shuffle as a sequence of extract and insert
100   /// operations.
101   InstructionCost getPermuteShuffleOverhead(FixedVectorType *VTy) {
102     InstructionCost Cost = 0;
103     // Shuffle cost is equal to the cost of extracting elements from the
104     // source vectors plus the cost of inserting them into the result vector.
105 
106     // e.g. a <4 x float> shuffle with mask <0,5,2,7> needs to extract
107     // index 0 of the first vector, index 1 of the second vector, index 2 of
108     // the first vector, and finally index 3 of the second vector, and insert
109     // them at indices <0,1,2,3> of the result vector.
110     for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
111       Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, i);
112       Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, i);
113     }
114     return Cost;
115   }
116 
117   /// Estimate the cost of subvector extraction as a sequence of extract and
118   /// insert operations.
119   InstructionCost getExtractSubvectorOverhead(VectorType *VTy, int Index,
120                                        FixedVectorType *SubVTy) {
121     assert(VTy && SubVTy &&
122            "Can only extract subvectors from vectors");
123     int NumSubElts = SubVTy->getNumElements();
124     assert((!isa<FixedVectorType>(VTy) ||
125             (Index + NumSubElts) <=
126                 (int)cast<FixedVectorType>(VTy)->getNumElements()) &&
127            "SK_ExtractSubvector index out of range");
128 
129     InstructionCost Cost = 0;
130     // Subvector extraction cost is equal to the cost of extracting elements
131     // from the source vector plus the cost of inserting them into the result
132     // subvector.
133     for (int i = 0; i != NumSubElts; ++i) {
134       Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
135                                           i + Index);
136       Cost +=
137           thisT()->getVectorInstrCost(Instruction::InsertElement, SubVTy, i);
138     }
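    // For example, extracting a <2 x i32> subvector at Index 2 from an
    // <8 x i32> source is modeled as extracting source elements 2 and 3 and
    // inserting them at positions 0 and 1 of the subvector.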
139     return Cost;
140   }
141 
142   /// Estimate the cost of subvector insertion as a sequence of extract and
143   /// insert operations.
144   InstructionCost getInsertSubvectorOverhead(VectorType *VTy, int Index,
145                                       FixedVectorType *SubVTy) {
146     assert(VTy && SubVTy &&
147            "Can only insert subvectors into vectors");
148     int NumSubElts = SubVTy->getNumElements();
149     assert((!isa<FixedVectorType>(VTy) ||
150             (Index + NumSubElts) <=
151                 (int)cast<FixedVectorType>(VTy)->getNumElements()) &&
152            "SK_InsertSubvector index out of range");
153 
154     InstructionCost Cost = 0;
155     // Subvector insertion cost is equal to the cost of extracting elements
156     // from the subvector plus the cost of inserting them into the result
157     // vector.
158     for (int i = 0; i != NumSubElts; ++i) {
159       Cost +=
160           thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVTy, i);
161       Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy,
162                                           i + Index);
163     }
164     return Cost;
165   }
166 
167   /// Local query method delegates up to T which *must* implement this!
168   const TargetSubtargetInfo *getST() const {
169     return static_cast<const T *>(this)->getST();
170   }
171 
172   /// Local query method delegates up to T which *must* implement this!
173   const TargetLoweringBase *getTLI() const {
174     return static_cast<const T *>(this)->getTLI();
175   }
176 
177   static ISD::MemIndexedMode getISDIndexedMode(TTI::MemIndexedMode M) {
178     switch (M) {
179       case TTI::MIM_Unindexed:
180         return ISD::UNINDEXED;
181       case TTI::MIM_PreInc:
182         return ISD::PRE_INC;
183       case TTI::MIM_PreDec:
184         return ISD::PRE_DEC;
185       case TTI::MIM_PostInc:
186         return ISD::POST_INC;
187       case TTI::MIM_PostDec:
188         return ISD::POST_DEC;
189     }
190     llvm_unreachable("Unexpected MemIndexedMode");
191   }
192 
193   InstructionCost getCommonMaskedMemoryOpCost(unsigned Opcode, Type *DataTy,
194                                               Align Alignment,
195                                               bool VariableMask,
196                                               bool IsGatherScatter,
197                                               TTI::TargetCostKind CostKind) {
198     auto *VT = cast<FixedVectorType>(DataTy);
199     // Assume the target does not have support for gather/scatter operations
200     // and provide a rough estimate.
201     //
202     // First, compute the cost of the individual memory operations.
203     InstructionCost AddrExtractCost =
204         IsGatherScatter
205             ? getVectorInstrCost(Instruction::ExtractElement,
206                                  FixedVectorType::get(
207                                      PointerType::get(VT->getElementType(), 0),
208                                      VT->getNumElements()),
209                                  -1)
210             : 0;
211     InstructionCost LoadCost =
212         VT->getNumElements() *
213         (AddrExtractCost +
214          getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind));
215 
216     // Next, compute the cost of packing the result in a vector.
217     InstructionCost PackingCost = getScalarizationOverhead(
218         VT, Opcode != Instruction::Store, Opcode == Instruction::Store);
219 
220     InstructionCost ConditionalCost = 0;
221     if (VariableMask) {
222       // Compute the cost of conditionally executing the memory operations with
223       // variable masks. This includes extracting the individual conditions, the
224       // branches and PHIs to combine the results.
225       // NOTE: Estimating the cost of conditionally executing the memory
226       // operations accurately is quite difficult and the current solution
227       // provides a very rough estimate only.
228       ConditionalCost =
229           VT->getNumElements() *
230           (getVectorInstrCost(
231                Instruction::ExtractElement,
232                FixedVectorType::get(Type::getInt1Ty(DataTy->getContext()),
233                                     VT->getNumElements()),
234                -1) +
235            getCFInstrCost(Instruction::Br, CostKind) +
236            getCFInstrCost(Instruction::PHI, CostKind));
237     }
238 
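    // Overall this models roughly: NumElts * (address extract for
    // gather/scatter + scalar memory op), plus packing (loads) or unpacking
    // (stores) of the value vector, plus NumElts * (mask bit extract + branch
    // + PHI) when the mask is variable.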
239     return LoadCost + PackingCost + ConditionalCost;
240   }
241 
242 protected:
243   explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL)
244       : BaseT(DL) {}
245   virtual ~BasicTTIImplBase() = default;
246 
247   using TargetTransformInfoImplBase::DL;
248 
249 public:
250   /// \name Scalar TTI Implementations
251   /// @{
252   bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth,
253                                       unsigned AddressSpace, Align Alignment,
254                                       bool *Fast) const {
255     EVT E = EVT::getIntegerVT(Context, BitWidth);
256     return getTLI()->allowsMisalignedMemoryAccesses(
257         E, AddressSpace, Alignment, MachineMemOperand::MONone, Fast);
258   }
259 
260   bool hasBranchDivergence() { return false; }
261 
262   bool useGPUDivergenceAnalysis() { return false; }
263 
264   bool isSourceOfDivergence(const Value *V) { return false; }
265 
266   bool isAlwaysUniform(const Value *V) { return false; }
267 
268   unsigned getFlatAddressSpace() {
269     // Return an invalid address space.
270     return -1;
271   }
272 
273   bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
274                                   Intrinsic::ID IID) const {
275     return false;
276   }
277 
278   bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const {
279     return getTLI()->getTargetMachine().isNoopAddrSpaceCast(FromAS, ToAS);
280   }
281 
282   unsigned getAssumedAddrSpace(const Value *V) const {
283     return getTLI()->getTargetMachine().getAssumedAddrSpace(V);
284   }
285 
286   Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
287                                           Value *NewV) const {
288     return nullptr;
289   }
290 
291   bool isLegalAddImmediate(int64_t imm) {
292     return getTLI()->isLegalAddImmediate(imm);
293   }
294 
295   bool isLegalICmpImmediate(int64_t imm) {
296     return getTLI()->isLegalICmpImmediate(imm);
297   }
298 
299   bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
300                              bool HasBaseReg, int64_t Scale,
301                              unsigned AddrSpace, Instruction *I = nullptr) {
302     TargetLoweringBase::AddrMode AM;
303     AM.BaseGV = BaseGV;
304     AM.BaseOffs = BaseOffset;
305     AM.HasBaseReg = HasBaseReg;
306     AM.Scale = Scale;
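    // For example, a query for an address of the form
    // [BaseReg + 4*IndexReg + 16] arrives here as HasBaseReg == true,
    // Scale == 4 and BaseOffset == 16; the target's
    // TargetLoweringBase::isLegalAddressingMode makes the final decision.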
307     return getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace, I);
308   }
309 
310   bool isIndexedLoadLegal(TTI::MemIndexedMode M, Type *Ty,
311                           const DataLayout &DL) const {
312     EVT VT = getTLI()->getValueType(DL, Ty);
313     return getTLI()->isIndexedLoadLegal(getISDIndexedMode(M), VT);
314   }
315 
316   bool isIndexedStoreLegal(TTI::MemIndexedMode M, Type *Ty,
317                            const DataLayout &DL) const {
318     EVT VT = getTLI()->getValueType(DL, Ty);
319     return getTLI()->isIndexedStoreLegal(getISDIndexedMode(M), VT);
320   }
321 
322   bool isLSRCostLess(TTI::LSRCost C1, TTI::LSRCost C2) {
323     return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
324   }
325 
326   bool isNumRegsMajorCostOfLSR() {
327     return TargetTransformInfoImplBase::isNumRegsMajorCostOfLSR();
328   }
329 
330   bool isProfitableLSRChainElement(Instruction *I) {
331     return TargetTransformInfoImplBase::isProfitableLSRChainElement(I);
332   }
333 
334   InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
335                                        int64_t BaseOffset, bool HasBaseReg,
336                                        int64_t Scale, unsigned AddrSpace) {
337     TargetLoweringBase::AddrMode AM;
338     AM.BaseGV = BaseGV;
339     AM.BaseOffs = BaseOffset;
340     AM.HasBaseReg = HasBaseReg;
341     AM.Scale = Scale;
342     return getTLI()->getScalingFactorCost(DL, AM, Ty, AddrSpace);
343   }
344 
345   bool isTruncateFree(Type *Ty1, Type *Ty2) {
346     return getTLI()->isTruncateFree(Ty1, Ty2);
347   }
348 
349   bool isProfitableToHoist(Instruction *I) {
350     return getTLI()->isProfitableToHoist(I);
351   }
352 
353   bool useAA() const { return getST()->useAA(); }
354 
355   bool isTypeLegal(Type *Ty) {
356     EVT VT = getTLI()->getValueType(DL, Ty);
357     return getTLI()->isTypeLegal(VT);
358   }
359 
360   InstructionCost getRegUsageForType(Type *Ty) {
361     InstructionCost Val = getTLI()->getTypeLegalizationCost(DL, Ty).first;
362     assert(Val >= 0 && "Negative cost!");
363     return Val;
364   }
365 
366   InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr,
367                              ArrayRef<const Value *> Operands,
368                              TTI::TargetCostKind CostKind) {
369     return BaseT::getGEPCost(PointeeType, Ptr, Operands, CostKind);
370   }
371 
372   unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
373                                             unsigned &JumpTableSize,
374                                             ProfileSummaryInfo *PSI,
375                                             BlockFrequencyInfo *BFI) {
376     /// Try to find the estimated number of clusters. Note that the number of
377     /// clusters identified in this function could be different from the actual
378     /// numbers found in lowering. This function ignores switches that are
379     /// lowered with a mix of jump table / bit test / BTree. It was initially
380     /// intended for estimating the cost of a switch in the inline cost
381     /// heuristic, but it is a generic cost model usable in other places as
382     /// well (e.g., in loop unrolling).
383     unsigned N = SI.getNumCases();
384     const TargetLoweringBase *TLI = getTLI();
385     const DataLayout &DL = this->getDataLayout();
386 
387     JumpTableSize = 0;
388     bool IsJTAllowed = TLI->areJTsAllowed(SI.getParent()->getParent());
389 
390     // Early exit if neither a jump table nor a bit test is feasible.
391     if (N < 1 || (!IsJTAllowed && DL.getIndexSizeInBits(0u) < N))
392       return N;
393 
394     APInt MaxCaseVal = SI.case_begin()->getCaseValue()->getValue();
395     APInt MinCaseVal = MaxCaseVal;
396     for (auto CI : SI.cases()) {
397       const APInt &CaseVal = CI.getCaseValue()->getValue();
398       if (CaseVal.sgt(MaxCaseVal))
399         MaxCaseVal = CaseVal;
400       if (CaseVal.slt(MinCaseVal))
401         MinCaseVal = CaseVal;
402     }
403 
404     // Check if suitable for a bit test
405     if (N <= DL.getIndexSizeInBits(0u)) {
406       SmallPtrSet<const BasicBlock *, 4> Dests;
407       for (auto I : SI.cases())
408         Dests.insert(I.getCaseSuccessor());
409 
410       if (TLI->isSuitableForBitTests(Dests.size(), N, MinCaseVal, MaxCaseVal,
411                                      DL))
412         return 1;
413     }
414 
415     // Check if suitable for a jump table.
416     if (IsJTAllowed) {
417       if (N < 2 || N < TLI->getMinimumJumpTableEntries())
418         return N;
419       uint64_t Range =
420           (MaxCaseVal - MinCaseVal)
421               .getLimitedValue(std::numeric_limits<uint64_t>::max() - 1) + 1;
422       // Check whether a range of clusters is dense enough for a jump table
423       if (TLI->isSuitableForJumpTable(&SI, N, Range, PSI, BFI)) {
424         JumpTableSize = Range;
425         return 1;
426       }
427     }
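    // For example, a switch over cases {0, 2, 4, 6} that is not suitable for a
    // bit test has Range == 7; if jump tables are allowed and the target
    // considers that range dense enough, the whole switch is a single cluster
    // with JumpTableSize == 7, otherwise each case is its own cluster and
    // N == 4 is returned here.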
428     return N;
429   }
430 
431   bool shouldBuildLookupTables() {
432     const TargetLoweringBase *TLI = getTLI();
433     return TLI->isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) ||
434            TLI->isOperationLegalOrCustom(ISD::BRIND, MVT::Other);
435   }
436 
437   bool shouldBuildRelLookupTables() const {
438     const TargetMachine &TM = getTLI()->getTargetMachine();
439     // If non-PIC mode, do not generate a relative lookup table.
440     if (!TM.isPositionIndependent())
441       return false;
442 
443     /// Relative lookup table entries consist of 32-bit offsets.
444     /// Do not generate relative lookup tables for large code models
445     /// in 64-bit architectures where 32-bit offsets might not be enough.
446     if (TM.getCodeModel() == CodeModel::Medium ||
447         TM.getCodeModel() == CodeModel::Large)
448       return false;
449 
450     Triple TargetTriple = TM.getTargetTriple();
451     if (!TargetTriple.isArch64Bit())
452       return false;
453 
454     // TODO: Triggers issues on aarch64 on darwin, so temporarily disable it
455     // there.
456     if (TargetTriple.getArch() == Triple::aarch64 && TargetTriple.isOSDarwin())
457       return false;
458 
459     return true;
460   }
461 
462   bool haveFastSqrt(Type *Ty) {
463     const TargetLoweringBase *TLI = getTLI();
464     EVT VT = TLI->getValueType(DL, Ty);
465     return TLI->isTypeLegal(VT) &&
466            TLI->isOperationLegalOrCustom(ISD::FSQRT, VT);
467   }
468 
469   bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
470     return true;
471   }
472 
473   InstructionCost getFPOpCost(Type *Ty) {
474     // Check whether FADD is available, as a proxy for floating-point in
475     // general.
476     const TargetLoweringBase *TLI = getTLI();
477     EVT VT = TLI->getValueType(DL, Ty);
478     if (TLI->isOperationLegalOrCustomOrPromote(ISD::FADD, VT))
479       return TargetTransformInfo::TCC_Basic;
480     return TargetTransformInfo::TCC_Expensive;
481   }
482 
483   unsigned getInliningThresholdMultiplier() { return 1; }
484   unsigned adjustInliningThreshold(const CallBase *CB) { return 0; }
485 
486   int getInlinerVectorBonusPercent() { return 150; }
487 
488   void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
489                                TTI::UnrollingPreferences &UP,
490                                OptimizationRemarkEmitter *ORE) {
491     // This unrolling functionality is target independent, but to provide some
492     // motivation for its intended use, for x86:
493 
494     // According to the Intel 64 and IA-32 Architectures Optimization Reference
495     // Manual, Intel Core models and later have a loop stream detector (and
496     // associated uop queue) that can benefit from partial unrolling.
497     // The relevant requirements are:
498     //  - The loop must have no more than 4 (8 for Nehalem and later) branches
499     //    taken, and none of them may be calls.
500     //  - The loop can have no more than 18 (28 for Nehalem and later) uops.
501 
502     // According to the Software Optimization Guide for AMD Family 15h
503     // Processors, models 30h-4fh (Steamroller and later) have a loop predictor
504     // and loop buffer which can benefit from partial unrolling.
505     // The relevant requirements are:
506     //  - The loop must have fewer than 16 branches
507     //  - The loop must have less than 40 uops in all executed loop branches
508 
509     // The number of taken branches in a loop is hard to estimate here, and
510     // benchmarking has revealed that it is better not to be conservative when
511     // estimating the branch count. As a result, we'll ignore the branch limits
512     // until someone finds a case where it matters in practice.
513 
514     unsigned MaxOps;
515     const TargetSubtargetInfo *ST = getST();
516     if (PartialUnrollingThreshold.getNumOccurrences() > 0)
517       MaxOps = PartialUnrollingThreshold;
518     else if (ST->getSchedModel().LoopMicroOpBufferSize > 0)
519       MaxOps = ST->getSchedModel().LoopMicroOpBufferSize;
520     else
521       return;
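    // At this point MaxOps is either the PartialUnrollingThreshold override or
    // the target's LoopMicroOpBufferSize; below it becomes UP.PartialThreshold.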
522 
523     // Scan the loop: don't unroll loops with calls.
524     for (BasicBlock *BB : L->blocks()) {
525       for (Instruction &I : *BB) {
526         if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
527           if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
528             if (!thisT()->isLoweredToCall(F))
529               continue;
530           }
531 
532           if (ORE) {
533             ORE->emit([&]() {
534               return OptimizationRemark("TTI", "DontUnroll", L->getStartLoc(),
535                                         L->getHeader())
536                      << "advising against unrolling the loop because it "
537                         "contains a "
538                      << ore::NV("Call", &I);
539             });
540           }
541           return;
542         }
543       }
544     }
545 
546     // Enable runtime and partial unrolling up to the specified size.
547     // Enable using trip count upper bound to unroll loops.
548     UP.Partial = UP.Runtime = UP.UpperBound = true;
549     UP.PartialThreshold = MaxOps;
550 
551     // Avoid unrolling when optimizing for size.
552     UP.OptSizeThreshold = 0;
553     UP.PartialOptSizeThreshold = 0;
554 
555     // Set the number of instructions optimized away when a "back edge"
556     // becomes a "fall through" to the default value of 2.
557     UP.BEInsns = 2;
558   }
559 
560   void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
561                              TTI::PeelingPreferences &PP) {
562     PP.PeelCount = 0;
563     PP.AllowPeeling = true;
564     PP.AllowLoopNestsPeeling = false;
565     PP.PeelProfiledIterations = true;
566   }
567 
568   bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
569                                 AssumptionCache &AC,
570                                 TargetLibraryInfo *LibInfo,
571                                 HardwareLoopInfo &HWLoopInfo) {
572     return BaseT::isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
573   }
574 
575   bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
576                                    AssumptionCache &AC, TargetLibraryInfo *TLI,
577                                    DominatorTree *DT,
578                                    const LoopAccessInfo *LAI) {
579     return BaseT::preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LAI);
580   }
581 
582   bool emitGetActiveLaneMask() {
583     return BaseT::emitGetActiveLaneMask();
584   }
585 
586   Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
587                                                IntrinsicInst &II) {
588     return BaseT::instCombineIntrinsic(IC, II);
589   }
590 
591   Optional<Value *> simplifyDemandedUseBitsIntrinsic(InstCombiner &IC,
592                                                      IntrinsicInst &II,
593                                                      APInt DemandedMask,
594                                                      KnownBits &Known,
595                                                      bool &KnownBitsComputed) {
596     return BaseT::simplifyDemandedUseBitsIntrinsic(IC, II, DemandedMask, Known,
597                                                    KnownBitsComputed);
598   }
599 
600   Optional<Value *> simplifyDemandedVectorEltsIntrinsic(
601       InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
602       APInt &UndefElts2, APInt &UndefElts3,
603       std::function<void(Instruction *, unsigned, APInt, APInt &)>
604           SimplifyAndSetOp) {
605     return BaseT::simplifyDemandedVectorEltsIntrinsic(
606         IC, II, DemandedElts, UndefElts, UndefElts2, UndefElts3,
607         SimplifyAndSetOp);
608   }
609 
610   InstructionCost getInstructionLatency(const Instruction *I) {
611     if (isa<LoadInst>(I))
612       return getST()->getSchedModel().DefaultLoadLatency;
613 
614     return BaseT::getInstructionLatency(I);
615   }
616 
617   virtual Optional<unsigned>
618   getCacheSize(TargetTransformInfo::CacheLevel Level) const {
619     return Optional<unsigned>(
620       getST()->getCacheSize(static_cast<unsigned>(Level)));
621   }
622 
623   virtual Optional<unsigned>
624   getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const {
625     Optional<unsigned> TargetResult =
626         getST()->getCacheAssociativity(static_cast<unsigned>(Level));
627 
628     if (TargetResult)
629       return TargetResult;
630 
631     return BaseT::getCacheAssociativity(Level);
632   }
633 
634   virtual unsigned getCacheLineSize() const {
635     return getST()->getCacheLineSize();
636   }
637 
638   virtual unsigned getPrefetchDistance() const {
639     return getST()->getPrefetchDistance();
640   }
641 
642   virtual unsigned getMinPrefetchStride(unsigned NumMemAccesses,
643                                         unsigned NumStridedMemAccesses,
644                                         unsigned NumPrefetches,
645                                         bool HasCall) const {
646     return getST()->getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses,
647                                          NumPrefetches, HasCall);
648   }
649 
650   virtual unsigned getMaxPrefetchIterationsAhead() const {
651     return getST()->getMaxPrefetchIterationsAhead();
652   }
653 
654   virtual bool enableWritePrefetching() const {
655     return getST()->enableWritePrefetching();
656   }
657 
658   /// @}
659 
660   /// \name Vector TTI Implementations
661   /// @{
662 
663   TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
664     return TypeSize::getFixed(32);
665   }
666 
667   Optional<unsigned> getMaxVScale() const { return None; }
668 
669   /// Estimate the overhead of scalarizing an instruction. Insert and Extract
670   /// are set if the demanded result elements need to be inserted and/or
671   /// extracted from vectors.
672   InstructionCost getScalarizationOverhead(VectorType *InTy,
673                                            const APInt &DemandedElts,
674                                            bool Insert, bool Extract) {
675     /// FIXME: a bitfield is not a reasonable abstraction for talking about
676     /// which elements are needed from a scalable vector
677     auto *Ty = cast<FixedVectorType>(InTy);
678 
679     assert(DemandedElts.getBitWidth() == Ty->getNumElements() &&
680            "Vector size mismatch");
681 
682     InstructionCost Cost = 0;
683 
684     for (int i = 0, e = Ty->getNumElements(); i < e; ++i) {
685       if (!DemandedElts[i])
686         continue;
687       if (Insert)
688         Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, Ty, i);
689       if (Extract)
690         Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, i);
691     }
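    // For example, with a <4 x i32> type, DemandedElts == 0b0101, Insert ==
    // true and Extract == false, only the inserts of elements 0 and 2 are
    // counted.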
692 
693     return Cost;
694   }
695 
696   /// Helper wrapper for the DemandedElts variant of getScalarizationOverhead.
697   InstructionCost getScalarizationOverhead(VectorType *InTy, bool Insert,
698                                            bool Extract) {
699     auto *Ty = cast<FixedVectorType>(InTy);
700 
701     APInt DemandedElts = APInt::getAllOnes(Ty->getNumElements());
702     return thisT()->getScalarizationOverhead(Ty, DemandedElts, Insert, Extract);
703   }
704 
705   /// Estimate the overhead of scalarizing an instruction's unique
706   /// non-constant operands. The (potentially vector) types to use for each
707   /// argument are passed via Tys.
708   InstructionCost getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
709                                                    ArrayRef<Type *> Tys) {
710     assert(Args.size() == Tys.size() && "Expected matching Args and Tys");
711 
712     InstructionCost Cost = 0;
713     SmallPtrSet<const Value*, 4> UniqueOperands;
714     for (int I = 0, E = Args.size(); I != E; I++) {
715       // Disregard things like metadata arguments.
716       const Value *A = Args[I];
717       Type *Ty = Tys[I];
718       if (!Ty->isIntOrIntVectorTy() && !Ty->isFPOrFPVectorTy() &&
719           !Ty->isPtrOrPtrVectorTy())
720         continue;
721 
722       if (!isa<Constant>(A) && UniqueOperands.insert(A).second) {
723         if (auto *VecTy = dyn_cast<VectorType>(Ty))
724           Cost += getScalarizationOverhead(VecTy, false, true);
725       }
726     }
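    // Note that constants and repeated uses of the same operand add no cost
    // above, and only vector-typed operands contribute, via an extract-only
    // scalarization estimate.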
727 
728     return Cost;
729   }
730 
731   /// Estimate the overhead of scalarizing the inputs and outputs of an
732   /// instruction, with return type RetTy and arguments Args of type Tys. If
733   /// Args are unknown (empty), then the cost associated with one argument is
734   /// added as a heuristic.
735   InstructionCost getScalarizationOverhead(VectorType *RetTy,
736                                            ArrayRef<const Value *> Args,
737                                            ArrayRef<Type *> Tys) {
738     InstructionCost Cost = getScalarizationOverhead(RetTy, true, false);
739     if (!Args.empty())
740       Cost += getOperandsScalarizationOverhead(Args, Tys);
741     else
742       // When no information on arguments is provided, we add the cost
743       // associated with one argument as a heuristic.
744       Cost += getScalarizationOverhead(RetTy, false, true);
745 
746     return Cost;
747   }
748 
749   unsigned getMaxInterleaveFactor(unsigned VF) { return 1; }
750 
751   InstructionCost getArithmeticInstrCost(
752       unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
753       TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
754       TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
755       TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
756       TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
757       ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
758       const Instruction *CxtI = nullptr) {
759     // Check if any of the operands are vector operands.
760     const TargetLoweringBase *TLI = getTLI();
761     int ISD = TLI->InstructionOpcodeToISD(Opcode);
762     assert(ISD && "Invalid opcode");
763 
764     // TODO: Handle more cost kinds.
765     if (CostKind != TTI::TCK_RecipThroughput)
766       return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind,
767                                            Opd1Info, Opd2Info,
768                                            Opd1PropInfo, Opd2PropInfo,
769                                            Args, CxtI);
770 
771     std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
772 
773     bool IsFloat = Ty->isFPOrFPVectorTy();
774     // Assume that floating point arithmetic operations cost twice as much as
775     // integer operations.
776     InstructionCost OpCost = (IsFloat ? 2 : 1);
777 
778     if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
779       // The operation is legal. Assume it costs 1.
780       // TODO: Once we have extract/insert subvector cost we need to use them.
781       return LT.first * OpCost;
782     }
783 
784     if (!TLI->isOperationExpand(ISD, LT.second)) {
785       // If the operation is custom lowered, then assume that the code is twice
786       // as expensive.
787       return LT.first * 2 * OpCost;
788     }
789 
790     // An 'Expand' of URem and SRem is special because it may default
791     // to expanding the operation into a sequence of sub-operations
792     // i.e. X % Y -> X-(X/Y)*Y.
793     if (ISD == ISD::UREM || ISD == ISD::SREM) {
794       bool IsSigned = ISD == ISD::SREM;
795       if (TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIVREM : ISD::UDIVREM,
796                                         LT.second) ||
797           TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIV : ISD::UDIV,
798                                         LT.second)) {
799         unsigned DivOpc = IsSigned ? Instruction::SDiv : Instruction::UDiv;
800         InstructionCost DivCost = thisT()->getArithmeticInstrCost(
801             DivOpc, Ty, CostKind, Opd1Info, Opd2Info, Opd1PropInfo,
802             Opd2PropInfo);
803         InstructionCost MulCost =
804             thisT()->getArithmeticInstrCost(Instruction::Mul, Ty, CostKind);
805         InstructionCost SubCost =
806             thisT()->getArithmeticInstrCost(Instruction::Sub, Ty, CostKind);
807         return DivCost + MulCost + SubCost;
808       }
809     }
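    // For example, on a target where vector SDIV is legal but SREM is not,
    // an SREM is costed as DivCost + MulCost + SubCost (X % Y == X - (X/Y)*Y)
    // rather than falling through to the scalarization estimate below.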
810 
811     // We cannot scalarize scalable vectors, so return Invalid.
812     if (isa<ScalableVectorType>(Ty))
813       return InstructionCost::getInvalid();
814 
815     // Else, assume that we need to scalarize this op.
816     // TODO: If one of the types get legalized by splitting, handle this
817     // similarly to what getCastInstrCost() does.
818     if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
819       InstructionCost Cost = thisT()->getArithmeticInstrCost(
820           Opcode, VTy->getScalarType(), CostKind, Opd1Info, Opd2Info,
821           Opd1PropInfo, Opd2PropInfo, Args, CxtI);
822       // Return the cost of multiple scalar invocations plus the cost of
823       // inserting and extracting the values.
824       SmallVector<Type *> Tys(Args.size(), Ty);
825       return getScalarizationOverhead(VTy, Args, Tys) +
826              VTy->getNumElements() * Cost;
827     }
828 
829     // We don't know anything about this scalar instruction.
830     return OpCost;
831   }
832 
833   TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind,
834                                               ArrayRef<int> Mask) const {
835     int Limit = Mask.size() * 2;
836     if (Mask.empty() ||
837         // Extra check required by isSingleSourceMaskImpl function (called by
838         // ShuffleVectorInst::isSingleSourceMask).
839         any_of(Mask, [Limit](int I) { return I >= Limit; }))
840       return Kind;
841     switch (Kind) {
842     case TTI::SK_PermuteSingleSrc:
843       if (ShuffleVectorInst::isReverseMask(Mask))
844         return TTI::SK_Reverse;
845       if (ShuffleVectorInst::isZeroEltSplatMask(Mask))
846         return TTI::SK_Broadcast;
847       break;
848     case TTI::SK_PermuteTwoSrc:
849       if (ShuffleVectorInst::isSelectMask(Mask))
850         return TTI::SK_Select;
851       if (ShuffleVectorInst::isTransposeMask(Mask))
852         return TTI::SK_Transpose;
853       break;
854     case TTI::SK_Select:
855     case TTI::SK_Reverse:
856     case TTI::SK_Broadcast:
857     case TTI::SK_Transpose:
858     case TTI::SK_InsertSubvector:
859     case TTI::SK_ExtractSubvector:
860     case TTI::SK_Splice:
861       break;
862     }
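    // For example, a single-source mask of <3, 2, 1, 0> is reclassified from
    // SK_PermuteSingleSrc to SK_Reverse, and <0, 0, 0, 0> to SK_Broadcast, so
    // getShuffleCost can use the more specific (often cheaper) kind.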
863     return Kind;
864   }
865 
866   InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
867                                  ArrayRef<int> Mask, int Index,
868                                  VectorType *SubTp) {
869 
870     switch (improveShuffleKindFromMask(Kind, Mask)) {
871     case TTI::SK_Broadcast:
872       return getBroadcastShuffleOverhead(cast<FixedVectorType>(Tp));
873     case TTI::SK_Select:
874     case TTI::SK_Splice:
875     case TTI::SK_Reverse:
876     case TTI::SK_Transpose:
877     case TTI::SK_PermuteSingleSrc:
878     case TTI::SK_PermuteTwoSrc:
879       return getPermuteShuffleOverhead(cast<FixedVectorType>(Tp));
880     case TTI::SK_ExtractSubvector:
881       return getExtractSubvectorOverhead(Tp, Index,
882                                          cast<FixedVectorType>(SubTp));
883     case TTI::SK_InsertSubvector:
884       return getInsertSubvectorOverhead(Tp, Index,
885                                         cast<FixedVectorType>(SubTp));
886     }
887     llvm_unreachable("Unknown TTI::ShuffleKind");
888   }
889 
890   InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
891                                    TTI::CastContextHint CCH,
892                                    TTI::TargetCostKind CostKind,
893                                    const Instruction *I = nullptr) {
894     if (BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I) == 0)
895       return 0;
896 
897     const TargetLoweringBase *TLI = getTLI();
898     int ISD = TLI->InstructionOpcodeToISD(Opcode);
899     assert(ISD && "Invalid opcode");
900     std::pair<InstructionCost, MVT> SrcLT =
901         TLI->getTypeLegalizationCost(DL, Src);
902     std::pair<InstructionCost, MVT> DstLT =
903         TLI->getTypeLegalizationCost(DL, Dst);
904 
905     TypeSize SrcSize = SrcLT.second.getSizeInBits();
906     TypeSize DstSize = DstLT.second.getSizeInBits();
907     bool IntOrPtrSrc = Src->isIntegerTy() || Src->isPointerTy();
908     bool IntOrPtrDst = Dst->isIntegerTy() || Dst->isPointerTy();
909 
910     switch (Opcode) {
911     default:
912       break;
913     case Instruction::Trunc:
914       // Check for NOOP conversions.
915       if (TLI->isTruncateFree(SrcLT.second, DstLT.second))
916         return 0;
917       LLVM_FALLTHROUGH;
918     case Instruction::BitCast:
919       // Bitcasts between types that are legalized to the same type are free;
920       // assume an int to/from ptr cast of the same size is also free.
921       if (SrcLT.first == DstLT.first && IntOrPtrSrc == IntOrPtrDst &&
922           SrcSize == DstSize)
923         return 0;
924       break;
925     case Instruction::FPExt:
926       if (I && getTLI()->isExtFree(I))
927         return 0;
928       break;
929     case Instruction::ZExt:
930       if (TLI->isZExtFree(SrcLT.second, DstLT.second))
931         return 0;
932       LLVM_FALLTHROUGH;
933     case Instruction::SExt:
934       if (I && getTLI()->isExtFree(I))
935         return 0;
936 
937       // If this is a zext/sext of a load, return 0 if the corresponding
938       // extending load exists on the target and the result type is legal.
939       if (CCH == TTI::CastContextHint::Normal) {
940         EVT ExtVT = EVT::getEVT(Dst);
941         EVT LoadVT = EVT::getEVT(Src);
942         unsigned LType =
943           ((Opcode == Instruction::ZExt) ? ISD::ZEXTLOAD : ISD::SEXTLOAD);
944         if (DstLT.first == SrcLT.first &&
945             TLI->isLoadExtLegal(LType, ExtVT, LoadVT))
946           return 0;
947       }
948       break;
949     case Instruction::AddrSpaceCast:
950       if (TLI->isFreeAddrSpaceCast(Src->getPointerAddressSpace(),
951                                    Dst->getPointerAddressSpace()))
952         return 0;
953       break;
954     }
955 
956     auto *SrcVTy = dyn_cast<VectorType>(Src);
957     auto *DstVTy = dyn_cast<VectorType>(Dst);
958 
959     // If the cast is marked as legal (or promote) then assume low cost.
960     if (SrcLT.first == DstLT.first &&
961         TLI->isOperationLegalOrPromote(ISD, DstLT.second))
962       return SrcLT.first;
963 
964     // Handle scalar conversions.
965     if (!SrcVTy && !DstVTy) {
966       // Just check the op cost. If the operation is legal then assume it
967       // costs 1.
968       if (!TLI->isOperationExpand(ISD, DstLT.second))
969         return 1;
970 
971       // Assume that illegal scalar instructions are expensive.
972       return 4;
973     }
974 
975     // Check vector-to-vector casts.
976     if (DstVTy && SrcVTy) {
977       // If the cast is between same-sized registers, then the check is simple.
978       if (SrcLT.first == DstLT.first && SrcSize == DstSize) {
979 
980         // Assume that Zext is done using AND.
981         if (Opcode == Instruction::ZExt)
982           return SrcLT.first;
983 
984         // Assume that sext is done using SHL and SRA.
985         if (Opcode == Instruction::SExt)
986           return SrcLT.first * 2;
987 
988         // Just check the op cost. If the operation is legal then assume it
989         // costs 1 and multiply by the type-legalization overhead.
991         if (!TLI->isOperationExpand(ISD, DstLT.second))
992           return SrcLT.first * 1;
993       }
994 
995       // If we are legalizing by splitting, query the concrete TTI for the cost
996       // of casting the original vector twice. We also need to factor in the
997       // cost of the split itself. Count that as 1, to be consistent with
998       // TLI->getTypeLegalizationCost().
999       bool SplitSrc =
1000           TLI->getTypeAction(Src->getContext(), TLI->getValueType(DL, Src)) ==
1001           TargetLowering::TypeSplitVector;
1002       bool SplitDst =
1003           TLI->getTypeAction(Dst->getContext(), TLI->getValueType(DL, Dst)) ==
1004           TargetLowering::TypeSplitVector;
1005       if ((SplitSrc || SplitDst) && SrcVTy->getElementCount().isVector() &&
1006           DstVTy->getElementCount().isVector()) {
1007         Type *SplitDstTy = VectorType::getHalfElementsVectorType(DstVTy);
1008         Type *SplitSrcTy = VectorType::getHalfElementsVectorType(SrcVTy);
1009         T *TTI = static_cast<T *>(this);
1010         // If both types need to be split then the split is free.
1011         InstructionCost SplitCost =
1012             (!SplitSrc || !SplitDst) ? TTI->getVectorSplitCost() : 0;
1013         return SplitCost +
1014                (2 * TTI->getCastInstrCost(Opcode, SplitDstTy, SplitSrcTy, CCH,
1015                                           CostKind, I));
1016       }
1017 
1018       // Scalarization cost is Invalid, can't assume any num elements.
1019       if (isa<ScalableVectorType>(DstVTy))
1020         return InstructionCost::getInvalid();
1021 
1022       // In other cases where the source or destination are illegal, assume
1023       // the operation will get scalarized.
1024       unsigned Num = cast<FixedVectorType>(DstVTy)->getNumElements();
1025       InstructionCost Cost = thisT()->getCastInstrCost(
1026           Opcode, Dst->getScalarType(), Src->getScalarType(), CCH, CostKind, I);
1027 
1028       // Return the cost of multiple scalar invocations plus the cost of
1029       // inserting and extracting the values.
1030       return getScalarizationOverhead(DstVTy, true, true) + Num * Cost;
1031     }
1032 
1033     // We already handled vector-to-vector and scalar-to-scalar conversions.
1034     // This is where we handle bitcast between vectors and scalars. We need to
1035     // assume that the conversion is scalarized in one way or another.
1037     if (Opcode == Instruction::BitCast) {
1038       // Illegal bitcasts are done by storing and loading from a stack slot.
1039       return (SrcVTy ? getScalarizationOverhead(SrcVTy, false, true) : 0) +
1040              (DstVTy ? getScalarizationOverhead(DstVTy, true, false) : 0);
1041     }
1042 
1043     llvm_unreachable("Unhandled cast");
1044   }
1045 
1046   InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst,
1047                                            VectorType *VecTy, unsigned Index) {
1048     return thisT()->getVectorInstrCost(Instruction::ExtractElement, VecTy,
1049                                        Index) +
1050            thisT()->getCastInstrCost(Opcode, Dst, VecTy->getElementType(),
1051                                      TTI::CastContextHint::None,
1052                                      TTI::TCK_RecipThroughput);
1053   }
1054 
1055   InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
1056                                  const Instruction *I = nullptr) {
1057     return BaseT::getCFInstrCost(Opcode, CostKind, I);
1058   }
1059 
1060   InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
1061                                      CmpInst::Predicate VecPred,
1062                                      TTI::TargetCostKind CostKind,
1063                                      const Instruction *I = nullptr) {
1064     const TargetLoweringBase *TLI = getTLI();
1065     int ISD = TLI->InstructionOpcodeToISD(Opcode);
1066     assert(ISD && "Invalid opcode");
1067 
1068     // TODO: Handle other cost kinds.
1069     if (CostKind != TTI::TCK_RecipThroughput)
1070       return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1071                                        I);
1072 
1073     // Selects on vectors are actually vector selects.
1074     if (ISD == ISD::SELECT) {
1075       assert(CondTy && "CondTy must exist");
1076       if (CondTy->isVectorTy())
1077         ISD = ISD::VSELECT;
1078     }
1079     std::pair<InstructionCost, MVT> LT =
1080         TLI->getTypeLegalizationCost(DL, ValTy);
1081 
1082     if (!(ValTy->isVectorTy() && !LT.second.isVector()) &&
1083         !TLI->isOperationExpand(ISD, LT.second)) {
1084       // The operation is legal. Assume it costs 1. Multiply
1085       // by the type-legalization overhead.
1086       return LT.first * 1;
1087     }
1088 
1089     // Otherwise, assume that the cmp/select is scalarized.
1090     // TODO: If one of the types get legalized by splitting, handle this
1091     // similarly to what getCastInstrCost() does.
1092     if (auto *ValVTy = dyn_cast<VectorType>(ValTy)) {
1093       unsigned Num = cast<FixedVectorType>(ValVTy)->getNumElements();
1094       if (CondTy)
1095         CondTy = CondTy->getScalarType();
1096       InstructionCost Cost = thisT()->getCmpSelInstrCost(
1097           Opcode, ValVTy->getScalarType(), CondTy, VecPred, CostKind, I);
1098 
1099       // Return the cost of multiple scalar invocations plus the cost of
1100       // inserting and extracting the values.
1101       return getScalarizationOverhead(ValVTy, true, false) + Num * Cost;
1102     }
1103 
1104     // Unknown scalar opcode.
1105     return 1;
1106   }
1107 
1108   InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
1109                                      unsigned Index) {
1110     std::pair<InstructionCost, MVT> LT =
1111         getTLI()->getTypeLegalizationCost(DL, Val->getScalarType());
1112 
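    // Only the legalization cost of the scalar type is charged here; Opcode
    // and Index are ignored. Targets typically override this with real
    // per-element insert/extract costs.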
1113     return LT.first;
1114   }
1115 
1116   InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src,
1117                                   MaybeAlign Alignment, unsigned AddressSpace,
1118                                   TTI::TargetCostKind CostKind,
1119                                   const Instruction *I = nullptr) {
1120     assert(!Src->isVoidTy() && "Invalid type");
1121     // Assume types, such as structs, are expensive.
1122     if (getTLI()->getValueType(DL, Src, true) == MVT::Other)
1123       return 4;
1124     std::pair<InstructionCost, MVT> LT =
1125         getTLI()->getTypeLegalizationCost(DL, Src);
1126 
1127     // Assuming that all loads of legal types cost 1.
1128     InstructionCost Cost = LT.first;
1129     if (CostKind != TTI::TCK_RecipThroughput)
1130       return Cost;
1131 
1132     if (Src->isVectorTy() &&
1133         // In practice it's not currently possible to have a change in lane
1134         // length for extending loads or truncating stores so both types should
1135         // have the same scalable property.
1136         TypeSize::isKnownLT(Src->getPrimitiveSizeInBits(),
1137                             LT.second.getSizeInBits())) {
1138       // This is a vector load that legalizes to a larger type than the vector
1139       // itself. Unless the corresponding extending load or truncating store is
1140       // legal, this will scalarize.
1141       TargetLowering::LegalizeAction LA = TargetLowering::Expand;
1142       EVT MemVT = getTLI()->getValueType(DL, Src);
1143       if (Opcode == Instruction::Store)
1144         LA = getTLI()->getTruncStoreAction(LT.second, MemVT);
1145       else
1146         LA = getTLI()->getLoadExtAction(ISD::EXTLOAD, LT.second, MemVT);
1147 
1148       if (LA != TargetLowering::Legal && LA != TargetLowering::Custom) {
1149         // This is a vector load/store for some illegal type that is scalarized.
1150         // We must account for the cost of building or decomposing the vector.
1151         Cost += getScalarizationOverhead(cast<VectorType>(Src),
1152                                          Opcode != Instruction::Store,
1153                                          Opcode == Instruction::Store);
1154       }
1155     }
1156 
1157     return Cost;
1158   }
1159 
1160   InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy,
1161                                         Align Alignment, unsigned AddressSpace,
1162                                         TTI::TargetCostKind CostKind) {
1163     return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, true, false,
1164                                        CostKind);
1165   }
1166 
1167   InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
1168                                          const Value *Ptr, bool VariableMask,
1169                                          Align Alignment,
1170                                          TTI::TargetCostKind CostKind,
1171                                          const Instruction *I = nullptr) {
1172     return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, VariableMask,
1173                                        true, CostKind);
1174   }
1175 
1176   InstructionCost getInterleavedMemoryOpCost(
1177       unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1178       Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1179       bool UseMaskForCond = false, bool UseMaskForGaps = false) {
1180     auto *VT = cast<FixedVectorType>(VecTy);
1181 
1182     unsigned NumElts = VT->getNumElements();
1183     assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");
1184 
1185     unsigned NumSubElts = NumElts / Factor;
1186     auto *SubVT = FixedVectorType::get(VT->getElementType(), NumSubElts);
1187 
1188     // First, the cost of the load/store operation.
1189     InstructionCost Cost;
1190     if (UseMaskForCond || UseMaskForGaps)
1191       Cost = thisT()->getMaskedMemoryOpCost(Opcode, VecTy, Alignment,
1192                                             AddressSpace, CostKind);
1193     else
1194       Cost = thisT()->getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace,
1195                                       CostKind);
1196 
1197     // Legalize the vector type, and get the legalized and unlegalized type
1198     // sizes.
1199     MVT VecTyLT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
1200     unsigned VecTySize = thisT()->getDataLayout().getTypeStoreSize(VecTy);
1201     unsigned VecTyLTSize = VecTyLT.getStoreSize();
1202 
1203     // Scale the cost of the memory operation by the fraction of legalized
1204     // instructions that will actually be used. We shouldn't account for the
1205     // cost of dead instructions since they will be removed.
1206     //
1207     // E.g., An interleaved load of factor 8:
1208     //       %vec = load <16 x i64>, <16 x i64>* %ptr
1209     //       %v0 = shufflevector %vec, undef, <0, 8>
1210     //
1211     // If <16 x i64> is legalized to 8 v2i64 loads, only 2 of the loads will be
1212     // used (those corresponding to elements [0:1] and [8:9] of the unlegalized
1213     // type). The other loads are unused.
1214     //
1215     // TODO: Note that legalization can turn masked loads/stores into unmasked
1216     // (legalized) loads/stores. This can be reflected in the cost.
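         // In the example above only 2 of the 8 legal v2i64 loads are used, so the
         // memory cost ends up scaled to roughly 2/8 of the full-vector cost.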
1217     if (VecTySize > VecTyLTSize) {
1218       // The number of loads of a legal type it will take to represent a load
1219       // of the unlegalized vector type.
1220       unsigned NumLegalInsts = divideCeil(VecTySize, VecTyLTSize);
1221 
1222       // The number of elements of the unlegalized type that correspond to a
1223       // single legal instruction.
1224       unsigned NumEltsPerLegalInst = divideCeil(NumElts, NumLegalInsts);
1225 
1226       // Determine which legal instructions will be used.
1227       BitVector UsedInsts(NumLegalInsts, false);
1228       for (unsigned Index : Indices)
1229         for (unsigned Elt = 0; Elt < NumSubElts; ++Elt)
1230           UsedInsts.set((Index + Elt * Factor) / NumEltsPerLegalInst);
1231 
1232       // Scale the cost of the load by the fraction of legal instructions that
1233       // will be used, scaling before the division so it does not truncate to 0.
1234       Cost = divideCeil(UsedInsts.count() * *Cost.getValue(), NumLegalInsts);
1235     }
1236 
1237     // Then add the cost of the interleave operation.
1238     assert(Indices.size() <= Factor &&
1239            "Interleaved memory op has too many members");
1240 
1241     APInt DemandedLoadStoreElts = APInt::getZero(NumElts);
1242     for (unsigned Index : Indices) {
1243       assert(Index < Factor && "Invalid index for interleaved memory op");
1244       for (unsigned Elm = 0; Elm < NumSubElts; Elm++)
1245         DemandedLoadStoreElts.setBit(Index + Elm * Factor);
1246     }
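         // E.g. for the factor-2 load example below with Indices = {0}, the
         // demanded elements of the wide <8 x i32> vector are 0, 2, 4 and 6.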
1247 
1248     if (Opcode == Instruction::Load) {
1249       // The interleave cost is similar to extracting the sub vectors'
1250       // elements from the wide vector and inserting them into the sub vectors.
1251       //
1252       // E.g. An interleaved load of factor 2 (with one member of index 0):
1253       //      %vec = load <8 x i32>, <8 x i32>* %ptr
1254       //      %v0 = shuffle %vec, undef, <0, 2, 4, 6>         ; Index 0
1255       // The cost is estimated as extracting elements 0, 2, 4, 6 from the
1256       // <8 x i32> vector and inserting them into a <4 x i32> vector.
1257       InstructionCost InsSubCost =
1258           getScalarizationOverhead(SubVT, /*Insert*/ true, /*Extract*/ false);
1259       Cost += Indices.size() * InsSubCost;
1260       Cost +=
1261           thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts,
1262                                             /*Insert*/ false, /*Extract*/ true);
1263     } else {
1264       // The interleave cost is that of extracting elements from the sub
1265       // vectors and inserting them into the wide vector.
1266       //
1267       // E.g. An interleaved store of factor 3 with 2 members at indices 0,1:
1268       // (using VF=4):
1269       //    %v0_v1 = shuffle %v0, %v1, <0,4,undef,1,5,undef,2,6,undef,3,7,undef>
1270       //    %gaps.mask = <true, true, false, true, true, false,
1271       //                  true, true, false, true, true, false>
1272       //    call llvm.masked.store <12 x i32> %v0_v1, <12 x i32>* %ptr,
1273       //                           i32 Align, <12 x i1> %gaps.mask
1274       // The cost is estimated as extracting all elements (of the actual
1275       // members, excluding gaps) from both <4 x i32> vectors and inserting
1276       // them into the <12 x i32> vector.
1277       InstructionCost ExtSubCost =
1278           getScalarizationOverhead(SubVT, /*Insert*/ false, /*Extract*/ true);
1279       Cost += ExtSubCost * Indices.size();
1280       Cost += thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts,
1281                                                 /*Insert*/ true,
1282                                                 /*Extract*/ false);
1283     }
1284 
1285     if (!UseMaskForCond)
1286       return Cost;
1287 
1288     Type *I8Type = Type::getInt8Ty(VT->getContext());
1289     auto *MaskVT = FixedVectorType::get(I8Type, NumElts);
1290     SubVT = FixedVectorType::get(I8Type, NumSubElts);
1291 
1292     // The Mask shuffling cost is that of extracting all the elements of the
1293     // Mask and inserting each of them Factor times into the wide vector:
1294     //
1295     // E.g. an interleaved group with factor 3:
1296     //    %mask = icmp ult <8 x i32> %vec1, %vec2
1297     //    %interleaved.mask = shufflevector <8 x i1> %mask, <8 x i1> undef,
1298     //        <24 x i32> <0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7>
1299     // The cost is estimated as extracting all mask elements from the <8 x i1>
1300     // mask vector and inserting them Factor times into the <24 x i1> shuffled
1301     // mask vector.
1302     Cost += getScalarizationOverhead(SubVT, /*Insert*/ false, /*Extract*/ true);
1303     Cost +=
1304         getScalarizationOverhead(MaskVT, /*Insert*/ true, /*Extract*/ false);
1305 
1306     // The Gaps mask is invariant and created outside the loop, therefore the
1307     // cost of creating it is not accounted for here. However if we have both
1308     // a MaskForGaps and some other mask that guards the execution of the
1309     // memory access, we need to account for the cost of And-ing the two masks
1310     // inside the loop.
1311     if (UseMaskForGaps)
1312       Cost += thisT()->getArithmeticInstrCost(BinaryOperator::And, MaskVT,
1313                                               CostKind);
1314 
1315     return Cost;
1316   }
1317 
1318   /// Get intrinsic cost based on arguments.
1319   InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1320                                         TTI::TargetCostKind CostKind) {
1321     // Check for generically free intrinsics.
1322     if (BaseT::getIntrinsicInstrCost(ICA, CostKind) == 0)
1323       return 0;
1324 
1325     // Assume that target intrinsics are cheap.
1326     Intrinsic::ID IID = ICA.getID();
1327     if (Function::isTargetIntrinsic(IID))
1328       return TargetTransformInfo::TCC_Basic;
1329 
1330     if (ICA.isTypeBasedOnly())
1331       return getTypeBasedIntrinsicInstrCost(ICA, CostKind);
1332 
1333     Type *RetTy = ICA.getReturnType();
1334 
1335     ElementCount RetVF =
1336         (RetTy->isVectorTy() ? cast<VectorType>(RetTy)->getElementCount()
1337                              : ElementCount::getFixed(1));
1338     const IntrinsicInst *I = ICA.getInst();
1339     const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
1340     FastMathFlags FMF = ICA.getFlags();
1341     switch (IID) {
1342     default:
1343       break;
1344 
1345     case Intrinsic::cttz:
1346       // FIXME: If necessary, this should go in target-specific overrides.
1347       if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCttz())
1348         return TargetTransformInfo::TCC_Basic;
1349       break;
1350 
1351     case Intrinsic::ctlz:
1352       // FIXME: If necessary, this should go in target-specific overrides.
1353       if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCtlz())
1354         return TargetTransformInfo::TCC_Basic;
1355       break;
1356 
1357     case Intrinsic::memcpy:
1358       return thisT()->getMemcpyCost(ICA.getInst());
1359 
1360     case Intrinsic::masked_scatter: {
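           // Operand layout: llvm.masked.scatter(data, ptrs, i32 align, mask).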
1361       const Value *Mask = Args[3];
1362       bool VarMask = !isa<Constant>(Mask);
1363       Align Alignment = cast<ConstantInt>(Args[2])->getAlignValue();
1364       return thisT()->getGatherScatterOpCost(Instruction::Store,
1365                                              ICA.getArgTypes()[0], Args[1],
1366                                              VarMask, Alignment, CostKind, I);
1367     }
1368     case Intrinsic::masked_gather: {
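           // Operand layout: llvm.masked.gather(ptrs, i32 align, mask, passthru).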
1369       const Value *Mask = Args[2];
1370       bool VarMask = !isa<Constant>(Mask);
1371       Align Alignment = cast<ConstantInt>(Args[1])->getAlignValue();
1372       return thisT()->getGatherScatterOpCost(Instruction::Load, RetTy, Args[0],
1373                                              VarMask, Alignment, CostKind, I);
1374     }
1375     case Intrinsic::experimental_stepvector: {
1376       if (isa<ScalableVectorType>(RetTy))
1377         return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1378       // The cost of materialising a constant integer vector.
1379       return TargetTransformInfo::TCC_Basic;
1380     }
1381     case Intrinsic::experimental_vector_extract: {
1382       // FIXME: Handle case where a scalable vector is extracted from a scalable
1383       // vector
1384       if (isa<ScalableVectorType>(RetTy))
1385         return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1386       unsigned Index = cast<ConstantInt>(Args[1])->getZExtValue();
1387       return thisT()->getShuffleCost(TTI::SK_ExtractSubvector,
1388                                      cast<VectorType>(Args[0]->getType()), None,
1389                                      Index, cast<VectorType>(RetTy));
1390     }
1391     case Intrinsic::experimental_vector_insert: {
1392       // FIXME: Handle case where a scalable vector is inserted into a scalable
1393       // vector
1394       if (isa<ScalableVectorType>(Args[1]->getType()))
1395         return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1396       unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue();
1397       return thisT()->getShuffleCost(
1398           TTI::SK_InsertSubvector, cast<VectorType>(Args[0]->getType()), None,
1399           Index, cast<VectorType>(Args[1]->getType()));
1400     }
1401     case Intrinsic::experimental_vector_reverse: {
1402       return thisT()->getShuffleCost(TTI::SK_Reverse,
1403                                      cast<VectorType>(Args[0]->getType()), None,
1404                                      0, cast<VectorType>(RetTy));
1405     }
1406     case Intrinsic::experimental_vector_splice: {
1407       unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue();
1408       return thisT()->getShuffleCost(TTI::SK_Splice,
1409                                      cast<VectorType>(Args[0]->getType()), None,
1410                                      Index, cast<VectorType>(RetTy));
1411     }
1412     case Intrinsic::vector_reduce_add:
1413     case Intrinsic::vector_reduce_mul:
1414     case Intrinsic::vector_reduce_and:
1415     case Intrinsic::vector_reduce_or:
1416     case Intrinsic::vector_reduce_xor:
1417     case Intrinsic::vector_reduce_smax:
1418     case Intrinsic::vector_reduce_smin:
1419     case Intrinsic::vector_reduce_fmax:
1420     case Intrinsic::vector_reduce_fmin:
1421     case Intrinsic::vector_reduce_umax:
1422     case Intrinsic::vector_reduce_umin: {
1423       IntrinsicCostAttributes Attrs(IID, RetTy, Args[0]->getType(), FMF, I, 1);
1424       return getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1425     }
1426     case Intrinsic::vector_reduce_fadd:
1427     case Intrinsic::vector_reduce_fmul: {
1428       IntrinsicCostAttributes Attrs(
1429           IID, RetTy, {Args[0]->getType(), Args[1]->getType()}, FMF, I, 1);
1430       return getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1431     }
1432     case Intrinsic::fshl:
1433     case Intrinsic::fshr: {
1434       if (isa<ScalableVectorType>(RetTy))
1435         return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1436       const Value *X = Args[0];
1437       const Value *Y = Args[1];
1438       const Value *Z = Args[2];
1439       TTI::OperandValueProperties OpPropsX, OpPropsY, OpPropsZ, OpPropsBW;
1440       TTI::OperandValueKind OpKindX = TTI::getOperandInfo(X, OpPropsX);
1441       TTI::OperandValueKind OpKindY = TTI::getOperandInfo(Y, OpPropsY);
1442       TTI::OperandValueKind OpKindZ = TTI::getOperandInfo(Z, OpPropsZ);
1443       TTI::OperandValueKind OpKindBW = TTI::OK_UniformConstantValue;
1444       OpPropsBW = isPowerOf2_32(RetTy->getScalarSizeInBits()) ? TTI::OP_PowerOf2
1445                                                               : TTI::OP_None;
1446       // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
1447       // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
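           // The expansion below is costed as an Or, a Sub, a Shl and a LShr, plus
           // a URem when the shift amount is not constant and an ICmp/Select pair
           // for non-rotate funnel shifts to handle a shift amount of zero.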
1448       InstructionCost Cost = 0;
1449       Cost +=
1450           thisT()->getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind);
1451       Cost +=
1452           thisT()->getArithmeticInstrCost(BinaryOperator::Sub, RetTy, CostKind);
1453       Cost += thisT()->getArithmeticInstrCost(
1454           BinaryOperator::Shl, RetTy, CostKind, OpKindX, OpKindZ, OpPropsX);
1455       Cost += thisT()->getArithmeticInstrCost(
1456           BinaryOperator::LShr, RetTy, CostKind, OpKindY, OpKindZ, OpPropsY);
1457       // A non-constant shift amount requires a modulo.
1458       if (OpKindZ != TTI::OK_UniformConstantValue &&
1459           OpKindZ != TTI::OK_NonUniformConstantValue)
1460         Cost += thisT()->getArithmeticInstrCost(BinaryOperator::URem, RetTy,
1461                                                 CostKind, OpKindZ, OpKindBW,
1462                                                 OpPropsZ, OpPropsBW);
1463       // For non-rotates (X != Y) we must add shift-by-zero handling costs.
1464       if (X != Y) {
1465         Type *CondTy = RetTy->getWithNewBitWidth(1);
1466         Cost +=
1467             thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
1468                                         CmpInst::ICMP_EQ, CostKind);
1469         Cost +=
1470             thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1471                                         CmpInst::ICMP_EQ, CostKind);
1472       }
1473       return Cost;
1474     }
1475     }
1476 
1477     // Assume that we need to scalarize this intrinsic.
1478     // Compute the scalarization overhead based on Args for a vector
1479     // intrinsic.
1480     InstructionCost ScalarizationCost = InstructionCost::getInvalid();
1481     if (RetVF.isVector() && !RetVF.isScalable()) {
1482       ScalarizationCost = 0;
1483       if (!RetTy->isVoidTy())
1484         ScalarizationCost +=
1485             getScalarizationOverhead(cast<VectorType>(RetTy), true, false);
1486       ScalarizationCost +=
1487           getOperandsScalarizationOverhead(Args, ICA.getArgTypes());
1488     }
1489 
1490     IntrinsicCostAttributes Attrs(IID, RetTy, ICA.getArgTypes(), FMF, I,
1491                                   ScalarizationCost);
1492     return thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1493   }
1494 
1495   /// Get intrinsic cost based on argument types.
1496   /// If no scalarization cost is provided in \p ICA, the cost of scalarizing
1497   /// the arguments and the return value will be computed based on the
1498   /// argument and return types.
1499   InstructionCost
1500   getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1501                                  TTI::TargetCostKind CostKind) {
1502     Intrinsic::ID IID = ICA.getID();
1503     Type *RetTy = ICA.getReturnType();
1504     const SmallVectorImpl<Type *> &Tys = ICA.getArgTypes();
1505     FastMathFlags FMF = ICA.getFlags();
1506     InstructionCost ScalarizationCostPassed = ICA.getScalarizationCost();
1507     bool SkipScalarizationCost = ICA.skipScalarizationCost();
1508 
1509     VectorType *VecOpTy = nullptr;
1510     if (!Tys.empty()) {
1511       // The vector reduction operand is operand 0 except for fadd/fmul.
1512       // Their operand 0 is a scalar start value, so the vector op is operand 1.
1513       unsigned VecTyIndex = 0;
1514       if (IID == Intrinsic::vector_reduce_fadd ||
1515           IID == Intrinsic::vector_reduce_fmul)
1516         VecTyIndex = 1;
1517       assert(Tys.size() > VecTyIndex && "Unexpected IntrinsicCostAttributes");
1518       VecOpTy = dyn_cast<VectorType>(Tys[VecTyIndex]);
1519     }
1520 
1521     // Library call cost - other than for code size, make it expensive.
1522     unsigned SingleCallCost = CostKind == TTI::TCK_CodeSize ? 1 : 10;
1523     SmallVector<unsigned, 2> ISDs;
1524     switch (IID) {
1525     default: {
1526       // Scalable vectors cannot be scalarized, so return Invalid.
1527       if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) {
1528             return isa<ScalableVectorType>(Ty);
1529           }))
1530         return InstructionCost::getInvalid();
1531 
1532       // Assume that we need to scalarize this intrinsic.
1533       InstructionCost ScalarizationCost =
1534           SkipScalarizationCost ? ScalarizationCostPassed : 0;
1535       unsigned ScalarCalls = 1;
1536       Type *ScalarRetTy = RetTy;
1537       if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
1538         if (!SkipScalarizationCost)
1539           ScalarizationCost = getScalarizationOverhead(RetVTy, true, false);
1540         ScalarCalls = std::max(ScalarCalls,
1541                                cast<FixedVectorType>(RetVTy)->getNumElements());
1542         ScalarRetTy = RetTy->getScalarType();
1543       }
1544       SmallVector<Type *, 4> ScalarTys;
1545       for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
1546         Type *Ty = Tys[i];
1547         if (auto *VTy = dyn_cast<VectorType>(Ty)) {
1548           if (!SkipScalarizationCost)
1549             ScalarizationCost += getScalarizationOverhead(VTy, false, true);
1550           ScalarCalls = std::max(ScalarCalls,
1551                                  cast<FixedVectorType>(VTy)->getNumElements());
1552           Ty = Ty->getScalarType();
1553         }
1554         ScalarTys.push_back(Ty);
1555       }
1556       if (ScalarCalls == 1)
1557         return 1; // Return cost of a scalar intrinsic. Assume it to be cheap.
1558 
1559       IntrinsicCostAttributes ScalarAttrs(IID, ScalarRetTy, ScalarTys, FMF);
1560       InstructionCost ScalarCost =
1561           thisT()->getIntrinsicInstrCost(ScalarAttrs, CostKind);
1562 
1563       return ScalarCalls * ScalarCost + ScalarizationCost;
1564     }
1565     // Look for intrinsics that can be lowered directly or turned into a scalar
1566     // intrinsic call.
1567     case Intrinsic::sqrt:
1568       ISDs.push_back(ISD::FSQRT);
1569       break;
1570     case Intrinsic::sin:
1571       ISDs.push_back(ISD::FSIN);
1572       break;
1573     case Intrinsic::cos:
1574       ISDs.push_back(ISD::FCOS);
1575       break;
1576     case Intrinsic::exp:
1577       ISDs.push_back(ISD::FEXP);
1578       break;
1579     case Intrinsic::exp2:
1580       ISDs.push_back(ISD::FEXP2);
1581       break;
1582     case Intrinsic::log:
1583       ISDs.push_back(ISD::FLOG);
1584       break;
1585     case Intrinsic::log10:
1586       ISDs.push_back(ISD::FLOG10);
1587       break;
1588     case Intrinsic::log2:
1589       ISDs.push_back(ISD::FLOG2);
1590       break;
1591     case Intrinsic::fabs:
1592       ISDs.push_back(ISD::FABS);
1593       break;
1594     case Intrinsic::canonicalize:
1595       ISDs.push_back(ISD::FCANONICALIZE);
1596       break;
1597     case Intrinsic::minnum:
1598       ISDs.push_back(ISD::FMINNUM);
1599       break;
1600     case Intrinsic::maxnum:
1601       ISDs.push_back(ISD::FMAXNUM);
1602       break;
1603     case Intrinsic::minimum:
1604       ISDs.push_back(ISD::FMINIMUM);
1605       break;
1606     case Intrinsic::maximum:
1607       ISDs.push_back(ISD::FMAXIMUM);
1608       break;
1609     case Intrinsic::copysign:
1610       ISDs.push_back(ISD::FCOPYSIGN);
1611       break;
1612     case Intrinsic::floor:
1613       ISDs.push_back(ISD::FFLOOR);
1614       break;
1615     case Intrinsic::ceil:
1616       ISDs.push_back(ISD::FCEIL);
1617       break;
1618     case Intrinsic::trunc:
1619       ISDs.push_back(ISD::FTRUNC);
1620       break;
1621     case Intrinsic::nearbyint:
1622       ISDs.push_back(ISD::FNEARBYINT);
1623       break;
1624     case Intrinsic::rint:
1625       ISDs.push_back(ISD::FRINT);
1626       break;
1627     case Intrinsic::round:
1628       ISDs.push_back(ISD::FROUND);
1629       break;
1630     case Intrinsic::roundeven:
1631       ISDs.push_back(ISD::FROUNDEVEN);
1632       break;
1633     case Intrinsic::pow:
1634       ISDs.push_back(ISD::FPOW);
1635       break;
1636     case Intrinsic::fma:
1637       ISDs.push_back(ISD::FMA);
1638       break;
1639     case Intrinsic::fmuladd:
1640       ISDs.push_back(ISD::FMA);
1641       break;
1642     case Intrinsic::experimental_constrained_fmuladd:
1643       ISDs.push_back(ISD::STRICT_FMA);
1644       break;
1645     // FIXME: We should return 0 whenever getIntrinsicCost == TCC_Free.
1646     case Intrinsic::lifetime_start:
1647     case Intrinsic::lifetime_end:
1648     case Intrinsic::sideeffect:
1649     case Intrinsic::pseudoprobe:
1650     case Intrinsic::arithmetic_fence:
1651       return 0;
1652     case Intrinsic::masked_store: {
1653       Type *Ty = Tys[0];
1654       Align TyAlign = thisT()->DL.getABITypeAlign(Ty);
1655       return thisT()->getMaskedMemoryOpCost(Instruction::Store, Ty, TyAlign, 0,
1656                                             CostKind);
1657     }
1658     case Intrinsic::masked_load: {
1659       Type *Ty = RetTy;
1660       Align TyAlign = thisT()->DL.getABITypeAlign(Ty);
1661       return thisT()->getMaskedMemoryOpCost(Instruction::Load, Ty, TyAlign, 0,
1662                                             CostKind);
1663     }
1664     case Intrinsic::vector_reduce_add:
1665       return thisT()->getArithmeticReductionCost(Instruction::Add, VecOpTy,
1666                                                  None, CostKind);
1667     case Intrinsic::vector_reduce_mul:
1668       return thisT()->getArithmeticReductionCost(Instruction::Mul, VecOpTy,
1669                                                  None, CostKind);
1670     case Intrinsic::vector_reduce_and:
1671       return thisT()->getArithmeticReductionCost(Instruction::And, VecOpTy,
1672                                                  None, CostKind);
1673     case Intrinsic::vector_reduce_or:
1674       return thisT()->getArithmeticReductionCost(Instruction::Or, VecOpTy, None,
1675                                                  CostKind);
1676     case Intrinsic::vector_reduce_xor:
1677       return thisT()->getArithmeticReductionCost(Instruction::Xor, VecOpTy,
1678                                                  None, CostKind);
1679     case Intrinsic::vector_reduce_fadd:
1680       return thisT()->getArithmeticReductionCost(Instruction::FAdd, VecOpTy,
1681                                                  FMF, CostKind);
1682     case Intrinsic::vector_reduce_fmul:
1683       return thisT()->getArithmeticReductionCost(Instruction::FMul, VecOpTy,
1684                                                  FMF, CostKind);
1685     case Intrinsic::vector_reduce_smax:
1686     case Intrinsic::vector_reduce_smin:
1687     case Intrinsic::vector_reduce_fmax:
1688     case Intrinsic::vector_reduce_fmin:
1689       return thisT()->getMinMaxReductionCost(
1690           VecOpTy, cast<VectorType>(CmpInst::makeCmpResultType(VecOpTy)),
1691           /*IsUnsigned=*/false, CostKind);
1692     case Intrinsic::vector_reduce_umax:
1693     case Intrinsic::vector_reduce_umin:
1694       return thisT()->getMinMaxReductionCost(
1695           VecOpTy, cast<VectorType>(CmpInst::makeCmpResultType(VecOpTy)),
1696           /*IsUnsigned=*/true, CostKind);
1697     case Intrinsic::abs: {
1698       // abs(X) = select(icmp(X,0),X,sub(0,X))
1699       Type *CondTy = RetTy->getWithNewBitWidth(1);
1700       CmpInst::Predicate Pred = CmpInst::ICMP_SGT;
1701       InstructionCost Cost = 0;
1702       Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
1703                                           Pred, CostKind);
1704       Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1705                                           Pred, CostKind);
1706       // TODO: Should we add an OperandValueProperties::OP_Zero property?
1707       Cost += thisT()->getArithmeticInstrCost(
1708           BinaryOperator::Sub, RetTy, CostKind, TTI::OK_UniformConstantValue);
1709       return Cost;
1710     }
1711     case Intrinsic::smax:
1712     case Intrinsic::smin:
1713     case Intrinsic::umax:
1714     case Intrinsic::umin: {
1715       // minmax(X,Y) = select(icmp(X,Y),X,Y)
1716       Type *CondTy = RetTy->getWithNewBitWidth(1);
1717       bool IsUnsigned = IID == Intrinsic::umax || IID == Intrinsic::umin;
1718       CmpInst::Predicate Pred =
1719           IsUnsigned ? CmpInst::ICMP_UGT : CmpInst::ICMP_SGT;
1720       InstructionCost Cost = 0;
1721       Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
1722                                           Pred, CostKind);
1723       Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1724                                           Pred, CostKind);
1725       return Cost;
1726     }
1727     case Intrinsic::sadd_sat:
1728     case Intrinsic::ssub_sat: {
1729       Type *CondTy = RetTy->getWithNewBitWidth(1);
1730 
1731       Type *OpTy = StructType::create({RetTy, CondTy});
1732       Intrinsic::ID OverflowOp = IID == Intrinsic::sadd_sat
1733                                      ? Intrinsic::sadd_with_overflow
1734                                      : Intrinsic::ssub_with_overflow;
1735       CmpInst::Predicate Pred = CmpInst::ICMP_SGT;
1736 
1737       // SatMax -> Overflow && SumDiff < 0
1738       // SatMin -> Overflow && SumDiff >= 0
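           // Costed as the matching overflow intrinsic plus an icmp and two selects
           // that pick between the result and the saturation constants.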
1739       InstructionCost Cost = 0;
1740       IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
1741                                     nullptr, ScalarizationCostPassed);
1742       Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
1743       Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
1744                                           Pred, CostKind);
1745       Cost += 2 * thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy,
1746                                               CondTy, Pred, CostKind);
1747       return Cost;
1748     }
1749     case Intrinsic::uadd_sat:
1750     case Intrinsic::usub_sat: {
1751       Type *CondTy = RetTy->getWithNewBitWidth(1);
1752 
1753       Type *OpTy = StructType::create({RetTy, CondTy});
1754       Intrinsic::ID OverflowOp = IID == Intrinsic::uadd_sat
1755                                      ? Intrinsic::uadd_with_overflow
1756                                      : Intrinsic::usub_with_overflow;
1757 
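           // Costed as the matching overflow intrinsic plus a single select against
           // the saturation value.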
1758       InstructionCost Cost = 0;
1759       IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
1760                                     nullptr, ScalarizationCostPassed);
1761       Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
1762       Cost +=
1763           thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1764                                       CmpInst::BAD_ICMP_PREDICATE, CostKind);
1765       return Cost;
1766     }
1767     case Intrinsic::smul_fix:
1768     case Intrinsic::umul_fix: {
1769       unsigned ExtSize = RetTy->getScalarSizeInBits() * 2;
1770       Type *ExtTy = RetTy->getWithNewBitWidth(ExtSize);
1771 
1772       unsigned ExtOp =
1773           IID == Intrinsic::smul_fix ? Instruction::SExt : Instruction::ZExt;
1774       TTI::CastContextHint CCH = TTI::CastContextHint::None;
1775 
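           // Costed as the usual fixed-point multiply expansion: sign-/zero-extend
           // both operands, multiply in the double-width type, then reassemble the
           // result with trunc, lshr, shl and or.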
1776       InstructionCost Cost = 0;
1777       Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, RetTy, CCH, CostKind);
1778       Cost +=
1779           thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
1780       Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, RetTy, ExtTy,
1781                                             CCH, CostKind);
1782       Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, RetTy,
1783                                               CostKind, TTI::OK_AnyValue,
1784                                               TTI::OK_UniformConstantValue);
1785       Cost += thisT()->getArithmeticInstrCost(Instruction::Shl, RetTy, CostKind,
1786                                               TTI::OK_AnyValue,
1787                                               TTI::OK_UniformConstantValue);
1788       Cost += thisT()->getArithmeticInstrCost(Instruction::Or, RetTy, CostKind);
1789       return Cost;
1790     }
1791     case Intrinsic::sadd_with_overflow:
1792     case Intrinsic::ssub_with_overflow: {
1793       Type *SumTy = RetTy->getContainedType(0);
1794       Type *OverflowTy = RetTy->getContainedType(1);
1795       unsigned Opcode = IID == Intrinsic::sadd_with_overflow
1796                             ? BinaryOperator::Add
1797                             : BinaryOperator::Sub;
1798 
1799       //   Add:
1800       //   Overflow -> (Result < LHS) ^ (RHS < 0)
1801       //   Sub:
1802       //   Overflow -> (Result < LHS) ^ (RHS > 0)
1803       InstructionCost Cost = 0;
1804       Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind);
1805       Cost += 2 * thisT()->getCmpSelInstrCost(
1806                       Instruction::ICmp, SumTy, OverflowTy,
1807                       CmpInst::ICMP_SGT, CostKind);
1808       Cost += thisT()->getArithmeticInstrCost(BinaryOperator::Xor, OverflowTy,
1809                                               CostKind);
1810       return Cost;
1811     }
1812     case Intrinsic::uadd_with_overflow:
1813     case Intrinsic::usub_with_overflow: {
1814       Type *SumTy = RetTy->getContainedType(0);
1815       Type *OverflowTy = RetTy->getContainedType(1);
1816       unsigned Opcode = IID == Intrinsic::uadd_with_overflow
1817                             ? BinaryOperator::Add
1818                             : BinaryOperator::Sub;
1819       CmpInst::Predicate Pred = IID == Intrinsic::uadd_with_overflow
1820                                     ? CmpInst::ICMP_ULT
1821                                     : CmpInst::ICMP_UGT;
1822 
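           //   uadd: Overflow -> (Result u< LHS)
           //   usub: Overflow -> (Result u> LHS)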
1823       InstructionCost Cost = 0;
1824       Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind);
1825       Cost +=
1826           thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SumTy, OverflowTy,
1827                                       Pred, CostKind);
1828       return Cost;
1829     }
1830     case Intrinsic::smul_with_overflow:
1831     case Intrinsic::umul_with_overflow: {
1832       Type *MulTy = RetTy->getContainedType(0);
1833       Type *OverflowTy = RetTy->getContainedType(1);
1834       unsigned ExtSize = MulTy->getScalarSizeInBits() * 2;
1835       Type *ExtTy = MulTy->getWithNewBitWidth(ExtSize);
1836       bool IsSigned = IID == Intrinsic::smul_with_overflow;
1837 
1838       unsigned ExtOp = IsSigned ? Instruction::SExt : Instruction::ZExt;
1839       TTI::CastContextHint CCH = TTI::CastContextHint::None;
1840 
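           // Costed as: extend both operands, multiply in the double-width type,
           // truncate the low half back, shift the high half down, and compare it
           // against the sign bits of the low half (against zero when unsigned).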
1841       InstructionCost Cost = 0;
1842       Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, MulTy, CCH, CostKind);
1843       Cost +=
1844           thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
1845       Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, MulTy, ExtTy,
1846                                             CCH, CostKind);
1847       Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, ExtTy,
1848                                               CostKind, TTI::OK_AnyValue,
1849                                               TTI::OK_UniformConstantValue);
1850 
1851       if (IsSigned)
1852         Cost += thisT()->getArithmeticInstrCost(Instruction::AShr, MulTy,
1853                                                 CostKind, TTI::OK_AnyValue,
1854                                                 TTI::OK_UniformConstantValue);
1855 
1856       Cost += thisT()->getCmpSelInstrCost(
1857           BinaryOperator::ICmp, MulTy, OverflowTy, CmpInst::ICMP_NE, CostKind);
1858       return Cost;
1859     }
1860     case Intrinsic::ctpop:
1861       ISDs.push_back(ISD::CTPOP);
1862       // In case of legalization use TCC_Expensive. This is cheaper than a
1863       // library call but still not a cheap instruction.
1864       SingleCallCost = TargetTransformInfo::TCC_Expensive;
1865       break;
1866     case Intrinsic::ctlz:
1867       ISDs.push_back(ISD::CTLZ);
1868       break;
1869     case Intrinsic::cttz:
1870       ISDs.push_back(ISD::CTTZ);
1871       break;
1872     case Intrinsic::bswap:
1873       ISDs.push_back(ISD::BSWAP);
1874       break;
1875     case Intrinsic::bitreverse:
1876       ISDs.push_back(ISD::BITREVERSE);
1877       break;
1878     }
1879 
1880     const TargetLoweringBase *TLI = getTLI();
1881     std::pair<InstructionCost, MVT> LT =
1882         TLI->getTypeLegalizationCost(DL, RetTy);
1883 
1884     SmallVector<InstructionCost, 2> LegalCost;
1885     SmallVector<InstructionCost, 2> CustomCost;
1886     for (unsigned ISD : ISDs) {
1887       if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
1888         if (IID == Intrinsic::fabs && LT.second.isFloatingPoint() &&
1889             TLI->isFAbsFree(LT.second)) {
1890           return 0;
1891         }
1892 
1893         // The operation is legal. Assume it costs 1.
1894         // If the type is split to multiple registers, assume that there is some
1895         // overhead to this.
1896         // TODO: Once we have extract/insert subvector cost we need to use them.
1897         if (LT.first > 1)
1898           LegalCost.push_back(LT.first * 2);
1899         else
1900           LegalCost.push_back(LT.first * 1);
1901       } else if (!TLI->isOperationExpand(ISD, LT.second)) {
1902         // If the operation is custom lowered then assume
1903         // that the code is twice as expensive.
1904         CustomCost.push_back(LT.first * 2);
1905       }
1906     }
1907 
1908     auto *MinLegalCostI = std::min_element(LegalCost.begin(), LegalCost.end());
1909     if (MinLegalCostI != LegalCost.end())
1910       return *MinLegalCostI;
1911 
1912     auto MinCustomCostI =
1913         std::min_element(CustomCost.begin(), CustomCost.end());
1914     if (MinCustomCostI != CustomCost.end())
1915       return *MinCustomCostI;
1916 
1917     // If we can't lower fmuladd into an FMA estimate the cost as a floating
1918     // point mul followed by an add.
1919     if (IID == Intrinsic::fmuladd)
1920       return thisT()->getArithmeticInstrCost(BinaryOperator::FMul, RetTy,
1921                                              CostKind) +
1922              thisT()->getArithmeticInstrCost(BinaryOperator::FAdd, RetTy,
1923                                              CostKind);
1924     if (IID == Intrinsic::experimental_constrained_fmuladd) {
1925       IntrinsicCostAttributes FMulAttrs(
1926         Intrinsic::experimental_constrained_fmul, RetTy, Tys);
1927       IntrinsicCostAttributes FAddAttrs(
1928         Intrinsic::experimental_constrained_fadd, RetTy, Tys);
1929       return thisT()->getIntrinsicInstrCost(FMulAttrs, CostKind) +
1930              thisT()->getIntrinsicInstrCost(FAddAttrs, CostKind);
1931     }
1932 
1933     // Else, assume that we need to scalarize this intrinsic. For math builtins
1934     // this will emit a costly libcall, adding call overhead and spills. Make it
1935     // very expensive.
1936     if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
1937       // Scalable vectors cannot be scalarized, so return Invalid.
1938       if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) {
1939             return isa<ScalableVectorType>(Ty);
1940           }))
1941         return InstructionCost::getInvalid();
1942 
1943       InstructionCost ScalarizationCost =
1944           SkipScalarizationCost ? ScalarizationCostPassed
1945                                 : getScalarizationOverhead(RetVTy, true, false);
1946 
1947       unsigned ScalarCalls = cast<FixedVectorType>(RetVTy)->getNumElements();
1948       SmallVector<Type *, 4> ScalarTys;
1949       for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
1950         Type *Ty = Tys[i];
1951         if (Ty->isVectorTy())
1952           Ty = Ty->getScalarType();
1953         ScalarTys.push_back(Ty);
1954       }
1955       IntrinsicCostAttributes Attrs(IID, RetTy->getScalarType(), ScalarTys, FMF);
1956       InstructionCost ScalarCost =
1957           thisT()->getIntrinsicInstrCost(Attrs, CostKind);
1958       for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
1959         if (auto *VTy = dyn_cast<VectorType>(Tys[i])) {
1960           if (!ICA.skipScalarizationCost())
1961             ScalarizationCost += getScalarizationOverhead(VTy, false, true);
1962           ScalarCalls = std::max(ScalarCalls,
1963                                  cast<FixedVectorType>(VTy)->getNumElements());
1964         }
1965       }
1966       return ScalarCalls * ScalarCost + ScalarizationCost;
1967     }
1968 
1969     // This is going to be turned into a library call, make it expensive.
1970     return SingleCallCost;
1971   }
1972 
1973   /// Compute a cost of the given call instruction.
1974   ///
1975   /// Compute the cost of calling function F with return type RetTy and
1976   /// argument types Tys. F might be nullptr; in that case the cost of an
1977   /// arbitrary call with the specified signature will be returned.
1978   /// This is used, for instance, when we estimate the cost of calling a
1979   /// vector counterpart of the given function.
1980   /// \param F Called function, might be nullptr.
1981   /// \param RetTy Return value type.
1982   /// \param Tys Argument types.
1983   /// \returns The cost of Call instruction.
1984   InstructionCost getCallInstrCost(Function *F, Type *RetTy,
1985                                    ArrayRef<Type *> Tys,
1986                                    TTI::TargetCostKind CostKind) {
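         // Conservatively assume an opaque call is expensive (the same magic
         // number used for the generic library-call cost above).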
1987     return 10;
1988   }
1989 
1990   unsigned getNumberOfParts(Type *Tp) {
1991     std::pair<InstructionCost, MVT> LT =
1992         getTLI()->getTypeLegalizationCost(DL, Tp);
1993     return *LT.first.getValue();
1994   }
1995 
1996   InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *,
1997                                             const SCEV *) {
1998     return 0;
1999   }
2000 
2001   /// Try to calculate arithmetic and shuffle op costs for reduction intrinsics.
2002   /// We're assuming that reduction operations are performed the following way:
2003   ///
2004   /// %val1 = shufflevector<n x t> %val, <n x t> %undef,
2005   /// <n x i32> <i32 n/2, i32 n/2 + 1, ..., i32 n, i32 undef, ..., i32 undef>
2006   ///            \----------------v-------------/  \----------v------------/
2007   ///                            n/2 elements               n/2 elements
2008   /// %red1 = op <n x t> %val, <n x t> %val1
2009   /// After this operation we have a vector %red1 where only the first n/2
2010   /// elements are meaningful, the second n/2 elements are undefined and can be
2011   /// dropped. All other operations are actually working with the vector of
2012   /// length n/2, not n, though the real vector length is still n.
2013   /// %val2 = shufflevector<n x t> %red1, <n x t> %undef,
2014   /// <n x i32> <i32 n/4, i32 n/4 + 1, ..., i32 n/2, i32 undef, ..., i32 undef>
2015   ///            \----------------v-------------/  \----------v------------/
2016   ///                            n/4 elements               3*n/4 elements
2017   /// %red2 = op <n x t> %red1, <n x t> %val2 - working with the vector of
2018   /// length n/2, the resulting vector has length n/4 etc.
2019   ///
2020   /// The cost model should take into account that the actual length of the
2021   /// vector is reduced on each iteration.
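       ///
       /// E.g. for an 8-element reduction where the widest legal vector holds 4
       /// elements, the estimate is roughly one extract-subvector shuffle plus one
       /// 4-wide op, then log2(4) = 2 shuffle + op levels on the 4-wide vector,
       /// and a final extractelement of lane 0.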
2022   InstructionCost getTreeReductionCost(unsigned Opcode, VectorType *Ty,
2023                                        TTI::TargetCostKind CostKind) {
2024     Type *ScalarTy = Ty->getElementType();
2025     unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
2026     if ((Opcode == Instruction::Or || Opcode == Instruction::And) &&
2027         ScalarTy == IntegerType::getInt1Ty(Ty->getContext()) &&
2028         NumVecElts >= 2) {
2029       // Or reduction for i1 is represented as:
2030       // %val = bitcast <ReduxWidth x i1> to iReduxWidth
2031       // %res = cmp ne iReduxWidth %val, 0
2032       // And reduction for i1 is represented as:
2033       // %val = bitcast <ReduxWidth x i1> to iReduxWidth
2034       // %res = cmp eq iReduxWidth %val, -1 (all ones)
2035       Type *ValTy = IntegerType::get(Ty->getContext(), NumVecElts);
2036       return thisT()->getCastInstrCost(Instruction::BitCast, ValTy, Ty,
2037                                        TTI::CastContextHint::None, CostKind) +
2038              thisT()->getCmpSelInstrCost(Instruction::ICmp, ValTy,
2039                                          CmpInst::makeCmpResultType(ValTy),
2040                                          CmpInst::BAD_ICMP_PREDICATE, CostKind);
2041     }
2042     unsigned NumReduxLevels = Log2_32(NumVecElts);
2043     InstructionCost ArithCost = 0;
2044     InstructionCost ShuffleCost = 0;
2045     std::pair<InstructionCost, MVT> LT =
2046         thisT()->getTLI()->getTypeLegalizationCost(DL, Ty);
2047     unsigned LongVectorCount = 0;
2048     unsigned MVTLen =
2049         LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
2050     while (NumVecElts > MVTLen) {
2051       NumVecElts /= 2;
2052       VectorType *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
2053       ShuffleCost += thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, None,
2054                                              NumVecElts, SubTy);
2055       ArithCost += thisT()->getArithmeticInstrCost(Opcode, SubTy, CostKind);
2056       Ty = SubTy;
2057       ++LongVectorCount;
2058     }
2059 
2060     NumReduxLevels -= LongVectorCount;
2061 
2062     // The minimal length of the vector is limited by the real length of vector
2063     // operations performed on the current platform. That's why several final
2064     // reduction operations are performed on the vectors with the same
2065     // architecture-dependent length.
2066 
2067     // By default reductions need one shuffle per reduction level.
2068     ShuffleCost += NumReduxLevels * thisT()->getShuffleCost(
2069                                      TTI::SK_PermuteSingleSrc, Ty, None, 0, Ty);
2070     ArithCost +=
2071         NumReduxLevels * thisT()->getArithmeticInstrCost(Opcode, Ty, CostKind);
2072     return ShuffleCost + ArithCost +
2073            thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
2074   }
2075 
2076   /// Try to calculate the cost of performing strict (in-order) reductions,
2077   /// which involves doing a sequence of floating point additions in lane
2078   /// order, starting with an initial value. For example, consider a scalar
2079   /// initial value 'InitVal' of type float and a vector of type <4 x float>:
2080   ///
2081   ///   Vector = <float %v0, float %v1, float %v2, float %v3>
2082   ///
2083   ///   %add1 = %InitVal + %v0
2084   ///   %add2 = %add1 + %v1
2085   ///   %add3 = %add2 + %v2
2086   ///   %add4 = %add3 + %v3
2087   ///
2088   /// As a simple estimate we can say the cost of such a reduction is 4 times
2089   /// the cost of a scalar FP addition. We can only estimate the costs for
2090   /// fixed-width vectors here because for scalable vectors we do not know the
2091   /// runtime number of operations.
2092   InstructionCost getOrderedReductionCost(unsigned Opcode, VectorType *Ty,
2093                                           TTI::TargetCostKind CostKind) {
2094     // Targets must implement a default value for the scalable case, since
2095     // we don't know how many lanes the vector has.
2096     if (isa<ScalableVectorType>(Ty))
2097       return InstructionCost::getInvalid();
2098 
2099     auto *VTy = cast<FixedVectorType>(Ty);
2100     InstructionCost ExtractCost =
2101         getScalarizationOverhead(VTy, /*Insert=*/false, /*Extract=*/true);
2102     InstructionCost ArithCost = thisT()->getArithmeticInstrCost(
2103         Opcode, VTy->getElementType(), CostKind);
2104     ArithCost *= VTy->getNumElements();
2105 
2106     return ExtractCost + ArithCost;
2107   }
2108 
2109   InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
2110                                              Optional<FastMathFlags> FMF,
2111                                              TTI::TargetCostKind CostKind) {
2112     if (TTI::requiresOrderedReduction(FMF))
2113       return getOrderedReductionCost(Opcode, Ty, CostKind);
2114     return getTreeReductionCost(Opcode, Ty, CostKind);
2115   }
2116 
2117   /// Try to calculate op costs for min/max reduction operations.
2118   /// \param CondTy Conditional type for the Select instruction.
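       /// The reduction is costed as a tree of shuffle + cmp + select steps,
       /// analogous to getTreeReductionCost above, plus a final extractelement.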
2119   InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
2120                                          bool IsUnsigned,
2121                                          TTI::TargetCostKind CostKind) {
2122     Type *ScalarTy = Ty->getElementType();
2123     Type *ScalarCondTy = CondTy->getElementType();
2124     unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
2125     unsigned NumReduxLevels = Log2_32(NumVecElts);
2126     unsigned CmpOpcode;
2127     if (Ty->isFPOrFPVectorTy()) {
2128       CmpOpcode = Instruction::FCmp;
2129     } else {
2130       assert(Ty->isIntOrIntVectorTy() &&
2131              "expecting floating point or integer type for min/max reduction");
2132       CmpOpcode = Instruction::ICmp;
2133     }
2134     InstructionCost MinMaxCost = 0;
2135     InstructionCost ShuffleCost = 0;
2136     std::pair<InstructionCost, MVT> LT =
2137         thisT()->getTLI()->getTypeLegalizationCost(DL, Ty);
2138     unsigned LongVectorCount = 0;
2139     unsigned MVTLen =
2140         LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
2141     while (NumVecElts > MVTLen) {
2142       NumVecElts /= 2;
2143       auto *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
2144       CondTy = FixedVectorType::get(ScalarCondTy, NumVecElts);
2145 
2146       ShuffleCost += thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, None,
2147                                              NumVecElts, SubTy);
2148       MinMaxCost +=
2149           thisT()->getCmpSelInstrCost(CmpOpcode, SubTy, CondTy,
2150                                       CmpInst::BAD_ICMP_PREDICATE, CostKind) +
2151           thisT()->getCmpSelInstrCost(Instruction::Select, SubTy, CondTy,
2152                                       CmpInst::BAD_ICMP_PREDICATE, CostKind);
2153       Ty = SubTy;
2154       ++LongVectorCount;
2155     }
2156 
2157     NumReduxLevels -= LongVectorCount;
2158 
2159     // The minimal length of the vector is limited by the real length of vector
2160     // operations performed on the current platform. That's why several final
2161     // reduction operations are performed on the vectors with the same
2162     // architecture-dependent length.
2163     ShuffleCost += NumReduxLevels * thisT()->getShuffleCost(
2164                                      TTI::SK_PermuteSingleSrc, Ty, None, 0, Ty);
2165     MinMaxCost +=
2166         NumReduxLevels *
2167         (thisT()->getCmpSelInstrCost(CmpOpcode, Ty, CondTy,
2168                                      CmpInst::BAD_ICMP_PREDICATE, CostKind) +
2169          thisT()->getCmpSelInstrCost(Instruction::Select, Ty, CondTy,
2170                                      CmpInst::BAD_ICMP_PREDICATE, CostKind));
2171     // The last min/max should be in vector registers and we counted it above,
2172     // so we just need a single extractelement.
2173     return ShuffleCost + MinMaxCost +
2174            thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
2175   }
2176 
2177   InstructionCost getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned,
2178                                               Type *ResTy, VectorType *Ty,
2179                                               TTI::TargetCostKind CostKind) {
2180     // Without any native support, this is equivalent to the cost of
2181     // vecreduce.add(ext), or vecreduce.add(mul(ext, ext)) if IsMLA.
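         // E.g. an unsigned MLA reduction of <16 x i8> into an i32 result is costed
         // roughly as two zexts to <16 x i32>, a <16 x i32> mul and a
         // vecreduce.add on <16 x i32>.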
2182     VectorType *ExtTy = VectorType::get(ResTy, Ty);
2183     InstructionCost RedCost = thisT()->getArithmeticReductionCost(
2184         Instruction::Add, ExtTy, None, CostKind);
2185     InstructionCost MulCost = 0;
2186     InstructionCost ExtCost = thisT()->getCastInstrCost(
2187         IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty,
2188         TTI::CastContextHint::None, CostKind);
2189     if (IsMLA) {
2190       MulCost =
2191           thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
2192       ExtCost *= 2;
2193     }
2194 
2195     return RedCost + MulCost + ExtCost;
2196   }
2197 
2198   InstructionCost getVectorSplitCost() { return 1; }
2199 
2200   /// @}
2201 };
2202 
2203 /// Concrete BasicTTIImpl that can be used if no further customization
2204 /// is needed.
2205 class BasicTTIImpl : public BasicTTIImplBase<BasicTTIImpl> {
2206   using BaseT = BasicTTIImplBase<BasicTTIImpl>;
2207 
2208   friend class BasicTTIImplBase<BasicTTIImpl>;
2209 
2210   const TargetSubtargetInfo *ST;
2211   const TargetLoweringBase *TLI;
2212 
2213   const TargetSubtargetInfo *getST() const { return ST; }
2214   const TargetLoweringBase *getTLI() const { return TLI; }
2215 
2216 public:
2217   explicit BasicTTIImpl(const TargetMachine *TM, const Function &F);
2218 };
2219 
2220 } // end namespace llvm
2221 
2222 #endif // LLVM_CODEGEN_BASICTTIIMPL_H
2223