1 //===- BasicTTIImpl.h -------------------------------------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// This file provides a helper that implements much of the TTI interface in
11 /// terms of the target-independent code generator and TargetLowering
12 /// interfaces.
13 //
14 //===----------------------------------------------------------------------===//
15 
16 #ifndef LLVM_CODEGEN_BASICTTIIMPL_H
17 #define LLVM_CODEGEN_BASICTTIIMPL_H
18 
19 #include "llvm/ADT/APInt.h"
20 #include "llvm/ADT/ArrayRef.h"
21 #include "llvm/ADT/BitVector.h"
22 #include "llvm/ADT/SmallPtrSet.h"
23 #include "llvm/ADT/SmallVector.h"
24 #include "llvm/Analysis/LoopInfo.h"
25 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
26 #include "llvm/Analysis/TargetTransformInfo.h"
27 #include "llvm/Analysis/TargetTransformInfoImpl.h"
28 #include "llvm/CodeGen/ISDOpcodes.h"
29 #include "llvm/CodeGen/TargetLowering.h"
30 #include "llvm/CodeGen/TargetSubtargetInfo.h"
31 #include "llvm/CodeGen/ValueTypes.h"
32 #include "llvm/IR/BasicBlock.h"
33 #include "llvm/IR/Constant.h"
34 #include "llvm/IR/Constants.h"
35 #include "llvm/IR/DataLayout.h"
36 #include "llvm/IR/DerivedTypes.h"
37 #include "llvm/IR/InstrTypes.h"
38 #include "llvm/IR/Instruction.h"
39 #include "llvm/IR/Instructions.h"
40 #include "llvm/IR/Intrinsics.h"
41 #include "llvm/IR/Operator.h"
42 #include "llvm/IR/Type.h"
43 #include "llvm/IR/Value.h"
44 #include "llvm/Support/Casting.h"
45 #include "llvm/Support/CommandLine.h"
46 #include "llvm/Support/ErrorHandling.h"
47 #include "llvm/Support/MachineValueType.h"
48 #include "llvm/Support/MathExtras.h"
49 #include "llvm/Target/TargetMachine.h"
50 #include <algorithm>
51 #include <cassert>
52 #include <cstdint>
53 #include <limits>
54 #include <utility>
55 
56 namespace llvm {
57 
58 class Function;
59 class GlobalValue;
60 class LLVMContext;
61 class ScalarEvolution;
62 class SCEV;
63 class TargetMachine;
64 
65 extern cl::opt<unsigned> PartialUnrollingThreshold;
66 
67 /// Base class which can be used to help build a TTI implementation.
68 ///
69 /// This class provides as much implementation of the TTI interface as is
70 /// possible using the target independent parts of the code generator.
71 ///
72 /// In order to subclass it, your class must implement a getST() method to
73 /// return the subtarget, and a getTLI() method to return the target lowering.
74 /// We need these methods implemented in the derived class so that this class
75 /// doesn't have to duplicate storage for them.
76 template <typename T>
77 class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
78 private:
79   using BaseT = TargetTransformInfoImplCRTPBase<T>;
80   using TTI = TargetTransformInfo;
81 
82   /// Helper function to access this as a T.
83   T *thisT() { return static_cast<T *>(this); }
84 
85   /// Estimate a cost of Broadcast as an extract and sequence of insert
86   /// operations.
87   InstructionCost getBroadcastShuffleOverhead(FixedVectorType *VTy) {
88     InstructionCost Cost = 0;
89     // Broadcast cost is equal to the cost of extracting the zero'th element
90     // plus the cost of inserting it into every element of the result vector.
91     Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, 0);
92 
93     for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
94       Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, i);
95     }
96     return Cost;
97   }
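  // For example, assuming the target reports a unit cost for each vector
  // extract and insert, broadcasting a <4 x float> is modelled above as one
  // extractelement plus four insertelements, i.e. a cost of roughly 5.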
98 
99   /// Estimate a cost of shuffle as a sequence of extract and insert
100   /// operations.
101   InstructionCost getPermuteShuffleOverhead(FixedVectorType *VTy) {
102     InstructionCost Cost = 0;
103     // Shuffle cost is equal to the cost of extracting each element from its
104     // source vector plus the cost of inserting them into the result vector.
105 
106     // E.g. a <4 x float> shuffle with mask <0,5,2,7>: we need to extract
107     // index 0 of the first vector, index 1 of the second vector, index 2 of
108     // the first vector and finally index 3 of the second vector, and insert
109     // them at indices <0,1,2,3> of the result vector.
110     for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
111       Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, i);
112       Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, i);
113     }
114     return Cost;
115   }
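  // Under the same unit-cost assumption, the <4 x float> example above is
  // modelled as four extracts plus four inserts, i.e. a cost of roughly 8.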
116 
117   /// Estimate a cost of subvector extraction as a sequence of extract and
118   /// insert operations.
119   InstructionCost getExtractSubvectorOverhead(VectorType *VTy, int Index,
120                                        FixedVectorType *SubVTy) {
121     assert(VTy && SubVTy &&
122            "Can only extract subvectors from vectors");
123     int NumSubElts = SubVTy->getNumElements();
124     assert((!isa<FixedVectorType>(VTy) ||
125             (Index + NumSubElts) <=
126                 (int)cast<FixedVectorType>(VTy)->getNumElements()) &&
127            "SK_ExtractSubvector index out of range");
128 
129     InstructionCost Cost = 0;
130     // Subvector extraction cost is equal to the cost of extracting the
131     // elements from the source vector type plus the cost of inserting them
132     // into the result (subvector) type.
133     for (int i = 0; i != NumSubElts; ++i) {
134       Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
135                                           i + Index);
136       Cost +=
137           thisT()->getVectorInstrCost(Instruction::InsertElement, SubVTy, i);
138     }
139     return Cost;
140   }
141 
142   /// Estimate a cost of subvector insertion as a sequence of extract and
143   /// insert operations.
144   InstructionCost getInsertSubvectorOverhead(VectorType *VTy, int Index,
145                                       FixedVectorType *SubVTy) {
146     assert(VTy && SubVTy &&
147            "Can only insert subvectors into vectors");
148     int NumSubElts = SubVTy->getNumElements();
149     assert((!isa<FixedVectorType>(VTy) ||
150             (Index + NumSubElts) <=
151                 (int)cast<FixedVectorType>(VTy)->getNumElements()) &&
152            "SK_InsertSubvector index out of range");
153 
154     InstructionCost Cost = 0;
155     // Subvector insertion cost is equal to the cost of extracting the
156     // elements from the subvector type plus the cost of inserting them into
157     // the result vector type.
158     for (int i = 0; i != NumSubElts; ++i) {
159       Cost +=
160           thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVTy, i);
161       Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy,
162                                           i + Index);
163     }
164     return Cost;
165   }
166 
167   /// Local query method delegates up to T which *must* implement this!
168   const TargetSubtargetInfo *getST() const {
169     return static_cast<const T *>(this)->getST();
170   }
171 
172   /// Local query method delegates up to T which *must* implement this!
173   const TargetLoweringBase *getTLI() const {
174     return static_cast<const T *>(this)->getTLI();
175   }
176 
177   static ISD::MemIndexedMode getISDIndexedMode(TTI::MemIndexedMode M) {
178     switch (M) {
179       case TTI::MIM_Unindexed:
180         return ISD::UNINDEXED;
181       case TTI::MIM_PreInc:
182         return ISD::PRE_INC;
183       case TTI::MIM_PreDec:
184         return ISD::PRE_DEC;
185       case TTI::MIM_PostInc:
186         return ISD::POST_INC;
187       case TTI::MIM_PostDec:
188         return ISD::POST_DEC;
189     }
190     llvm_unreachable("Unexpected MemIndexedMode");
191   }
192 
193   InstructionCost getCommonMaskedMemoryOpCost(unsigned Opcode, Type *DataTy,
194                                               Align Alignment,
195                                               bool VariableMask,
196                                               bool IsGatherScatter,
197                                               TTI::TargetCostKind CostKind) {
198     // We cannot scalarize scalable vectors, so return Invalid.
199     if (isa<ScalableVectorType>(DataTy))
200       return InstructionCost::getInvalid();
201 
202     auto *VT = cast<FixedVectorType>(DataTy);
203     // Assume the target does not have support for gather/scatter operations
204     // and provide a rough estimate.
205     //
206     // First, compute the cost of the individual memory operations.
207     InstructionCost AddrExtractCost =
208         IsGatherScatter
209             ? getVectorInstrCost(Instruction::ExtractElement,
210                                  FixedVectorType::get(
211                                      PointerType::get(VT->getElementType(), 0),
212                                      VT->getNumElements()),
213                                  -1)
214             : 0;
215     InstructionCost LoadCost =
216         VT->getNumElements() *
217         (AddrExtractCost +
218          getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind));
219 
220     // Next, compute the cost of packing the result in a vector.
221     InstructionCost PackingCost = getScalarizationOverhead(
222         VT, Opcode != Instruction::Store, Opcode == Instruction::Store);
223 
224     InstructionCost ConditionalCost = 0;
225     if (VariableMask) {
226       // Compute the cost of conditionally executing the memory operations with
227       // variable masks. This includes extracting the individual conditions,
228       // the branches, and the PHIs to combine the results.
229       // NOTE: Estimating the cost of conditionally executing the memory
230       // operations accurately is quite difficult and the current solution
231       // provides a very rough estimate only.
232       ConditionalCost =
233           VT->getNumElements() *
234           (getVectorInstrCost(
235                Instruction::ExtractElement,
236                FixedVectorType::get(Type::getInt1Ty(DataTy->getContext()),
237                                     VT->getNumElements()),
238                -1) +
239            getCFInstrCost(Instruction::Br, CostKind) +
240            getCFInstrCost(Instruction::PHI, CostKind));
241     }
242 
243     return LoadCost + PackingCost + ConditionalCost;
244   }
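  // As a rough sketch, for a gather of <4 x i32> with a variable mask this
  // models: four (address extract + scalar load) pairs, plus the overhead of
  // inserting the four results into a vector, plus four (i1 extract + branch +
  // PHI) triples for the conditional execution, each priced by the target's
  // per-operation cost hooks.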
245 
246 protected:
247   explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL)
248       : BaseT(DL) {}
249   virtual ~BasicTTIImplBase() = default;
250 
251   using TargetTransformInfoImplBase::DL;
252 
253 public:
254   /// \name Scalar TTI Implementations
255   /// @{
256   bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth,
257                                       unsigned AddressSpace, Align Alignment,
258                                       bool *Fast) const {
259     EVT E = EVT::getIntegerVT(Context, BitWidth);
260     return getTLI()->allowsMisalignedMemoryAccesses(
261         E, AddressSpace, Alignment, MachineMemOperand::MONone, Fast);
262   }
263 
264   bool hasBranchDivergence() { return false; }
265 
266   bool useGPUDivergenceAnalysis() { return false; }
267 
268   bool isSourceOfDivergence(const Value *V) { return false; }
269 
270   bool isAlwaysUniform(const Value *V) { return false; }
271 
272   unsigned getFlatAddressSpace() {
273     // Return an invalid address space.
274     return -1;
275   }
276 
277   bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
278                                   Intrinsic::ID IID) const {
279     return false;
280   }
281 
282   bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const {
283     return getTLI()->getTargetMachine().isNoopAddrSpaceCast(FromAS, ToAS);
284   }
285 
286   unsigned getAssumedAddrSpace(const Value *V) const {
287     return getTLI()->getTargetMachine().getAssumedAddrSpace(V);
288   }
289 
290   std::pair<const Value *, unsigned>
291   getPredicatedAddrSpace(const Value *V) const {
292     return getTLI()->getTargetMachine().getPredicatedAddrSpace(V);
293   }
294 
295   Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
296                                           Value *NewV) const {
297     return nullptr;
298   }
299 
300   bool isLegalAddImmediate(int64_t imm) {
301     return getTLI()->isLegalAddImmediate(imm);
302   }
303 
304   bool isLegalICmpImmediate(int64_t imm) {
305     return getTLI()->isLegalICmpImmediate(imm);
306   }
307 
308   bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
309                              bool HasBaseReg, int64_t Scale,
310                              unsigned AddrSpace, Instruction *I = nullptr) {
311     TargetLoweringBase::AddrMode AM;
312     AM.BaseGV = BaseGV;
313     AM.BaseOffs = BaseOffset;
314     AM.HasBaseReg = HasBaseReg;
315     AM.Scale = Scale;
316     return getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace, I);
317   }
318 
319   unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy,
320                              Type *ScalarValTy) const {
321     auto &&IsSupportedByTarget = [this, ScalarMemTy, ScalarValTy](unsigned VF) {
322       auto *SrcTy = FixedVectorType::get(ScalarMemTy, VF / 2);
323       EVT VT = getTLI()->getValueType(DL, SrcTy);
324       if (getTLI()->isOperationLegal(ISD::STORE, VT) ||
325           getTLI()->isOperationCustom(ISD::STORE, VT))
326         return true;
327 
328       EVT ValVT =
329           getTLI()->getValueType(DL, FixedVectorType::get(ScalarValTy, VF / 2));
330       EVT LegalizedVT =
331           getTLI()->getTypeToTransformTo(ScalarMemTy->getContext(), VT);
332       return getTLI()->isTruncStoreLegal(LegalizedVT, ValVT);
333     };
334     while (VF > 2 && IsSupportedByTarget(VF))
335       VF /= 2;
336     return VF;
337   }
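  // For example (a sketch): starting from VF == 8 with an i8 memory type, if
  // <4 x i8> and <2 x i8> stores are both supported (directly or as truncating
  // stores) the VF is halved down to 2; if only <4 x i8> stores are supported,
  // 4 is returned instead.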
338 
339   bool isIndexedLoadLegal(TTI::MemIndexedMode M, Type *Ty,
340                           const DataLayout &DL) const {
341     EVT VT = getTLI()->getValueType(DL, Ty);
342     return getTLI()->isIndexedLoadLegal(getISDIndexedMode(M), VT);
343   }
344 
345   bool isIndexedStoreLegal(TTI::MemIndexedMode M, Type *Ty,
346                            const DataLayout &DL) const {
347     EVT VT = getTLI()->getValueType(DL, Ty);
348     return getTLI()->isIndexedStoreLegal(getISDIndexedMode(M), VT);
349   }
350 
351   bool isLSRCostLess(TTI::LSRCost C1, TTI::LSRCost C2) {
352     return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
353   }
354 
355   bool isNumRegsMajorCostOfLSR() {
356     return TargetTransformInfoImplBase::isNumRegsMajorCostOfLSR();
357   }
358 
359   bool isProfitableLSRChainElement(Instruction *I) {
360     return TargetTransformInfoImplBase::isProfitableLSRChainElement(I);
361   }
362 
363   InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
364                                        int64_t BaseOffset, bool HasBaseReg,
365                                        int64_t Scale, unsigned AddrSpace) {
366     TargetLoweringBase::AddrMode AM;
367     AM.BaseGV = BaseGV;
368     AM.BaseOffs = BaseOffset;
369     AM.HasBaseReg = HasBaseReg;
370     AM.Scale = Scale;
371     return getTLI()->getScalingFactorCost(DL, AM, Ty, AddrSpace);
372   }
373 
374   bool isTruncateFree(Type *Ty1, Type *Ty2) {
375     return getTLI()->isTruncateFree(Ty1, Ty2);
376   }
377 
378   bool isProfitableToHoist(Instruction *I) {
379     return getTLI()->isProfitableToHoist(I);
380   }
381 
382   bool useAA() const { return getST()->useAA(); }
383 
384   bool isTypeLegal(Type *Ty) {
385     EVT VT = getTLI()->getValueType(DL, Ty);
386     return getTLI()->isTypeLegal(VT);
387   }
388 
389   unsigned getRegUsageForType(Type *Ty) {
390     EVT ETy = getTLI()->getValueType(DL, Ty);
391     return getTLI()->getNumRegisters(Ty->getContext(), ETy);
392   }
393 
394   InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr,
395                              ArrayRef<const Value *> Operands,
396                              TTI::TargetCostKind CostKind) {
397     return BaseT::getGEPCost(PointeeType, Ptr, Operands, CostKind);
398   }
399 
400   unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
401                                             unsigned &JumpTableSize,
402                                             ProfileSummaryInfo *PSI,
403                                             BlockFrequencyInfo *BFI) {
404     /// Try to find the estimated number of clusters. Note that the number of
405     /// clusters identified in this function could be different from the actual
406     /// numbers found in lowering. This function ignores switches that are
407     /// lowered with a mix of jump table / bit test / BTree. It was initially
408     /// intended for estimating the cost of a switch in the inline cost
409     /// heuristic, but it is a generic cost model usable in other places
410     /// (e.g., in loop unrolling).
411     unsigned N = SI.getNumCases();
412     const TargetLoweringBase *TLI = getTLI();
413     const DataLayout &DL = this->getDataLayout();
414 
415     JumpTableSize = 0;
416     bool IsJTAllowed = TLI->areJTsAllowed(SI.getParent()->getParent());
417 
418     // Early exit if there are no cases, or if neither a jump table nor a bit test is possible.
419     if (N < 1 || (!IsJTAllowed && DL.getIndexSizeInBits(0u) < N))
420       return N;
421 
422     APInt MaxCaseVal = SI.case_begin()->getCaseValue()->getValue();
423     APInt MinCaseVal = MaxCaseVal;
424     for (auto CI : SI.cases()) {
425       const APInt &CaseVal = CI.getCaseValue()->getValue();
426       if (CaseVal.sgt(MaxCaseVal))
427         MaxCaseVal = CaseVal;
428       if (CaseVal.slt(MinCaseVal))
429         MinCaseVal = CaseVal;
430     }
431 
432     // Check if suitable for a bit test
433     if (N <= DL.getIndexSizeInBits(0u)) {
434       SmallPtrSet<const BasicBlock *, 4> Dests;
435       for (auto I : SI.cases())
436         Dests.insert(I.getCaseSuccessor());
437 
438       if (TLI->isSuitableForBitTests(Dests.size(), N, MinCaseVal, MaxCaseVal,
439                                      DL))
440         return 1;
441     }
442 
443     // Check if suitable for a jump table.
444     if (IsJTAllowed) {
445       if (N < 2 || N < TLI->getMinimumJumpTableEntries())
446         return N;
447       uint64_t Range =
448           (MaxCaseVal - MinCaseVal)
449               .getLimitedValue(std::numeric_limits<uint64_t>::max() - 1) + 1;
450       // Check whether a range of clusters is dense enough for a jump table
451       if (TLI->isSuitableForJumpTable(&SI, N, Range, PSI, BFI)) {
452         JumpTableSize = Range;
453         return 1;
454       }
455     }
456     return N;
457   }
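  // For example (a sketch): a switch with 100 cases covering most of the
  // values 0..119 would typically be dense enough for a jump table, so
  // JumpTableSize is set to the range (about 120) and a single cluster is
  // returned; 100 cases spread over a range of a million values is usually
  // too sparse, and the function falls back to returning the number of cases.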
458 
459   bool shouldBuildLookupTables() {
460     const TargetLoweringBase *TLI = getTLI();
461     return TLI->isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) ||
462            TLI->isOperationLegalOrCustom(ISD::BRIND, MVT::Other);
463   }
464 
465   bool shouldBuildRelLookupTables() const {
466     const TargetMachine &TM = getTLI()->getTargetMachine();
467     // If non-PIC mode, do not generate a relative lookup table.
468     if (!TM.isPositionIndependent())
469       return false;
470 
471     /// Relative lookup table entries consist of 32-bit offsets.
472     /// Do not generate relative lookup tables for the medium and large code
473     /// models on 64-bit architectures, where 32-bit offsets might not be enough.
474     if (TM.getCodeModel() == CodeModel::Medium ||
475         TM.getCodeModel() == CodeModel::Large)
476       return false;
477 
478     Triple TargetTriple = TM.getTargetTriple();
479     if (!TargetTriple.isArch64Bit())
480       return false;
481 
482     // TODO: Triggers issues on aarch64 on darwin, so temporarily disable it
483     // there.
484     if (TargetTriple.getArch() == Triple::aarch64 && TargetTriple.isOSDarwin())
485       return false;
486 
487     return true;
488   }
489 
490   bool haveFastSqrt(Type *Ty) {
491     const TargetLoweringBase *TLI = getTLI();
492     EVT VT = TLI->getValueType(DL, Ty);
493     return TLI->isTypeLegal(VT) &&
494            TLI->isOperationLegalOrCustom(ISD::FSQRT, VT);
495   }
496 
497   bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
498     return true;
499   }
500 
501   InstructionCost getFPOpCost(Type *Ty) {
502     // Check whether FADD is available, as a proxy for floating-point in
503     // general.
504     const TargetLoweringBase *TLI = getTLI();
505     EVT VT = TLI->getValueType(DL, Ty);
506     if (TLI->isOperationLegalOrCustomOrPromote(ISD::FADD, VT))
507       return TargetTransformInfo::TCC_Basic;
508     return TargetTransformInfo::TCC_Expensive;
509   }
510 
511   unsigned getInliningThresholdMultiplier() { return 1; }
512   unsigned adjustInliningThreshold(const CallBase *CB) { return 0; }
513 
514   int getInlinerVectorBonusPercent() { return 150; }
515 
516   void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
517                                TTI::UnrollingPreferences &UP,
518                                OptimizationRemarkEmitter *ORE) {
519     // This unrolling functionality is target independent, but to provide some
520     // motivation for its intended use, for x86:
521 
522     // According to the Intel 64 and IA-32 Architectures Optimization Reference
523     // Manual, Intel Core models and later have a loop stream detector (and
524     // associated uop queue) that can benefit from partial unrolling.
525     // The relevant requirements are:
526     //  - The loop must have no more than 4 (8 for Nehalem and later) branches
527     //    taken, and none of them may be calls.
528     //  - The loop can have no more than 18 (28 for Nehalem and later) uops.
529 
530     // According to the Software Optimization Guide for AMD Family 15h
531     // Processors, models 30h-4fh (Steamroller and later) have a loop predictor
532     // and loop buffer which can benefit from partial unrolling.
533     // The relevant requirements are:
534     //  - The loop must have fewer than 16 branches
535     //  - The loop must have fewer than 40 uops in all executed loop branches
536 
537     // The number of taken branches in a loop is hard to estimate here, and
538     // benchmarking has revealed that it is better not to be conservative when
539     // estimating the branch count. As a result, we'll ignore the branch limits
540     // until someone finds a case where it matters in practice.
541 
542     unsigned MaxOps;
543     const TargetSubtargetInfo *ST = getST();
544     if (PartialUnrollingThreshold.getNumOccurrences() > 0)
545       MaxOps = PartialUnrollingThreshold;
546     else if (ST->getSchedModel().LoopMicroOpBufferSize > 0)
547       MaxOps = ST->getSchedModel().LoopMicroOpBufferSize;
548     else
549       return;
550 
551     // Scan the loop: don't unroll loops with calls.
552     for (BasicBlock *BB : L->blocks()) {
553       for (Instruction &I : *BB) {
554         if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
555           if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
556             if (!thisT()->isLoweredToCall(F))
557               continue;
558           }
559 
560           if (ORE) {
561             ORE->emit([&]() {
562               return OptimizationRemark("TTI", "DontUnroll", L->getStartLoc(),
563                                         L->getHeader())
564                      << "advising against unrolling the loop because it "
565                         "contains a "
566                      << ore::NV("Call", &I);
567             });
568           }
569           return;
570         }
571       }
572     }
573 
574     // Enable runtime and partial unrolling up to the specified size.
575     // Enable using trip count upper bound to unroll loops.
576     UP.Partial = UP.Runtime = UP.UpperBound = true;
577     UP.PartialThreshold = MaxOps;
578 
579     // Avoid unrolling when optimizing for size.
580     UP.OptSizeThreshold = 0;
581     UP.PartialOptSizeThreshold = 0;
582 
583     // Set the number of instructions optimized away when the "back edge"
584     // becomes a "fall through" to its default value of 2.
585     UP.BEInsns = 2;
586   }
587 
588   void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
589                              TTI::PeelingPreferences &PP) {
590     PP.PeelCount = 0;
591     PP.AllowPeeling = true;
592     PP.AllowLoopNestsPeeling = false;
593     PP.PeelProfiledIterations = true;
594   }
595 
596   bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
597                                 AssumptionCache &AC,
598                                 TargetLibraryInfo *LibInfo,
599                                 HardwareLoopInfo &HWLoopInfo) {
600     return BaseT::isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
601   }
602 
603   bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
604                                    AssumptionCache &AC, TargetLibraryInfo *TLI,
605                                    DominatorTree *DT,
606                                    LoopVectorizationLegality *LVL) {
607     return BaseT::preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LVL);
608   }
609 
610   PredicationStyle emitGetActiveLaneMask() {
611     return BaseT::emitGetActiveLaneMask();
612   }
613 
614   Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
615                                                IntrinsicInst &II) {
616     return BaseT::instCombineIntrinsic(IC, II);
617   }
618 
619   Optional<Value *> simplifyDemandedUseBitsIntrinsic(InstCombiner &IC,
620                                                      IntrinsicInst &II,
621                                                      APInt DemandedMask,
622                                                      KnownBits &Known,
623                                                      bool &KnownBitsComputed) {
624     return BaseT::simplifyDemandedUseBitsIntrinsic(IC, II, DemandedMask, Known,
625                                                    KnownBitsComputed);
626   }
627 
628   Optional<Value *> simplifyDemandedVectorEltsIntrinsic(
629       InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
630       APInt &UndefElts2, APInt &UndefElts3,
631       std::function<void(Instruction *, unsigned, APInt, APInt &)>
632           SimplifyAndSetOp) {
633     return BaseT::simplifyDemandedVectorEltsIntrinsic(
634         IC, II, DemandedElts, UndefElts, UndefElts2, UndefElts3,
635         SimplifyAndSetOp);
636   }
637 
638   InstructionCost getInstructionLatency(const Instruction *I) {
639     if (isa<LoadInst>(I))
640       return getST()->getSchedModel().DefaultLoadLatency;
641 
642     return BaseT::getInstructionLatency(I);
643   }
644 
645   virtual Optional<unsigned>
646   getCacheSize(TargetTransformInfo::CacheLevel Level) const {
647     return Optional<unsigned>(
648       getST()->getCacheSize(static_cast<unsigned>(Level)));
649   }
650 
651   virtual Optional<unsigned>
652   getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const {
653     Optional<unsigned> TargetResult =
654         getST()->getCacheAssociativity(static_cast<unsigned>(Level));
655 
656     if (TargetResult)
657       return TargetResult;
658 
659     return BaseT::getCacheAssociativity(Level);
660   }
661 
662   virtual unsigned getCacheLineSize() const {
663     return getST()->getCacheLineSize();
664   }
665 
666   virtual unsigned getPrefetchDistance() const {
667     return getST()->getPrefetchDistance();
668   }
669 
670   virtual unsigned getMinPrefetchStride(unsigned NumMemAccesses,
671                                         unsigned NumStridedMemAccesses,
672                                         unsigned NumPrefetches,
673                                         bool HasCall) const {
674     return getST()->getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses,
675                                          NumPrefetches, HasCall);
676   }
677 
678   virtual unsigned getMaxPrefetchIterationsAhead() const {
679     return getST()->getMaxPrefetchIterationsAhead();
680   }
681 
682   virtual bool enableWritePrefetching() const {
683     return getST()->enableWritePrefetching();
684   }
685 
686   /// @}
687 
688   /// \name Vector TTI Implementations
689   /// @{
690 
691   TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
692     return TypeSize::getFixed(32);
693   }
694 
695   Optional<unsigned> getMaxVScale() const { return None; }
696   Optional<unsigned> getVScaleForTuning() const { return None; }
697 
698   /// Estimate the overhead of scalarizing an instruction. Insert and Extract
699   /// are set if the demanded result elements need to be inserted and/or
700   /// extracted from vectors.
701   InstructionCost getScalarizationOverhead(VectorType *InTy,
702                                            const APInt &DemandedElts,
703                                            bool Insert, bool Extract) {
704     /// FIXME: a bitfield is not a reasonable abstraction for talking about
705     /// which elements are needed from a scalable vector
706     if (isa<ScalableVectorType>(InTy))
707       return InstructionCost::getInvalid();
708     auto *Ty = cast<FixedVectorType>(InTy);
709 
710     assert(DemandedElts.getBitWidth() == Ty->getNumElements() &&
711            "Vector size mismatch");
712 
713     InstructionCost Cost = 0;
714 
715     for (int i = 0, e = Ty->getNumElements(); i < e; ++i) {
716       if (!DemandedElts[i])
717         continue;
718       if (Insert)
719         Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, Ty, i);
720       if (Extract)
721         Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, i);
722     }
723 
724     return Cost;
725   }
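  // For example, with a <4 x i32> type, DemandedElts == 0b0101 and both Insert
  // and Extract set, the overhead is two inserts plus two extracts at whatever
  // per-element cost the target reports.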
726 
727   /// Helper wrapper for the DemandedElts variant of getScalarizationOverhead.
728   InstructionCost getScalarizationOverhead(VectorType *InTy, bool Insert,
729                                            bool Extract) {
730     if (isa<ScalableVectorType>(InTy))
731       return InstructionCost::getInvalid();
732     auto *Ty = cast<FixedVectorType>(InTy);
733 
734     APInt DemandedElts = APInt::getAllOnes(Ty->getNumElements());
735     return thisT()->getScalarizationOverhead(Ty, DemandedElts, Insert, Extract);
736   }
737 
738   /// Estimate the overhead of scalarizing an instruction's unique
739   /// non-constant operands. The (potentially vector) types to use for each
740   /// argument are passed via Tys.
741   InstructionCost getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
742                                                    ArrayRef<Type *> Tys) {
743     assert(Args.size() == Tys.size() && "Expected matching Args and Tys");
744 
745     InstructionCost Cost = 0;
746     SmallPtrSet<const Value*, 4> UniqueOperands;
747     for (int I = 0, E = Args.size(); I != E; I++) {
748       // Disregard things like metadata arguments.
749       const Value *A = Args[I];
750       Type *Ty = Tys[I];
751       if (!Ty->isIntOrIntVectorTy() && !Ty->isFPOrFPVectorTy() &&
752           !Ty->isPtrOrPtrVectorTy())
753         continue;
754 
755       if (!isa<Constant>(A) && UniqueOperands.insert(A).second) {
756         if (auto *VecTy = dyn_cast<VectorType>(Ty))
757           Cost += getScalarizationOverhead(VecTy, false, true);
758       }
759     }
760 
761     return Cost;
762   }
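  // For example, for Args == {%a, %a, 7} with Tys == {<4 x i32>, <4 x i32>,
  // i32}, only the unique non-constant vector operand %a contributes, and its
  // extract-only scalarization overhead is counted once.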
763 
764   /// Estimate the overhead of scalarizing the inputs and outputs of an
765   /// instruction, with return type RetTy and arguments Args of type Tys. If
766   /// Args are unknown (empty), then the cost associated with one argument is
767   /// added as a heuristic.
768   InstructionCost getScalarizationOverhead(VectorType *RetTy,
769                                            ArrayRef<const Value *> Args,
770                                            ArrayRef<Type *> Tys) {
771     InstructionCost Cost = getScalarizationOverhead(RetTy, true, false);
772     if (!Args.empty())
773       Cost += getOperandsScalarizationOverhead(Args, Tys);
774     else
775       // When no information on arguments is provided, we add the cost
776       // associated with one argument as a heuristic.
777       Cost += getScalarizationOverhead(RetTy, false, true);
778 
779     return Cost;
780   }
781 
782   unsigned getMaxInterleaveFactor(unsigned VF) { return 1; }
783 
784   InstructionCost getArithmeticInstrCost(
785       unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
786       TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
787       TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
788       TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
789       TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
790       ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
791       const Instruction *CxtI = nullptr) {
792     // Check if any of the operands are vector operands.
793     const TargetLoweringBase *TLI = getTLI();
794     int ISD = TLI->InstructionOpcodeToISD(Opcode);
795     assert(ISD && "Invalid opcode");
796 
797     // TODO: Handle more cost kinds.
798     if (CostKind != TTI::TCK_RecipThroughput)
799       return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind,
800                                            Opd1Info, Opd2Info,
801                                            Opd1PropInfo, Opd2PropInfo,
802                                            Args, CxtI);
803 
804     std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
805 
806     bool IsFloat = Ty->isFPOrFPVectorTy();
807     // Assume that floating point arithmetic operations cost twice as much as
808     // integer operations.
809     InstructionCost OpCost = (IsFloat ? 2 : 1);
810 
811     if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
812       // The operation is legal. Assume it costs 1.
813       // TODO: Once we have extract/insert subvector cost we need to use them.
814       return LT.first * OpCost;
815     }
816 
817     if (!TLI->isOperationExpand(ISD, LT.second)) {
818       // If the operation is custom lowered, then assume that the code is twice
819       // as expensive.
820       return LT.first * 2 * OpCost;
821     }
822 
823     // An 'Expand' of URem and SRem is special because it may default
824     // to expanding the operation into a sequence of sub-operations
825     // i.e. X % Y -> X-(X/Y)*Y.
826     if (ISD == ISD::UREM || ISD == ISD::SREM) {
827       bool IsSigned = ISD == ISD::SREM;
828       if (TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIVREM : ISD::UDIVREM,
829                                         LT.second) ||
830           TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIV : ISD::UDIV,
831                                         LT.second)) {
832         unsigned DivOpc = IsSigned ? Instruction::SDiv : Instruction::UDiv;
833         InstructionCost DivCost = thisT()->getArithmeticInstrCost(
834             DivOpc, Ty, CostKind, Opd1Info, Opd2Info, Opd1PropInfo,
835             Opd2PropInfo);
836         InstructionCost MulCost =
837             thisT()->getArithmeticInstrCost(Instruction::Mul, Ty, CostKind);
838         InstructionCost SubCost =
839             thisT()->getArithmeticInstrCost(Instruction::Sub, Ty, CostKind);
840         return DivCost + MulCost + SubCost;
841       }
842     }
843 
844     // We cannot scalarize scalable vectors, so return Invalid.
845     if (isa<ScalableVectorType>(Ty))
846       return InstructionCost::getInvalid();
847 
848     // Else, assume that we need to scalarize this op.
849     // TODO: If one of the types get legalized by splitting, handle this
850     // similarly to what getCastInstrCost() does.
851     if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
852       InstructionCost Cost = thisT()->getArithmeticInstrCost(
853           Opcode, VTy->getScalarType(), CostKind, Opd1Info, Opd2Info,
854           Opd1PropInfo, Opd2PropInfo, Args, CxtI);
855       // Return the cost of multiple scalar invocations plus the cost of
856       // inserting and extracting the values.
857       SmallVector<Type *> Tys(Args.size(), Ty);
858       return getScalarizationOverhead(VTy, Args, Tys) +
859              VTy->getNumElements() * Cost;
860     }
861 
862     // We don't know anything about this scalar instruction.
863     return OpCost;
864   }
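  // As a rough sketch of the scalarization fallback above: a <4 x i32>
  // operation whose legalized type must be expanded is costed as four scalar
  // operations plus the insert/extract overhead for its result and its unique
  // non-constant operands.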
865 
866   TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind,
867                                               ArrayRef<int> Mask) const {
868     int Limit = Mask.size() * 2;
869     if (Mask.empty() ||
870         // Extra check required by isSingleSourceMaskImpl function (called by
871         // ShuffleVectorInst::isSingleSourceMask).
872         any_of(Mask, [Limit](int I) { return I >= Limit; }))
873       return Kind;
874     switch (Kind) {
875     case TTI::SK_PermuteSingleSrc:
876       if (ShuffleVectorInst::isReverseMask(Mask))
877         return TTI::SK_Reverse;
878       if (ShuffleVectorInst::isZeroEltSplatMask(Mask))
879         return TTI::SK_Broadcast;
880       break;
881     case TTI::SK_PermuteTwoSrc:
882       if (ShuffleVectorInst::isSelectMask(Mask))
883         return TTI::SK_Select;
884       if (ShuffleVectorInst::isTransposeMask(Mask))
885         return TTI::SK_Transpose;
886       break;
887     case TTI::SK_Select:
888     case TTI::SK_Reverse:
889     case TTI::SK_Broadcast:
890     case TTI::SK_Transpose:
891     case TTI::SK_InsertSubvector:
892     case TTI::SK_ExtractSubvector:
893     case TTI::SK_Splice:
894       break;
895     }
896     return Kind;
897   }
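  // For example, a single-source mask <3,2,1,0> is reclassified as SK_Reverse,
  // and a two-source mask <0,5,2,7> (each lane taken from the corresponding
  // element of one of the two inputs) as SK_Select, letting targets report
  // cheaper costs for these patterns.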
898 
899   InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
900                                  ArrayRef<int> Mask, int Index,
901                                  VectorType *SubTp,
902                                  ArrayRef<const Value *> Args = None) {
903 
904     switch (improveShuffleKindFromMask(Kind, Mask)) {
905     case TTI::SK_Broadcast:
906       if (auto *FVT = dyn_cast<FixedVectorType>(Tp))
907         return getBroadcastShuffleOverhead(FVT);
908       return InstructionCost::getInvalid();
909     case TTI::SK_Select:
910     case TTI::SK_Splice:
911     case TTI::SK_Reverse:
912     case TTI::SK_Transpose:
913     case TTI::SK_PermuteSingleSrc:
914     case TTI::SK_PermuteTwoSrc:
915       if (auto *FVT = dyn_cast<FixedVectorType>(Tp))
916         return getPermuteShuffleOverhead(FVT);
917       return InstructionCost::getInvalid();
918     case TTI::SK_ExtractSubvector:
919       return getExtractSubvectorOverhead(Tp, Index,
920                                          cast<FixedVectorType>(SubTp));
921     case TTI::SK_InsertSubvector:
922       return getInsertSubvectorOverhead(Tp, Index,
923                                         cast<FixedVectorType>(SubTp));
924     }
925     llvm_unreachable("Unknown TTI::ShuffleKind");
926   }
927 
928   InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
929                                    TTI::CastContextHint CCH,
930                                    TTI::TargetCostKind CostKind,
931                                    const Instruction *I = nullptr) {
932     if (BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I) == 0)
933       return 0;
934 
935     const TargetLoweringBase *TLI = getTLI();
936     int ISD = TLI->InstructionOpcodeToISD(Opcode);
937     assert(ISD && "Invalid opcode");
938     std::pair<InstructionCost, MVT> SrcLT =
939         TLI->getTypeLegalizationCost(DL, Src);
940     std::pair<InstructionCost, MVT> DstLT =
941         TLI->getTypeLegalizationCost(DL, Dst);
942 
943     TypeSize SrcSize = SrcLT.second.getSizeInBits();
944     TypeSize DstSize = DstLT.second.getSizeInBits();
945     bool IntOrPtrSrc = Src->isIntegerTy() || Src->isPointerTy();
946     bool IntOrPtrDst = Dst->isIntegerTy() || Dst->isPointerTy();
947 
948     switch (Opcode) {
949     default:
950       break;
951     case Instruction::Trunc:
952       // Check for NOOP conversions.
953       if (TLI->isTruncateFree(SrcLT.second, DstLT.second))
954         return 0;
955       LLVM_FALLTHROUGH;
956     case Instruction::BitCast:
957       // Bitcasts between types that are legalized to the same type are free,
958       // and assume an int to/from ptr of the same size is also free.
959       if (SrcLT.first == DstLT.first && IntOrPtrSrc == IntOrPtrDst &&
960           SrcSize == DstSize)
961         return 0;
962       break;
963     case Instruction::FPExt:
964       if (I && getTLI()->isExtFree(I))
965         return 0;
966       break;
967     case Instruction::ZExt:
968       if (TLI->isZExtFree(SrcLT.second, DstLT.second))
969         return 0;
970       LLVM_FALLTHROUGH;
971     case Instruction::SExt:
972       if (I && getTLI()->isExtFree(I))
973         return 0;
974 
975       // If this is a zext/sext of a load, return 0 if the corresponding
976       // extending load exists on the target and the result type is legal.
977       if (CCH == TTI::CastContextHint::Normal) {
978         EVT ExtVT = EVT::getEVT(Dst);
979         EVT LoadVT = EVT::getEVT(Src);
980         unsigned LType =
981           ((Opcode == Instruction::ZExt) ? ISD::ZEXTLOAD : ISD::SEXTLOAD);
982         if (DstLT.first == SrcLT.first &&
983             TLI->isLoadExtLegal(LType, ExtVT, LoadVT))
984           return 0;
985       }
986       break;
987     case Instruction::AddrSpaceCast:
988       if (TLI->isFreeAddrSpaceCast(Src->getPointerAddressSpace(),
989                                    Dst->getPointerAddressSpace()))
990         return 0;
991       break;
992     }
993 
994     auto *SrcVTy = dyn_cast<VectorType>(Src);
995     auto *DstVTy = dyn_cast<VectorType>(Dst);
996 
997     // If the cast is marked as legal (or promote) then assume low cost.
998     if (SrcLT.first == DstLT.first &&
999         TLI->isOperationLegalOrPromote(ISD, DstLT.second))
1000       return SrcLT.first;
1001 
1002     // Handle scalar conversions.
1003     if (!SrcVTy && !DstVTy) {
1004       // Just check the op cost. If the operation is legal then assume it costs
1005       // 1.
1006       if (!TLI->isOperationExpand(ISD, DstLT.second))
1007         return 1;
1008 
1009       // Assume that illegal scalar instructions are expensive.
1010       return 4;
1011     }
1012 
1013     // Check vector-to-vector casts.
1014     if (DstVTy && SrcVTy) {
1015       // If the cast is between same-sized registers, then the check is simple.
1016       if (SrcLT.first == DstLT.first && SrcSize == DstSize) {
1017 
1018         // Assume that Zext is done using AND.
1019         if (Opcode == Instruction::ZExt)
1020           return SrcLT.first;
1021 
1022         // Assume that sext is done using SHL and SRA.
1023         if (Opcode == Instruction::SExt)
1024           return SrcLT.first * 2;
1025 
1026         // Just check the op cost. If the operation is legal then assume it
1027         // costs 1, multiplied by the type-legalization overhead.
1029         if (!TLI->isOperationExpand(ISD, DstLT.second))
1030           return SrcLT.first * 1;
1031       }
1032 
1033       // If we are legalizing by splitting, query the concrete TTI for the cost
1034       // of casting the original vector twice. We also need to factor in the
1035       // cost of the split itself. Count that as 1, to be consistent with
1036       // TLI->getTypeLegalizationCost().
1037       bool SplitSrc =
1038           TLI->getTypeAction(Src->getContext(), TLI->getValueType(DL, Src)) ==
1039           TargetLowering::TypeSplitVector;
1040       bool SplitDst =
1041           TLI->getTypeAction(Dst->getContext(), TLI->getValueType(DL, Dst)) ==
1042           TargetLowering::TypeSplitVector;
1043       if ((SplitSrc || SplitDst) && SrcVTy->getElementCount().isVector() &&
1044           DstVTy->getElementCount().isVector()) {
1045         Type *SplitDstTy = VectorType::getHalfElementsVectorType(DstVTy);
1046         Type *SplitSrcTy = VectorType::getHalfElementsVectorType(SrcVTy);
1047         T *TTI = static_cast<T *>(this);
1048         // If both types need to be split then the split is free.
1049         InstructionCost SplitCost =
1050             (!SplitSrc || !SplitDst) ? TTI->getVectorSplitCost() : 0;
1051         return SplitCost +
1052                (2 * TTI->getCastInstrCost(Opcode, SplitDstTy, SplitSrcTy, CCH,
1053                                           CostKind, I));
1054       }
1055 
1056       // Scalarization cost is Invalid; we cannot assume any number of elements.
1057       if (isa<ScalableVectorType>(DstVTy))
1058         return InstructionCost::getInvalid();
1059 
1060       // In other cases where the source or destination type is illegal, assume
1061       // the operation will be scalarized.
1062       unsigned Num = cast<FixedVectorType>(DstVTy)->getNumElements();
1063       InstructionCost Cost = thisT()->getCastInstrCost(
1064           Opcode, Dst->getScalarType(), Src->getScalarType(), CCH, CostKind, I);
1065 
1066       // Return the cost of multiple scalar invocations plus the cost of
1067       // inserting and extracting the values.
1068       return getScalarizationOverhead(DstVTy, true, true) + Num * Cost;
1069     }
1070 
1071     // We already handled vector-to-vector and scalar-to-scalar conversions.
1072     // This is where we handle bitcasts between vectors and scalars. We need
1073     // to assume that the conversion is scalarized in one way or another.
1075     if (Opcode == Instruction::BitCast) {
1076       // Illegal bitcasts are done by storing and loading from a stack slot.
1077       return (SrcVTy ? getScalarizationOverhead(SrcVTy, false, true) : 0) +
1078              (DstVTy ? getScalarizationOverhead(DstVTy, true, false) : 0);
1079     }
1080 
1081     llvm_unreachable("Unhandled cast");
1082   }
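  // For example (a sketch): on a target where the source and destination
  // vector types legalize to same-sized registers, a zext is modelled as one
  // AND-like operation (SrcLT.first) and a sext as SHL + SRA (SrcLT.first * 2).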
1083 
1084   InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst,
1085                                            VectorType *VecTy, unsigned Index) {
1086     return thisT()->getVectorInstrCost(Instruction::ExtractElement, VecTy,
1087                                        Index) +
1088            thisT()->getCastInstrCost(Opcode, Dst, VecTy->getElementType(),
1089                                      TTI::CastContextHint::None,
1090                                      TTI::TCK_RecipThroughput);
1091   }
1092 
1093   InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
1094                                  const Instruction *I = nullptr) {
1095     return BaseT::getCFInstrCost(Opcode, CostKind, I);
1096   }
1097 
1098   InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
1099                                      CmpInst::Predicate VecPred,
1100                                      TTI::TargetCostKind CostKind,
1101                                      const Instruction *I = nullptr) {
1102     const TargetLoweringBase *TLI = getTLI();
1103     int ISD = TLI->InstructionOpcodeToISD(Opcode);
1104     assert(ISD && "Invalid opcode");
1105 
1106     // TODO: Handle other cost kinds.
1107     if (CostKind != TTI::TCK_RecipThroughput)
1108       return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1109                                        I);
1110 
1111     // Selects on vectors are actually vector selects.
1112     if (ISD == ISD::SELECT) {
1113       assert(CondTy && "CondTy must exist");
1114       if (CondTy->isVectorTy())
1115         ISD = ISD::VSELECT;
1116     }
1117     std::pair<InstructionCost, MVT> LT =
1118         TLI->getTypeLegalizationCost(DL, ValTy);
1119 
1120     if (!(ValTy->isVectorTy() && !LT.second.isVector()) &&
1121         !TLI->isOperationExpand(ISD, LT.second)) {
1122       // The operation is legal. Assume it costs 1. Multiply
1123       // by the type-legalization overhead.
1124       return LT.first * 1;
1125     }
1126 
1127     // Otherwise, assume that the compare or select is scalarized.
1128     // TODO: If one of the types get legalized by splitting, handle this
1129     // similarly to what getCastInstrCost() does.
1130     if (auto *ValVTy = dyn_cast<VectorType>(ValTy)) {
1131       if (isa<ScalableVectorType>(ValTy))
1132         return InstructionCost::getInvalid();
1133 
1134       unsigned Num = cast<FixedVectorType>(ValVTy)->getNumElements();
1135       if (CondTy)
1136         CondTy = CondTy->getScalarType();
1137       InstructionCost Cost = thisT()->getCmpSelInstrCost(
1138           Opcode, ValVTy->getScalarType(), CondTy, VecPred, CostKind, I);
1139 
1140       // Return the cost of multiple scalar invocations plus the cost of
1141       // inserting and extracting the values.
1142       return getScalarizationOverhead(ValVTy, true, false) + Num * Cost;
1143     }
1144 
1145     // Unknown scalar opcode.
1146     return 1;
1147   }
1148 
1149   InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
1150                                      unsigned Index) {
1151     std::pair<InstructionCost, MVT> LT =
1152         getTLI()->getTypeLegalizationCost(DL, Val->getScalarType());
1153 
1154     return LT.first;
1155   }
1156 
1157   InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
1158                                             int VF,
1159                                             const APInt &DemandedDstElts,
1160                                             TTI::TargetCostKind CostKind) {
1161     assert(DemandedDstElts.getBitWidth() == (unsigned)VF * ReplicationFactor &&
1162            "Unexpected size of DemandedDstElts.");
1163 
1164     InstructionCost Cost;
1165 
1166     auto *SrcVT = FixedVectorType::get(EltTy, VF);
1167     auto *ReplicatedVT = FixedVectorType::get(EltTy, VF * ReplicationFactor);
1168 
1169     // The Mask shuffling cost is the cost of extracting all the elements of
1170     // the Mask and inserting each of them Factor times into the wide vector:
1171     //
1172     // E.g. an interleaved group with factor 3:
1173     //    %mask = icmp ult <8 x i32> %vec1, %vec2
1174     //    %interleaved.mask = shufflevector <8 x i1> %mask, <8 x i1> undef,
1175     //        <24 x i32> <0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7>
1176     // The cost is estimated as extracting all mask elements from the <8xi1>
1177     // mask vector and inserting them factor times into the <24xi1> shuffled
1178     // mask vector.
1179     APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedDstElts, VF);
1180     Cost += thisT()->getScalarizationOverhead(SrcVT, DemandedSrcElts,
1181                                               /*Insert*/ false,
1182                                               /*Extract*/ true);
1183     Cost +=
1184         thisT()->getScalarizationOverhead(ReplicatedVT, DemandedDstElts,
1185                                           /*Insert*/ true, /*Extract*/ false);
1186 
1187     return Cost;
1188   }
1189 
1190   InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src,
1191                                   MaybeAlign Alignment, unsigned AddressSpace,
1192                                   TTI::TargetCostKind CostKind,
1193                                   const Instruction *I = nullptr) {
1194     assert(!Src->isVoidTy() && "Invalid type");
1195     // Assume types, such as structs, are expensive.
1196     if (getTLI()->getValueType(DL, Src, true) == MVT::Other)
1197       return 4;
1198     std::pair<InstructionCost, MVT> LT =
1199         getTLI()->getTypeLegalizationCost(DL, Src);
1200 
1201     // Assume that all loads of legal types cost 1.
1202     InstructionCost Cost = LT.first;
1203     if (CostKind != TTI::TCK_RecipThroughput)
1204       return Cost;
1205 
1206     const DataLayout &DL = this->getDataLayout();
1207     if (Src->isVectorTy() &&
1208         // In practice it's not currently possible to have a change in lane
1209         // length for extending loads or truncating stores so both types should
1210         // have the same scalable property.
1211         TypeSize::isKnownLT(DL.getTypeStoreSizeInBits(Src),
1212                             LT.second.getSizeInBits())) {
1213       // This is a vector load that legalizes to a larger type than the vector
1214       // itself. Unless the corresponding extending load or truncating store is
1215       // legal, this will be scalarized.
1216       TargetLowering::LegalizeAction LA = TargetLowering::Expand;
1217       EVT MemVT = getTLI()->getValueType(DL, Src);
1218       if (Opcode == Instruction::Store)
1219         LA = getTLI()->getTruncStoreAction(LT.second, MemVT);
1220       else
1221         LA = getTLI()->getLoadExtAction(ISD::EXTLOAD, LT.second, MemVT);
1222 
1223       if (LA != TargetLowering::Legal && LA != TargetLowering::Custom) {
1224         // This is a vector load/store for some illegal type that is scalarized.
1225         // We must account for the cost of building or decomposing the vector.
1226         Cost += getScalarizationOverhead(cast<VectorType>(Src),
1227                                          Opcode != Instruction::Store,
1228                                          Opcode == Instruction::Store);
1229       }
1230     }
1231 
1232     return Cost;
1233   }
1234 
1235   InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy,
1236                                         Align Alignment, unsigned AddressSpace,
1237                                         TTI::TargetCostKind CostKind) {
1238     return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, true, false,
1239                                        CostKind);
1240   }
1241 
1242   InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
1243                                          const Value *Ptr, bool VariableMask,
1244                                          Align Alignment,
1245                                          TTI::TargetCostKind CostKind,
1246                                          const Instruction *I = nullptr) {
1247     return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, VariableMask,
1248                                        true, CostKind);
1249   }
1250 
1251   InstructionCost getInterleavedMemoryOpCost(
1252       unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1253       Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1254       bool UseMaskForCond = false, bool UseMaskForGaps = false) {
1255 
1256     // We cannot scalarize scalable vectors, so return Invalid.
1257     if (isa<ScalableVectorType>(VecTy))
1258       return InstructionCost::getInvalid();
1259 
1260     auto *VT = cast<FixedVectorType>(VecTy);
1261 
1262     unsigned NumElts = VT->getNumElements();
1263     assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");
1264 
1265     unsigned NumSubElts = NumElts / Factor;
1266     auto *SubVT = FixedVectorType::get(VT->getElementType(), NumSubElts);
1267 
1268     // First, the cost of the load/store operation.
1269     InstructionCost Cost;
1270     if (UseMaskForCond || UseMaskForGaps)
1271       Cost = thisT()->getMaskedMemoryOpCost(Opcode, VecTy, Alignment,
1272                                             AddressSpace, CostKind);
1273     else
1274       Cost = thisT()->getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace,
1275                                       CostKind);
1276 
1277     // Legalize the vector type, and get the legalized and unlegalized type
1278     // sizes.
1279     MVT VecTyLT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
1280     unsigned VecTySize = thisT()->getDataLayout().getTypeStoreSize(VecTy);
1281     unsigned VecTyLTSize = VecTyLT.getStoreSize();
1282 
1283     // Scale the cost of the memory operation by the fraction of legalized
1284     // instructions that will actually be used. We shouldn't account for the
1285     // cost of dead instructions since they will be removed.
1286     //
1287     // E.g., An interleaved load of factor 8:
1288     //       %vec = load <16 x i64>, <16 x i64>* %ptr
1289     //       %v0 = shufflevector %vec, undef, <0, 8>
1290     //
1291     // If <16 x i64> is legalized to 8 v2i64 loads, only 2 of the loads will be
1292     // used (those corresponding to elements [0:1] and [8:9] of the unlegalized
1293     // type). The other loads are unused.
1294     //
1295     // TODO: Note that legalization can turn masked loads/stores into unmasked
1296     // (legalized) loads/stores. This can be reflected in the cost.
1297     if (Cost.isValid() && VecTySize > VecTyLTSize) {
1298       // The number of loads of a legal type it will take to represent a load
1299       // of the unlegalized vector type.
1300       unsigned NumLegalInsts = divideCeil(VecTySize, VecTyLTSize);
1301 
1302       // The number of elements of the unlegalized type that correspond to a
1303       // single legal instruction.
1304       unsigned NumEltsPerLegalInst = divideCeil(NumElts, NumLegalInsts);
1305 
1306       // Determine which legal instructions will be used.
1307       BitVector UsedInsts(NumLegalInsts, false);
1308       for (unsigned Index : Indices)
1309         for (unsigned Elt = 0; Elt < NumSubElts; ++Elt)
1310           UsedInsts.set((Index + Elt * Factor) / NumEltsPerLegalInst);
1311 
1312       // Scale the cost of the load by the fraction of legal instructions that
1313       // will be used.
1314       Cost = divideCeil(UsedInsts.count() * *Cost.getValue(), NumLegalInsts);
1315     }
1316 
    // Then add the cost of the interleave operation.
1318     assert(Indices.size() <= Factor &&
1319            "Interleaved memory op has too many members");
1320 
1321     const APInt DemandedAllSubElts = APInt::getAllOnes(NumSubElts);
1322     const APInt DemandedAllResultElts = APInt::getAllOnes(NumElts);
1323 
1324     APInt DemandedLoadStoreElts = APInt::getZero(NumElts);
1325     for (unsigned Index : Indices) {
1326       assert(Index < Factor && "Invalid index for interleaved memory op");
1327       for (unsigned Elm = 0; Elm < NumSubElts; Elm++)
1328         DemandedLoadStoreElts.setBit(Index + Elm * Factor);
1329     }
1330 
1331     if (Opcode == Instruction::Load) {
      // The interleave cost is similar to extracting the sub-vectors' elements
      // from the wide vector and inserting them into the sub-vectors.
1334       //
1335       // E.g. An interleaved load of factor 2 (with one member of index 0):
1336       //      %vec = load <8 x i32>, <8 x i32>* %ptr
1337       //      %v0 = shuffle %vec, undef, <0, 2, 4, 6>         ; Index 0
      // The cost is estimated as extracting the elements at indices 0, 2, 4
      // and 6 from the <8 x i32> vector and inserting them into a <4 x i32>
      // vector.
1340       InstructionCost InsSubCost =
1341           thisT()->getScalarizationOverhead(SubVT, DemandedAllSubElts,
1342                                             /*Insert*/ true, /*Extract*/ false);
1343       Cost += Indices.size() * InsSubCost;
1344       Cost +=
1345           thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts,
1346                                             /*Insert*/ false, /*Extract*/ true);
1347     } else {
      // The interleave cost is extracting the elements from the sub-vectors
      // and inserting them into the wide vector.
1350       //
1351       // E.g. An interleaved store of factor 3 with 2 members at indices 0,1:
1352       // (using VF=4):
1353       //    %v0_v1 = shuffle %v0, %v1, <0,4,undef,1,5,undef,2,6,undef,3,7,undef>
1354       //    %gaps.mask = <true, true, false, true, true, false,
1355       //                  true, true, false, true, true, false>
1356       //    call llvm.masked.store <12 x i32> %v0_v1, <12 x i32>* %ptr,
1357       //                           i32 Align, <12 x i1> %gaps.mask
      // The cost is estimated as extracting all elements (of the actual
      // members, excluding gaps) from both <4 x i32> vectors and inserting
      // them into the <12 x i32> vector.
1361       InstructionCost ExtSubCost =
1362           thisT()->getScalarizationOverhead(SubVT, DemandedAllSubElts,
1363                                             /*Insert*/ false, /*Extract*/ true);
1364       Cost += ExtSubCost * Indices.size();
1365       Cost += thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts,
1366                                                 /*Insert*/ true,
1367                                                 /*Extract*/ false);
1368     }
1369 
1370     if (!UseMaskForCond)
1371       return Cost;
1372 
1373     Type *I8Type = Type::getInt8Ty(VT->getContext());
1374 
1375     Cost += thisT()->getReplicationShuffleCost(
1376         I8Type, Factor, NumSubElts,
1377         UseMaskForGaps ? DemandedLoadStoreElts : DemandedAllResultElts,
1378         CostKind);
1379 
    // The Gaps mask is invariant and created outside the loop; therefore the
    // cost of creating it is not accounted for here. However, if we have both
    // a MaskForGaps and some other mask that guards the execution of the
    // memory access, we need to account for the cost of And-ing the two masks
    // inside the loop.
1385     if (UseMaskForGaps) {
1386       auto *MaskVT = FixedVectorType::get(I8Type, NumElts);
1387       Cost += thisT()->getArithmeticInstrCost(BinaryOperator::And, MaskVT,
1388                                               CostKind);
1389     }
1390 
1391     return Cost;
1392   }
1393 
1394   /// Get intrinsic cost based on arguments.
1395   InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1396                                         TTI::TargetCostKind CostKind) {
1397     // Check for generically free intrinsics.
1398     if (BaseT::getIntrinsicInstrCost(ICA, CostKind) == 0)
1399       return 0;
1400 
1401     // Assume that target intrinsics are cheap.
1402     Intrinsic::ID IID = ICA.getID();
1403     if (Function::isTargetIntrinsic(IID))
1404       return TargetTransformInfo::TCC_Basic;
1405 
1406     if (ICA.isTypeBasedOnly())
1407       return getTypeBasedIntrinsicInstrCost(ICA, CostKind);
1408 
1409     Type *RetTy = ICA.getReturnType();
1410 
1411     ElementCount RetVF =
1412         (RetTy->isVectorTy() ? cast<VectorType>(RetTy)->getElementCount()
1413                              : ElementCount::getFixed(1));
1414     const IntrinsicInst *I = ICA.getInst();
1415     const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
1416     FastMathFlags FMF = ICA.getFlags();
1417     switch (IID) {
1418     default:
1419       break;
1420 
1421     case Intrinsic::powi:
1422       if (auto *RHSC = dyn_cast<ConstantInt>(Args[1])) {
1423         bool ShouldOptForSize = I->getParent()->getParent()->hasOptSize();
1424         if (getTLI()->isBeneficialToExpandPowI(RHSC->getSExtValue(),
1425                                                ShouldOptForSize)) {
1426           // The cost is modeled on the expansion performed by ExpandPowI in
1427           // SelectionDAGBuilder.
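          // E.g. for powi(x, 13) the exponent is 0b1101, so ActiveBits = 4 and
          // PopCount = 3, giving 4 + 3 - 2 = 5 fmuls for the repeated-squaring
          // expansion; a negative exponent adds one fdiv for the reciprocal.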
1428           APInt Exponent = RHSC->getValue().abs();
1429           unsigned ActiveBits = Exponent.getActiveBits();
1430           unsigned PopCount = Exponent.countPopulation();
1431           InstructionCost Cost = (ActiveBits + PopCount - 2) *
1432                                  thisT()->getArithmeticInstrCost(
1433                                      Instruction::FMul, RetTy, CostKind);
1434           if (RHSC->getSExtValue() < 0)
1435             Cost += thisT()->getArithmeticInstrCost(Instruction::FDiv, RetTy,
1436                                                     CostKind);
1437           return Cost;
1438         }
1439       }
1440       break;
1441     case Intrinsic::cttz:
1442       // FIXME: If necessary, this should go in target-specific overrides.
1443       if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCttz())
1444         return TargetTransformInfo::TCC_Basic;
1445       break;
1446 
1447     case Intrinsic::ctlz:
1448       // FIXME: If necessary, this should go in target-specific overrides.
1449       if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCtlz())
1450         return TargetTransformInfo::TCC_Basic;
1451       break;
1452 
1453     case Intrinsic::memcpy:
1454       return thisT()->getMemcpyCost(ICA.getInst());
1455 
1456     case Intrinsic::masked_scatter: {
1457       const Value *Mask = Args[3];
1458       bool VarMask = !isa<Constant>(Mask);
1459       Align Alignment = cast<ConstantInt>(Args[2])->getAlignValue();
1460       return thisT()->getGatherScatterOpCost(Instruction::Store,
1461                                              ICA.getArgTypes()[0], Args[1],
1462                                              VarMask, Alignment, CostKind, I);
1463     }
1464     case Intrinsic::masked_gather: {
1465       const Value *Mask = Args[2];
1466       bool VarMask = !isa<Constant>(Mask);
1467       Align Alignment = cast<ConstantInt>(Args[1])->getAlignValue();
1468       return thisT()->getGatherScatterOpCost(Instruction::Load, RetTy, Args[0],
1469                                              VarMask, Alignment, CostKind, I);
1470     }
1471     case Intrinsic::experimental_stepvector: {
1472       if (isa<ScalableVectorType>(RetTy))
1473         return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1474       // The cost of materialising a constant integer vector.
1475       return TargetTransformInfo::TCC_Basic;
1476     }
1477     case Intrinsic::vector_extract: {
1478       // FIXME: Handle case where a scalable vector is extracted from a scalable
1479       // vector
1480       if (isa<ScalableVectorType>(RetTy))
1481         return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1482       unsigned Index = cast<ConstantInt>(Args[1])->getZExtValue();
1483       return thisT()->getShuffleCost(TTI::SK_ExtractSubvector,
1484                                      cast<VectorType>(Args[0]->getType()), None,
1485                                      Index, cast<VectorType>(RetTy));
1486     }
1487     case Intrinsic::vector_insert: {
1488       // FIXME: Handle case where a scalable vector is inserted into a scalable
1489       // vector
1490       if (isa<ScalableVectorType>(Args[1]->getType()))
1491         return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1492       unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue();
1493       return thisT()->getShuffleCost(
1494           TTI::SK_InsertSubvector, cast<VectorType>(Args[0]->getType()), None,
1495           Index, cast<VectorType>(Args[1]->getType()));
1496     }
1497     case Intrinsic::experimental_vector_reverse: {
1498       return thisT()->getShuffleCost(TTI::SK_Reverse,
1499                                      cast<VectorType>(Args[0]->getType()), None,
1500                                      0, cast<VectorType>(RetTy));
1501     }
1502     case Intrinsic::experimental_vector_splice: {
1503       unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue();
1504       return thisT()->getShuffleCost(TTI::SK_Splice,
1505                                      cast<VectorType>(Args[0]->getType()), None,
1506                                      Index, cast<VectorType>(RetTy));
1507     }
1508     case Intrinsic::vector_reduce_add:
1509     case Intrinsic::vector_reduce_mul:
1510     case Intrinsic::vector_reduce_and:
1511     case Intrinsic::vector_reduce_or:
1512     case Intrinsic::vector_reduce_xor:
1513     case Intrinsic::vector_reduce_smax:
1514     case Intrinsic::vector_reduce_smin:
1515     case Intrinsic::vector_reduce_fmax:
1516     case Intrinsic::vector_reduce_fmin:
1517     case Intrinsic::vector_reduce_umax:
1518     case Intrinsic::vector_reduce_umin: {
1519       IntrinsicCostAttributes Attrs(IID, RetTy, Args[0]->getType(), FMF, I, 1);
1520       return getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1521     }
1522     case Intrinsic::vector_reduce_fadd:
1523     case Intrinsic::vector_reduce_fmul: {
1524       IntrinsicCostAttributes Attrs(
1525           IID, RetTy, {Args[0]->getType(), Args[1]->getType()}, FMF, I, 1);
1526       return getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1527     }
1528     case Intrinsic::fshl:
1529     case Intrinsic::fshr: {
1530       const Value *X = Args[0];
1531       const Value *Y = Args[1];
1532       const Value *Z = Args[2];
1533       TTI::OperandValueProperties OpPropsX, OpPropsY, OpPropsZ, OpPropsBW;
1534       TTI::OperandValueKind OpKindX = TTI::getOperandInfo(X, OpPropsX);
1535       TTI::OperandValueKind OpKindY = TTI::getOperandInfo(Y, OpPropsY);
1536       TTI::OperandValueKind OpKindZ = TTI::getOperandInfo(Z, OpPropsZ);
1537       TTI::OperandValueKind OpKindBW = TTI::OK_UniformConstantValue;
1538       OpPropsBW = isPowerOf2_32(RetTy->getScalarSizeInBits()) ? TTI::OP_PowerOf2
1539                                                               : TTI::OP_None;
1540       // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
1541       // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
1542       InstructionCost Cost = 0;
1543       Cost +=
1544           thisT()->getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind);
1545       Cost +=
1546           thisT()->getArithmeticInstrCost(BinaryOperator::Sub, RetTy, CostKind);
1547       Cost += thisT()->getArithmeticInstrCost(
1548           BinaryOperator::Shl, RetTy, CostKind, OpKindX, OpKindZ, OpPropsX);
1549       Cost += thisT()->getArithmeticInstrCost(
1550           BinaryOperator::LShr, RetTy, CostKind, OpKindY, OpKindZ, OpPropsY);
      // Non-constant shift amounts require a modulo.
1552       if (OpKindZ != TTI::OK_UniformConstantValue &&
1553           OpKindZ != TTI::OK_NonUniformConstantValue)
1554         Cost += thisT()->getArithmeticInstrCost(BinaryOperator::URem, RetTy,
1555                                                 CostKind, OpKindZ, OpKindBW,
1556                                                 OpPropsZ, OpPropsBW);
1557       // For non-rotates (X != Y) we must add shift-by-zero handling costs.
1558       if (X != Y) {
1559         Type *CondTy = RetTy->getWithNewBitWidth(1);
1560         Cost +=
1561             thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
1562                                         CmpInst::ICMP_EQ, CostKind);
1563         Cost +=
1564             thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1565                                         CmpInst::ICMP_EQ, CostKind);
1566       }
1567       return Cost;
1568     }
1569     case Intrinsic::get_active_lane_mask: {
1570       EVT ResVT = getTLI()->getValueType(DL, RetTy, true);
1571       EVT ArgType = getTLI()->getValueType(DL, ICA.getArgTypes()[0], true);
1572 
1573       // If we're not expanding the intrinsic then we assume this is cheap
1574       // to implement.
1575       if (!getTLI()->shouldExpandGetActiveLaneMask(ResVT, ArgType)) {
1576         std::pair<InstructionCost, MVT> LT =
1577             getTLI()->getTypeLegalizationCost(DL, RetTy);
1578         return LT.first;
1579       }
1580 
1581       // Create the expanded types that will be used to calculate the uadd_sat
1582       // operation.
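      // Roughly, the expansion being costed here is
      //   %m = icmp ult (uadd.sat(splat(%base), stepvector)), splat(%n)
      // i.e. one vector uadd_sat plus one unsigned compare.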
1583       Type *ExpRetTy = VectorType::get(
1584           ICA.getArgTypes()[0], cast<VectorType>(RetTy)->getElementCount());
1585       IntrinsicCostAttributes Attrs(Intrinsic::uadd_sat, ExpRetTy, {}, FMF);
1586       InstructionCost Cost =
1587           thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1588       Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, ExpRetTy, RetTy,
1589                                           CmpInst::ICMP_ULT, CostKind);
1590       return Cost;
1591     }
1592     }
1593 
1594     // Assume that we need to scalarize this intrinsic.
1595     // Compute the scalarization overhead based on Args for a vector
1596     // intrinsic.
1597     InstructionCost ScalarizationCost = InstructionCost::getInvalid();
1598     if (RetVF.isVector() && !RetVF.isScalable()) {
1599       ScalarizationCost = 0;
1600       if (!RetTy->isVoidTy())
1601         ScalarizationCost +=
1602             getScalarizationOverhead(cast<VectorType>(RetTy), true, false);
1603       ScalarizationCost +=
1604           getOperandsScalarizationOverhead(Args, ICA.getArgTypes());
1605     }
1606 
1607     IntrinsicCostAttributes Attrs(IID, RetTy, ICA.getArgTypes(), FMF, I,
1608                                   ScalarizationCost);
1609     return thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1610   }
1611 
1612   /// Get intrinsic cost based on argument types.
1613   /// If ScalarizationCostPassed is std::numeric_limits<unsigned>::max(), the
1614   /// cost of scalarizing the arguments and the return value will be computed
1615   /// based on types.
1616   InstructionCost
1617   getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1618                                  TTI::TargetCostKind CostKind) {
1619     Intrinsic::ID IID = ICA.getID();
1620     Type *RetTy = ICA.getReturnType();
1621     const SmallVectorImpl<Type *> &Tys = ICA.getArgTypes();
1622     FastMathFlags FMF = ICA.getFlags();
1623     InstructionCost ScalarizationCostPassed = ICA.getScalarizationCost();
1624     bool SkipScalarizationCost = ICA.skipScalarizationCost();
1625 
1626     VectorType *VecOpTy = nullptr;
1627     if (!Tys.empty()) {
1628       // The vector reduction operand is operand 0 except for fadd/fmul.
1629       // Their operand 0 is a scalar start value, so the vector op is operand 1.
1630       unsigned VecTyIndex = 0;
1631       if (IID == Intrinsic::vector_reduce_fadd ||
1632           IID == Intrinsic::vector_reduce_fmul)
1633         VecTyIndex = 1;
1634       assert(Tys.size() > VecTyIndex && "Unexpected IntrinsicCostAttributes");
1635       VecOpTy = dyn_cast<VectorType>(Tys[VecTyIndex]);
1636     }
1637 
    // Library call cost - make it expensive unless we are optimizing for code
    // size.
1639     unsigned SingleCallCost = CostKind == TTI::TCK_CodeSize ? 1 : 10;
1640     unsigned ISD = 0;
1641     switch (IID) {
1642     default: {
1643       // Scalable vectors cannot be scalarized, so return Invalid.
1644       if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) {
1645             return isa<ScalableVectorType>(Ty);
1646           }))
1647         return InstructionCost::getInvalid();
1648 
1649       // Assume that we need to scalarize this intrinsic.
1650       InstructionCost ScalarizationCost =
1651           SkipScalarizationCost ? ScalarizationCostPassed : 0;
1652       unsigned ScalarCalls = 1;
1653       Type *ScalarRetTy = RetTy;
1654       if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
1655         if (!SkipScalarizationCost)
1656           ScalarizationCost = getScalarizationOverhead(RetVTy, true, false);
1657         ScalarCalls = std::max(ScalarCalls,
1658                                cast<FixedVectorType>(RetVTy)->getNumElements());
1659         ScalarRetTy = RetTy->getScalarType();
1660       }
1661       SmallVector<Type *, 4> ScalarTys;
1662       for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
1663         Type *Ty = Tys[i];
1664         if (auto *VTy = dyn_cast<VectorType>(Ty)) {
1665           if (!SkipScalarizationCost)
1666             ScalarizationCost += getScalarizationOverhead(VTy, false, true);
1667           ScalarCalls = std::max(ScalarCalls,
1668                                  cast<FixedVectorType>(VTy)->getNumElements());
1669           Ty = Ty->getScalarType();
1670         }
1671         ScalarTys.push_back(Ty);
1672       }
1673       if (ScalarCalls == 1)
1674         return 1; // Return cost of a scalar intrinsic. Assume it to be cheap.
1675 
1676       IntrinsicCostAttributes ScalarAttrs(IID, ScalarRetTy, ScalarTys, FMF);
1677       InstructionCost ScalarCost =
1678           thisT()->getIntrinsicInstrCost(ScalarAttrs, CostKind);
1679 
1680       return ScalarCalls * ScalarCost + ScalarizationCost;
1681     }
1682     // Look for intrinsics that can be lowered directly or turned into a scalar
1683     // intrinsic call.
1684     case Intrinsic::sqrt:
1685       ISD = ISD::FSQRT;
1686       break;
1687     case Intrinsic::sin:
1688       ISD = ISD::FSIN;
1689       break;
1690     case Intrinsic::cos:
1691       ISD = ISD::FCOS;
1692       break;
1693     case Intrinsic::exp:
1694       ISD = ISD::FEXP;
1695       break;
1696     case Intrinsic::exp2:
1697       ISD = ISD::FEXP2;
1698       break;
1699     case Intrinsic::log:
1700       ISD = ISD::FLOG;
1701       break;
1702     case Intrinsic::log10:
1703       ISD = ISD::FLOG10;
1704       break;
1705     case Intrinsic::log2:
1706       ISD = ISD::FLOG2;
1707       break;
1708     case Intrinsic::fabs:
1709       ISD = ISD::FABS;
1710       break;
1711     case Intrinsic::canonicalize:
1712       ISD = ISD::FCANONICALIZE;
1713       break;
1714     case Intrinsic::minnum:
1715       ISD = ISD::FMINNUM;
1716       break;
1717     case Intrinsic::maxnum:
1718       ISD = ISD::FMAXNUM;
1719       break;
1720     case Intrinsic::minimum:
1721       ISD = ISD::FMINIMUM;
1722       break;
1723     case Intrinsic::maximum:
1724       ISD = ISD::FMAXIMUM;
1725       break;
1726     case Intrinsic::copysign:
1727       ISD = ISD::FCOPYSIGN;
1728       break;
1729     case Intrinsic::floor:
1730       ISD = ISD::FFLOOR;
1731       break;
1732     case Intrinsic::ceil:
1733       ISD = ISD::FCEIL;
1734       break;
1735     case Intrinsic::trunc:
1736       ISD = ISD::FTRUNC;
1737       break;
1738     case Intrinsic::nearbyint:
1739       ISD = ISD::FNEARBYINT;
1740       break;
1741     case Intrinsic::rint:
1742       ISD = ISD::FRINT;
1743       break;
1744     case Intrinsic::round:
1745       ISD = ISD::FROUND;
1746       break;
1747     case Intrinsic::roundeven:
1748       ISD = ISD::FROUNDEVEN;
1749       break;
1750     case Intrinsic::pow:
1751       ISD = ISD::FPOW;
1752       break;
1753     case Intrinsic::fma:
1754       ISD = ISD::FMA;
1755       break;
1756     case Intrinsic::fmuladd:
1757       ISD = ISD::FMA;
1758       break;
1759     case Intrinsic::experimental_constrained_fmuladd:
1760       ISD = ISD::STRICT_FMA;
1761       break;
1762     // FIXME: We should return 0 whenever getIntrinsicCost == TCC_Free.
1763     case Intrinsic::lifetime_start:
1764     case Intrinsic::lifetime_end:
1765     case Intrinsic::sideeffect:
1766     case Intrinsic::pseudoprobe:
1767     case Intrinsic::arithmetic_fence:
1768       return 0;
1769     case Intrinsic::masked_store: {
1770       Type *Ty = Tys[0];
1771       Align TyAlign = thisT()->DL.getABITypeAlign(Ty);
1772       return thisT()->getMaskedMemoryOpCost(Instruction::Store, Ty, TyAlign, 0,
1773                                             CostKind);
1774     }
1775     case Intrinsic::masked_load: {
1776       Type *Ty = RetTy;
1777       Align TyAlign = thisT()->DL.getABITypeAlign(Ty);
1778       return thisT()->getMaskedMemoryOpCost(Instruction::Load, Ty, TyAlign, 0,
1779                                             CostKind);
1780     }
1781     case Intrinsic::vector_reduce_add:
1782       return thisT()->getArithmeticReductionCost(Instruction::Add, VecOpTy,
1783                                                  None, CostKind);
1784     case Intrinsic::vector_reduce_mul:
1785       return thisT()->getArithmeticReductionCost(Instruction::Mul, VecOpTy,
1786                                                  None, CostKind);
1787     case Intrinsic::vector_reduce_and:
1788       return thisT()->getArithmeticReductionCost(Instruction::And, VecOpTy,
1789                                                  None, CostKind);
1790     case Intrinsic::vector_reduce_or:
1791       return thisT()->getArithmeticReductionCost(Instruction::Or, VecOpTy, None,
1792                                                  CostKind);
1793     case Intrinsic::vector_reduce_xor:
1794       return thisT()->getArithmeticReductionCost(Instruction::Xor, VecOpTy,
1795                                                  None, CostKind);
1796     case Intrinsic::vector_reduce_fadd:
1797       return thisT()->getArithmeticReductionCost(Instruction::FAdd, VecOpTy,
1798                                                  FMF, CostKind);
1799     case Intrinsic::vector_reduce_fmul:
1800       return thisT()->getArithmeticReductionCost(Instruction::FMul, VecOpTy,
1801                                                  FMF, CostKind);
1802     case Intrinsic::vector_reduce_smax:
1803     case Intrinsic::vector_reduce_smin:
1804     case Intrinsic::vector_reduce_fmax:
1805     case Intrinsic::vector_reduce_fmin:
1806       return thisT()->getMinMaxReductionCost(
1807           VecOpTy, cast<VectorType>(CmpInst::makeCmpResultType(VecOpTy)),
1808           /*IsUnsigned=*/false, CostKind);
1809     case Intrinsic::vector_reduce_umax:
1810     case Intrinsic::vector_reduce_umin:
1811       return thisT()->getMinMaxReductionCost(
1812           VecOpTy, cast<VectorType>(CmpInst::makeCmpResultType(VecOpTy)),
1813           /*IsUnsigned=*/true, CostKind);
1814     case Intrinsic::abs: {
1815       // abs(X) = select(icmp(X,0),X,sub(0,X))
1816       Type *CondTy = RetTy->getWithNewBitWidth(1);
1817       CmpInst::Predicate Pred = CmpInst::ICMP_SGT;
1818       InstructionCost Cost = 0;
1819       Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
1820                                           Pred, CostKind);
1821       Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1822                                           Pred, CostKind);
1823       // TODO: Should we add an OperandValueProperties::OP_Zero property?
1824       Cost += thisT()->getArithmeticInstrCost(
1825           BinaryOperator::Sub, RetTy, CostKind, TTI::OK_UniformConstantValue);
1826       return Cost;
1827     }
1828     case Intrinsic::smax:
1829     case Intrinsic::smin:
1830     case Intrinsic::umax:
1831     case Intrinsic::umin: {
1832       // minmax(X,Y) = select(icmp(X,Y),X,Y)
1833       Type *CondTy = RetTy->getWithNewBitWidth(1);
1834       bool IsUnsigned = IID == Intrinsic::umax || IID == Intrinsic::umin;
1835       CmpInst::Predicate Pred =
1836           IsUnsigned ? CmpInst::ICMP_UGT : CmpInst::ICMP_SGT;
1837       InstructionCost Cost = 0;
1838       Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
1839                                           Pred, CostKind);
1840       Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1841                                           Pred, CostKind);
1842       return Cost;
1843     }
1844     case Intrinsic::sadd_sat:
1845     case Intrinsic::ssub_sat: {
1846       Type *CondTy = RetTy->getWithNewBitWidth(1);
1847 
1848       Type *OpTy = StructType::create({RetTy, CondTy});
1849       Intrinsic::ID OverflowOp = IID == Intrinsic::sadd_sat
1850                                      ? Intrinsic::sadd_with_overflow
1851                                      : Intrinsic::ssub_with_overflow;
1852       CmpInst::Predicate Pred = CmpInst::ICMP_SGT;
1853 
1854       // SatMax -> Overflow && SumDiff < 0
1855       // SatMin -> Overflow && SumDiff >= 0
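      //
      // i.e. roughly:
      //   {SumDiff, Ovf} = sadd/ssub.with.overflow(X, Y)
      //   Res = Ovf ? (SumDiff < 0 ? SatMax : SatMin) : SumDiff
      // which we cost as the overflow intrinsic plus one icmp and two selects.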
1856       InstructionCost Cost = 0;
1857       IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
1858                                     nullptr, ScalarizationCostPassed);
1859       Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
1860       Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
1861                                           Pred, CostKind);
1862       Cost += 2 * thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy,
1863                                               CondTy, Pred, CostKind);
1864       return Cost;
1865     }
1866     case Intrinsic::uadd_sat:
1867     case Intrinsic::usub_sat: {
1868       Type *CondTy = RetTy->getWithNewBitWidth(1);
1869 
1870       Type *OpTy = StructType::create({RetTy, CondTy});
1871       Intrinsic::ID OverflowOp = IID == Intrinsic::uadd_sat
1872                                      ? Intrinsic::uadd_with_overflow
1873                                      : Intrinsic::usub_with_overflow;
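      // Roughly:
      //   {Sum, Ovf} = uadd/usub.with.overflow(X, Y)
      //   Res = Ovf ? SatVal : Sum   ; SatVal is UINT_MAX (uadd) or 0 (usub)
      // which we cost as the overflow intrinsic plus a single select.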
1874 
1875       InstructionCost Cost = 0;
1876       IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
1877                                     nullptr, ScalarizationCostPassed);
1878       Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
1879       Cost +=
1880           thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1881                                       CmpInst::BAD_ICMP_PREDICATE, CostKind);
1882       return Cost;
1883     }
1884     case Intrinsic::smul_fix:
1885     case Intrinsic::umul_fix: {
1886       unsigned ExtSize = RetTy->getScalarSizeInBits() * 2;
1887       Type *ExtTy = RetTy->getWithNewBitWidth(ExtSize);
1888 
1889       unsigned ExtOp =
1890           IID == Intrinsic::smul_fix ? Instruction::SExt : Instruction::ZExt;
1891       TTI::CastContextHint CCH = TTI::CastContextHint::None;
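      // Roughly modeled on the generic expansion: sign/zero-extend both
      // operands to 2*BW, multiply, split the product into low and high halves
      // (two truncs), and recombine them with lshr/shl/or to apply the
      // fixed-point scale.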
1892 
1893       InstructionCost Cost = 0;
1894       Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, RetTy, CCH, CostKind);
1895       Cost +=
1896           thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
1897       Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, RetTy, ExtTy,
1898                                             CCH, CostKind);
1899       Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, RetTy,
1900                                               CostKind, TTI::OK_AnyValue,
1901                                               TTI::OK_UniformConstantValue);
1902       Cost += thisT()->getArithmeticInstrCost(Instruction::Shl, RetTy, CostKind,
1903                                               TTI::OK_AnyValue,
1904                                               TTI::OK_UniformConstantValue);
1905       Cost += thisT()->getArithmeticInstrCost(Instruction::Or, RetTy, CostKind);
1906       return Cost;
1907     }
1908     case Intrinsic::sadd_with_overflow:
1909     case Intrinsic::ssub_with_overflow: {
1910       Type *SumTy = RetTy->getContainedType(0);
1911       Type *OverflowTy = RetTy->getContainedType(1);
1912       unsigned Opcode = IID == Intrinsic::sadd_with_overflow
1913                             ? BinaryOperator::Add
1914                             : BinaryOperator::Sub;
1915 
1916       //   Add:
1917       //   Overflow -> (Result < LHS) ^ (RHS < 0)
1918       //   Sub:
1919       //   Overflow -> (Result < LHS) ^ (RHS > 0)
1920       InstructionCost Cost = 0;
1921       Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind);
1922       Cost += 2 * thisT()->getCmpSelInstrCost(
1923                       Instruction::ICmp, SumTy, OverflowTy,
1924                       CmpInst::ICMP_SGT, CostKind);
1925       Cost += thisT()->getArithmeticInstrCost(BinaryOperator::Xor, OverflowTy,
1926                                               CostKind);
1927       return Cost;
1928     }
1929     case Intrinsic::uadd_with_overflow:
1930     case Intrinsic::usub_with_overflow: {
1931       Type *SumTy = RetTy->getContainedType(0);
1932       Type *OverflowTy = RetTy->getContainedType(1);
1933       unsigned Opcode = IID == Intrinsic::uadd_with_overflow
1934                             ? BinaryOperator::Add
1935                             : BinaryOperator::Sub;
1936       CmpInst::Predicate Pred = IID == Intrinsic::uadd_with_overflow
1937                                     ? CmpInst::ICMP_ULT
1938                                     : CmpInst::ICMP_UGT;
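      //   Add:
      //   Overflow -> (Result < LHS)
      //   Sub:
      //   Overflow -> (Result > LHS)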
1939 
1940       InstructionCost Cost = 0;
1941       Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind);
1942       Cost +=
1943           thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SumTy, OverflowTy,
1944                                       Pred, CostKind);
1945       return Cost;
1946     }
1947     case Intrinsic::smul_with_overflow:
1948     case Intrinsic::umul_with_overflow: {
1949       Type *MulTy = RetTy->getContainedType(0);
1950       Type *OverflowTy = RetTy->getContainedType(1);
1951       unsigned ExtSize = MulTy->getScalarSizeInBits() * 2;
1952       Type *ExtTy = MulTy->getWithNewBitWidth(ExtSize);
1953       bool IsSigned = IID == Intrinsic::smul_with_overflow;
1954 
1955       unsigned ExtOp = IsSigned ? Instruction::SExt : Instruction::ZExt;
1956       TTI::CastContextHint CCH = TTI::CastContextHint::None;
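      // Roughly:
      //   Prod = mul (ext X), (ext Y)        ; in a type twice as wide
      //   Hi   = trunc (Prod >> BW)
      //   Overflow -> Hi != (signed ? (trunc Prod) >> (BW - 1) : 0)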
1957 
1958       InstructionCost Cost = 0;
1959       Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, MulTy, CCH, CostKind);
1960       Cost +=
1961           thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
1962       Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, MulTy, ExtTy,
1963                                             CCH, CostKind);
1964       Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, ExtTy,
1965                                               CostKind, TTI::OK_AnyValue,
1966                                               TTI::OK_UniformConstantValue);
1967 
1968       if (IsSigned)
1969         Cost += thisT()->getArithmeticInstrCost(Instruction::AShr, MulTy,
1970                                                 CostKind, TTI::OK_AnyValue,
1971                                                 TTI::OK_UniformConstantValue);
1972 
1973       Cost += thisT()->getCmpSelInstrCost(
1974           BinaryOperator::ICmp, MulTy, OverflowTy, CmpInst::ICMP_NE, CostKind);
1975       return Cost;
1976     }
1977     case Intrinsic::fptosi_sat:
1978     case Intrinsic::fptoui_sat: {
1979       if (Tys.empty())
1980         break;
1981       Type *FromTy = Tys[0];
1982       bool IsSigned = IID == Intrinsic::fptosi_sat;
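      // Modeled on clamping the input to the representable range with
      // minnum/maxnum, converting, and (for the signed case) zeroing the
      // result for NaN inputs:
      //   C = maxnum(minnum(X, MAX_FP), MIN_FP)
      //   R = fptosi/fptoui C
      //   R = isnan(X) ? 0 : R               ; signed only (fcmp uno + select)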
1983 
1984       InstructionCost Cost = 0;
1985       IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FromTy,
1986                                      {FromTy, FromTy});
1987       Cost += thisT()->getIntrinsicInstrCost(Attrs1, CostKind);
1988       IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FromTy,
1989                                      {FromTy, FromTy});
1990       Cost += thisT()->getIntrinsicInstrCost(Attrs2, CostKind);
1991       Cost += thisT()->getCastInstrCost(
1992           IsSigned ? Instruction::FPToSI : Instruction::FPToUI, RetTy, FromTy,
1993           TTI::CastContextHint::None, CostKind);
1994       if (IsSigned) {
1995         Type *CondTy = RetTy->getWithNewBitWidth(1);
1996         Cost += thisT()->getCmpSelInstrCost(
1997             BinaryOperator::FCmp, FromTy, CondTy, CmpInst::FCMP_UNO, CostKind);
1998         Cost += thisT()->getCmpSelInstrCost(
1999             BinaryOperator::Select, RetTy, CondTy, CmpInst::FCMP_UNO, CostKind);
2000       }
2001       return Cost;
2002     }
2003     case Intrinsic::ctpop:
2004       ISD = ISD::CTPOP;
      // If the operation has to be expanded during legalization, use
      // TCC_Expensive. This is cheaper than a library call but still not a
      // cheap instruction.
2007       SingleCallCost = TargetTransformInfo::TCC_Expensive;
2008       break;
2009     case Intrinsic::ctlz:
2010       ISD = ISD::CTLZ;
2011       break;
2012     case Intrinsic::cttz:
2013       ISD = ISD::CTTZ;
2014       break;
2015     case Intrinsic::bswap:
2016       ISD = ISD::BSWAP;
2017       break;
2018     case Intrinsic::bitreverse:
2019       ISD = ISD::BITREVERSE;
2020       break;
2021     }
2022 
2023     const TargetLoweringBase *TLI = getTLI();
2024     std::pair<InstructionCost, MVT> LT =
2025         TLI->getTypeLegalizationCost(DL, RetTy);
2026 
2027     if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
2028       if (IID == Intrinsic::fabs && LT.second.isFloatingPoint() &&
2029           TLI->isFAbsFree(LT.second)) {
2030         return 0;
2031       }
2032 
2033       // The operation is legal. Assume it costs 1.
2034       // If the type is split to multiple registers, assume that there is some
2035       // overhead to this.
2036       // TODO: Once we have extract/insert subvector cost we need to use them.
      if (LT.first > 1)
        return (LT.first * 2);
      return LT.first;
2041     } else if (!TLI->isOperationExpand(ISD, LT.second)) {
2042       // If the operation is custom lowered then assume
2043       // that the code is twice as expensive.
2044       return (LT.first * 2);
2045     }
2046 
2047     // If we can't lower fmuladd into an FMA estimate the cost as a floating
2048     // point mul followed by an add.
2049     if (IID == Intrinsic::fmuladd)
2050       return thisT()->getArithmeticInstrCost(BinaryOperator::FMul, RetTy,
2051                                              CostKind) +
2052              thisT()->getArithmeticInstrCost(BinaryOperator::FAdd, RetTy,
2053                                              CostKind);
2054     if (IID == Intrinsic::experimental_constrained_fmuladd) {
2055       IntrinsicCostAttributes FMulAttrs(
2056         Intrinsic::experimental_constrained_fmul, RetTy, Tys);
2057       IntrinsicCostAttributes FAddAttrs(
2058         Intrinsic::experimental_constrained_fadd, RetTy, Tys);
2059       return thisT()->getIntrinsicInstrCost(FMulAttrs, CostKind) +
2060              thisT()->getIntrinsicInstrCost(FAddAttrs, CostKind);
2061     }
2062 
2063     // Else, assume that we need to scalarize this intrinsic. For math builtins
2064     // this will emit a costly libcall, adding call overhead and spills. Make it
2065     // very expensive.
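    //
    // E.g. a <4 x float> llvm.sin with no vector lowering is costed as four
    // scalar sin calls plus the overhead of extracting the four operand lanes
    // and inserting the four results.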
2066     if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
2067       // Scalable vectors cannot be scalarized, so return Invalid.
2068       if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) {
2069             return isa<ScalableVectorType>(Ty);
2070           }))
2071         return InstructionCost::getInvalid();
2072 
2073       InstructionCost ScalarizationCost =
2074           SkipScalarizationCost ? ScalarizationCostPassed
2075                                 : getScalarizationOverhead(RetVTy, true, false);
2076 
2077       unsigned ScalarCalls = cast<FixedVectorType>(RetVTy)->getNumElements();
2078       SmallVector<Type *, 4> ScalarTys;
2079       for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
2080         Type *Ty = Tys[i];
2081         if (Ty->isVectorTy())
2082           Ty = Ty->getScalarType();
2083         ScalarTys.push_back(Ty);
2084       }
2085       IntrinsicCostAttributes Attrs(IID, RetTy->getScalarType(), ScalarTys, FMF);
2086       InstructionCost ScalarCost =
2087           thisT()->getIntrinsicInstrCost(Attrs, CostKind);
2088       for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
2089         if (auto *VTy = dyn_cast<VectorType>(Tys[i])) {
2090           if (!ICA.skipScalarizationCost())
2091             ScalarizationCost += getScalarizationOverhead(VTy, false, true);
2092           ScalarCalls = std::max(ScalarCalls,
2093                                  cast<FixedVectorType>(VTy)->getNumElements());
2094         }
2095       }
2096       return ScalarCalls * ScalarCost + ScalarizationCost;
2097     }
2098 
2099     // This is going to be turned into a library call, make it expensive.
2100     return SingleCallCost;
2101   }
2102 
2103   /// Compute a cost of the given call instruction.
2104   ///
2105   /// Compute the cost of calling function F with return type RetTy and
2106   /// argument types Tys. F might be nullptr, in this case the cost of an
2107   /// arbitrary call with the specified signature will be returned.
  /// This is used, for instance, when we estimate the cost of calling a
  /// vector counterpart of the given function.
2110   /// \param F Called function, might be nullptr.
  /// \param RetTy Return value type.
2112   /// \param Tys Argument types.
2113   /// \returns The cost of Call instruction.
2114   InstructionCost getCallInstrCost(Function *F, Type *RetTy,
2115                                    ArrayRef<Type *> Tys,
2116                                    TTI::TargetCostKind CostKind) {
2117     return 10;
2118   }
2119 
2120   unsigned getNumberOfParts(Type *Tp) {
2121     std::pair<InstructionCost, MVT> LT =
2122         getTLI()->getTypeLegalizationCost(DL, Tp);
2123     return LT.first.isValid() ? *LT.first.getValue() : 0;
2124   }
2125 
2126   InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *,
2127                                             const SCEV *) {
2128     return 0;
2129   }
2130 
2131   /// Try to calculate arithmetic and shuffle op costs for reduction intrinsics.
  /// We're assuming that the reduction operation is performed in the
  /// following way:
  ///
  /// %val1 = shufflevector<n x t> %val, <n x t> %undef,
  /// <n x i32> <i32 n/2, i32 n/2 + 1, ..., i32 n-1, i32 undef, ..., i32 undef>
  ///            \----------------v-------------/  \----------v------------/
  ///                            n/2 elements               n/2 elements
  /// %red1 = op <n x t> %val, <n x t> %val1
  /// After this operation we have a vector %red1 where only the first n/2
  /// elements are meaningful; the second n/2 elements are undefined and can
  /// be dropped. All subsequent operations are actually working with the
  /// vector of length n/2, not n, though the real vector length is still n.
  /// %val2 = shufflevector<n x t> %red1, <n x t> %undef,
  /// <n x i32> <i32 n/4, i32 n/4 + 1, ..., i32 n/2-1, i32 undef, ..., i32 undef>
  ///            \----------------v-------------/  \----------v------------/
  ///                            n/4 elements               3*n/4 elements
  /// %red2 = op <n x t> %red1, <n x t> %val2  - working with the vector of
  /// length n/2; the resulting vector has length n/4, etc.
2149   ///
2150   /// The cost model should take into account that the actual length of the
2151   /// vector is reduced on each iteration.
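  ///
  /// E.g. (assuming a target whose widest legal vector is <4 x i32>) an add
  /// reduction of <8 x i32> is modeled as one extract-subvector shuffle plus
  /// one <4 x i32> add to split the vector, then two levels of in-register
  /// shuffle plus add, and a final extractelement of lane 0.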
2152   InstructionCost getTreeReductionCost(unsigned Opcode, VectorType *Ty,
2153                                        TTI::TargetCostKind CostKind) {
2154     // Targets must implement a default value for the scalable case, since
2155     // we don't know how many lanes the vector has.
2156     if (isa<ScalableVectorType>(Ty))
2157       return InstructionCost::getInvalid();
2158 
2159     Type *ScalarTy = Ty->getElementType();
2160     unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
2161     if ((Opcode == Instruction::Or || Opcode == Instruction::And) &&
2162         ScalarTy == IntegerType::getInt1Ty(Ty->getContext()) &&
2163         NumVecElts >= 2) {
2164       // Or reduction for i1 is represented as:
2165       // %val = bitcast <ReduxWidth x i1> to iReduxWidth
2166       // %res = cmp ne iReduxWidth %val, 0
2167       // And reduction for i1 is represented as:
2168       // %val = bitcast <ReduxWidth x i1> to iReduxWidth
      // %res = cmp eq iReduxWidth %val, -1 (all bits set)
2170       Type *ValTy = IntegerType::get(Ty->getContext(), NumVecElts);
2171       return thisT()->getCastInstrCost(Instruction::BitCast, ValTy, Ty,
2172                                        TTI::CastContextHint::None, CostKind) +
2173              thisT()->getCmpSelInstrCost(Instruction::ICmp, ValTy,
2174                                          CmpInst::makeCmpResultType(ValTy),
2175                                          CmpInst::BAD_ICMP_PREDICATE, CostKind);
2176     }
2177     unsigned NumReduxLevels = Log2_32(NumVecElts);
2178     InstructionCost ArithCost = 0;
2179     InstructionCost ShuffleCost = 0;
2180     std::pair<InstructionCost, MVT> LT =
2181         thisT()->getTLI()->getTypeLegalizationCost(DL, Ty);
2182     unsigned LongVectorCount = 0;
2183     unsigned MVTLen =
2184         LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
2185     while (NumVecElts > MVTLen) {
2186       NumVecElts /= 2;
2187       VectorType *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
2188       ShuffleCost += thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, None,
2189                                              NumVecElts, SubTy);
2190       ArithCost += thisT()->getArithmeticInstrCost(Opcode, SubTy, CostKind);
2191       Ty = SubTy;
2192       ++LongVectorCount;
2193     }
2194 
2195     NumReduxLevels -= LongVectorCount;
2196 
    // The minimum length of the vector is limited by the length of the vector
    // operations actually supported on the target. That's why several final
    // reduction operations are performed on vectors of the same
    // architecture-dependent length.
2201 
2202     // By default reductions need one shuffle per reduction level.
2203     ShuffleCost += NumReduxLevels * thisT()->getShuffleCost(
2204                                      TTI::SK_PermuteSingleSrc, Ty, None, 0, Ty);
2205     ArithCost +=
2206         NumReduxLevels * thisT()->getArithmeticInstrCost(Opcode, Ty, CostKind);
2207     return ShuffleCost + ArithCost +
2208            thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
2209   }
2210 
2211   /// Try to calculate the cost of performing strict (in-order) reductions,
2212   /// which involves doing a sequence of floating point additions in lane
2213   /// order, starting with an initial value. For example, consider a scalar
2214   /// initial value 'InitVal' of type float and a vector of type <4 x float>:
2215   ///
2216   ///   Vector = <float %v0, float %v1, float %v2, float %v3>
2217   ///
2218   ///   %add1 = %InitVal + %v0
2219   ///   %add2 = %add1 + %v1
2220   ///   %add3 = %add2 + %v2
2221   ///   %add4 = %add3 + %v3
2222   ///
2223   /// As a simple estimate we can say the cost of such a reduction is 4 times
2224   /// the cost of a scalar FP addition. We can only estimate the costs for
2225   /// fixed-width vectors here because for scalable vectors we do not know the
2226   /// runtime number of operations.
2227   InstructionCost getOrderedReductionCost(unsigned Opcode, VectorType *Ty,
2228                                           TTI::TargetCostKind CostKind) {
2229     // Targets must implement a default value for the scalable case, since
2230     // we don't know how many lanes the vector has.
2231     if (isa<ScalableVectorType>(Ty))
2232       return InstructionCost::getInvalid();
2233 
2234     auto *VTy = cast<FixedVectorType>(Ty);
2235     InstructionCost ExtractCost =
2236         getScalarizationOverhead(VTy, /*Insert=*/false, /*Extract=*/true);
2237     InstructionCost ArithCost = thisT()->getArithmeticInstrCost(
2238         Opcode, VTy->getElementType(), CostKind);
2239     ArithCost *= VTy->getNumElements();
2240 
2241     return ExtractCost + ArithCost;
2242   }
2243 
2244   InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
2245                                              Optional<FastMathFlags> FMF,
2246                                              TTI::TargetCostKind CostKind) {
2247     if (TTI::requiresOrderedReduction(FMF))
2248       return getOrderedReductionCost(Opcode, Ty, CostKind);
2249     return getTreeReductionCost(Opcode, Ty, CostKind);
2250   }
2251 
2252   /// Try to calculate op costs for min/max reduction operations.
2253   /// \param CondTy Conditional type for the Select instruction.
2254   InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
2255                                          bool IsUnsigned,
2256                                          TTI::TargetCostKind CostKind) {
2257     // Targets must implement a default value for the scalable case, since
2258     // we don't know how many lanes the vector has.
2259     if (isa<ScalableVectorType>(Ty))
2260       return InstructionCost::getInvalid();
2261 
2262     Type *ScalarTy = Ty->getElementType();
2263     Type *ScalarCondTy = CondTy->getElementType();
2264     unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
2265     unsigned NumReduxLevels = Log2_32(NumVecElts);
2266     unsigned CmpOpcode;
2267     if (Ty->isFPOrFPVectorTy()) {
2268       CmpOpcode = Instruction::FCmp;
2269     } else {
2270       assert(Ty->isIntOrIntVectorTy() &&
2271              "expecting floating point or integer type for min/max reduction");
2272       CmpOpcode = Instruction::ICmp;
2273     }
2274     InstructionCost MinMaxCost = 0;
2275     InstructionCost ShuffleCost = 0;
2276     std::pair<InstructionCost, MVT> LT =
2277         thisT()->getTLI()->getTypeLegalizationCost(DL, Ty);
2278     unsigned LongVectorCount = 0;
2279     unsigned MVTLen =
2280         LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
2281     while (NumVecElts > MVTLen) {
2282       NumVecElts /= 2;
2283       auto *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
2284       CondTy = FixedVectorType::get(ScalarCondTy, NumVecElts);
2285 
2286       ShuffleCost += thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, None,
2287                                              NumVecElts, SubTy);
2288       MinMaxCost +=
2289           thisT()->getCmpSelInstrCost(CmpOpcode, SubTy, CondTy,
2290                                       CmpInst::BAD_ICMP_PREDICATE, CostKind) +
2291           thisT()->getCmpSelInstrCost(Instruction::Select, SubTy, CondTy,
2292                                       CmpInst::BAD_ICMP_PREDICATE, CostKind);
2293       Ty = SubTy;
2294       ++LongVectorCount;
2295     }
2296 
2297     NumReduxLevels -= LongVectorCount;
2298 
    // The minimum length of the vector is limited by the length of the vector
    // operations actually supported on the target. That's why several final
    // reduction operations are performed on vectors of the same
    // architecture-dependent length.
2303     ShuffleCost += NumReduxLevels * thisT()->getShuffleCost(
2304                                      TTI::SK_PermuteSingleSrc, Ty, None, 0, Ty);
2305     MinMaxCost +=
2306         NumReduxLevels *
2307         (thisT()->getCmpSelInstrCost(CmpOpcode, Ty, CondTy,
2308                                      CmpInst::BAD_ICMP_PREDICATE, CostKind) +
2309          thisT()->getCmpSelInstrCost(Instruction::Select, Ty, CondTy,
2310                                      CmpInst::BAD_ICMP_PREDICATE, CostKind));
    // The last min/max should be in vector registers and we counted it above,
    // so we just need a single extractelement.
2313     return ShuffleCost + MinMaxCost +
2314            thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
2315   }
2316 
2317   InstructionCost getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned,
2318                                               Type *ResTy, VectorType *Ty,
2319                                               TTI::TargetCostKind CostKind) {
    // Without any native support, this is equivalent to the cost of
    // vecreduce.add(ext), or, if IsMLA, vecreduce.add(mul(ext, ext)).
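    //
    // E.g. an i64 vecreduce.add over <8 x i16> operands is modeled as
    //   ext <8 x i16> to <8 x i64> ; vecreduce.add(<8 x i64>)
    // plus, when IsMLA, a <8 x i64> mul and a second extend.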
2322     VectorType *ExtTy = VectorType::get(ResTy, Ty);
2323     InstructionCost RedCost = thisT()->getArithmeticReductionCost(
2324         Instruction::Add, ExtTy, None, CostKind);
2325     InstructionCost MulCost = 0;
2326     InstructionCost ExtCost = thisT()->getCastInstrCost(
2327         IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty,
2328         TTI::CastContextHint::None, CostKind);
2329     if (IsMLA) {
2330       MulCost =
2331           thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
2332       ExtCost *= 2;
2333     }
2334 
2335     return RedCost + MulCost + ExtCost;
2336   }
2337 
2338   InstructionCost getVectorSplitCost() { return 1; }
2339 
2340   /// @}
2341 };
2342 
2343 /// Concrete BasicTTIImpl that can be used if no further customization
2344 /// is needed.
2345 class BasicTTIImpl : public BasicTTIImplBase<BasicTTIImpl> {
2346   using BaseT = BasicTTIImplBase<BasicTTIImpl>;
2347 
2348   friend class BasicTTIImplBase<BasicTTIImpl>;
2349 
2350   const TargetSubtargetInfo *ST;
2351   const TargetLoweringBase *TLI;
2352 
2353   const TargetSubtargetInfo *getST() const { return ST; }
2354   const TargetLoweringBase *getTLI() const { return TLI; }
2355 
2356 public:
2357   explicit BasicTTIImpl(const TargetMachine *TM, const Function &F);
2358 };
2359 
2360 } // end namespace llvm
2361 
2362 #endif // LLVM_CODEGEN_BASICTTIIMPL_H
2363