1 //===- BasicTTIImpl.h -------------------------------------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// This file provides a helper that implements much of the TTI interface in
11 /// terms of the target-independent code generator and TargetLowering
12 /// interfaces.
13 //
14 //===----------------------------------------------------------------------===//
15 
16 #ifndef LLVM_CODEGEN_BASICTTIIMPL_H
17 #define LLVM_CODEGEN_BASICTTIIMPL_H
18 
19 #include "llvm/ADT/APInt.h"
20 #include "llvm/ADT/ArrayRef.h"
21 #include "llvm/ADT/BitVector.h"
22 #include "llvm/ADT/SmallPtrSet.h"
23 #include "llvm/ADT/SmallVector.h"
24 #include "llvm/Analysis/LoopInfo.h"
25 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
26 #include "llvm/Analysis/TargetTransformInfo.h"
27 #include "llvm/Analysis/TargetTransformInfoImpl.h"
28 #include "llvm/CodeGen/ISDOpcodes.h"
29 #include "llvm/CodeGen/TargetLowering.h"
30 #include "llvm/CodeGen/TargetSubtargetInfo.h"
31 #include "llvm/CodeGen/ValueTypes.h"
32 #include "llvm/IR/BasicBlock.h"
33 #include "llvm/IR/Constant.h"
34 #include "llvm/IR/Constants.h"
35 #include "llvm/IR/DataLayout.h"
36 #include "llvm/IR/DerivedTypes.h"
37 #include "llvm/IR/InstrTypes.h"
38 #include "llvm/IR/Instruction.h"
39 #include "llvm/IR/Instructions.h"
40 #include "llvm/IR/Intrinsics.h"
41 #include "llvm/IR/Operator.h"
42 #include "llvm/IR/Type.h"
43 #include "llvm/IR/Value.h"
44 #include "llvm/Support/Casting.h"
45 #include "llvm/Support/CommandLine.h"
46 #include "llvm/Support/ErrorHandling.h"
47 #include "llvm/Support/MachineValueType.h"
48 #include "llvm/Support/MathExtras.h"
49 #include "llvm/Target/TargetMachine.h"
50 #include "llvm/Target/TargetOptions.h"
51 #include <algorithm>
52 #include <cassert>
53 #include <cstdint>
54 #include <limits>
55 #include <optional>
56 #include <utility>
57 
58 namespace llvm {
59 
60 class Function;
61 class GlobalValue;
62 class LLVMContext;
63 class ScalarEvolution;
64 class SCEV;
65 class TargetMachine;
66 
67 extern cl::opt<unsigned> PartialUnrollingThreshold;
68 
69 /// Base class which can be used to help build a TTI implementation.
70 ///
71 /// This class provides as much implementation of the TTI interface as is
72 /// possible using the target independent parts of the code generator.
73 ///
74 /// In order to subclass it, your class must implement a getST() method to
75 /// return the subtarget, and a getTLI() method to return the target lowering.
76 /// We need these methods implemented in the derived class so that this class
77 /// doesn't have to duplicate storage for them.
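///
/// A minimal sketch of a conforming subclass (illustrative only; the "My*"
/// names are hypothetical, and the constructor plus any overridden hooks are
/// omitted):
/// \code
///   class MyTTIImpl : public BasicTTIImplBase<MyTTIImpl> {
///     const MySubtarget *ST;
///     const MyTargetLowering *TLI;
///
///   public:
///     const MySubtarget *getST() const { return ST; }
///     const MyTargetLowering *getTLI() const { return TLI; }
///   };
/// \endcode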
78 template <typename T>
79 class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
80 private:
81   using BaseT = TargetTransformInfoImplCRTPBase<T>;
82   using TTI = TargetTransformInfo;
83 
84   /// Helper function to access this as a T.
85   T *thisT() { return static_cast<T *>(this); }
86 
87   /// Estimate a cost of Broadcast as an extract and sequence of insert
88   /// operations.
89   InstructionCost getBroadcastShuffleOverhead(FixedVectorType *VTy,
90                                               TTI::TargetCostKind CostKind) {
91     InstructionCost Cost = 0;
92     // Broadcast cost is equal to the cost of extracting the zero'th element
93     // plus the cost of inserting it into every element of the result vector.
94     Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
95                                         CostKind, 0, nullptr, nullptr);
96 
97     for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
98       Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy,
99                                           CostKind, i, nullptr, nullptr);
100     }
101     return Cost;
102   }
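
  // As a rough worked example (assuming, purely for illustration, that every
  // insert and extract above costs 1): broadcasting a <4 x float> is modeled
  // as one extract of element 0 plus four inserts, i.e. a cost of 1 + 4 = 5.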
103 
104   /// Estimate a cost of shuffle as a sequence of extract and insert
105   /// operations.
106   InstructionCost getPermuteShuffleOverhead(FixedVectorType *VTy,
107                                             TTI::TargetCostKind CostKind) {
108     InstructionCost Cost = 0;
109     // Shuffle cost is equal to the cost of extracting each element from its
110     // argument plus the cost of inserting them into the result vector.
111 
112     // E.g., a <4 x float> shuffle with mask <0,5,2,7> needs to extract index
113     // 0 of the first vector, index 1 of the second vector, index 2 of the
114     // first vector and index 3 of the second vector, and insert them at
115     // indices <0,1,2,3> of the result vector.
116     for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
117       Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy,
118                                           CostKind, i, nullptr, nullptr);
119       Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
120                                           CostKind, i, nullptr, nullptr);
121     }
122     return Cost;
123   }
124 
125   /// Estimate a cost of subvector extraction as a sequence of extract and
126   /// insert operations.
127   InstructionCost getExtractSubvectorOverhead(VectorType *VTy,
128                                               TTI::TargetCostKind CostKind,
129                                               int Index,
130                                               FixedVectorType *SubVTy) {
131     assert(VTy && SubVTy &&
132            "Can only extract subvectors from vectors");
133     int NumSubElts = SubVTy->getNumElements();
134     assert((!isa<FixedVectorType>(VTy) ||
135             (Index + NumSubElts) <=
136                 (int)cast<FixedVectorType>(VTy)->getNumElements()) &&
137            "SK_ExtractSubvector index out of range");
138 
139     InstructionCost Cost = 0;
140     // Subvector extraction cost is equal to the cost of extracting each
141     // element from the source vector plus the cost of inserting it into the
142     // result subvector.
143     for (int i = 0; i != NumSubElts; ++i) {
144       Cost +=
145           thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
146                                       CostKind, i + Index, nullptr, nullptr);
147       Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, SubVTy,
148                                           CostKind, i, nullptr, nullptr);
149     }
150     return Cost;
151   }
152 
153   /// Estimate a cost of subvector insertion as a sequence of extract and
154   /// insert operations.
155   InstructionCost getInsertSubvectorOverhead(VectorType *VTy,
156                                              TTI::TargetCostKind CostKind,
157                                              int Index,
158                                              FixedVectorType *SubVTy) {
159     assert(VTy && SubVTy &&
160            "Can only insert subvectors into vectors");
161     int NumSubElts = SubVTy->getNumElements();
162     assert((!isa<FixedVectorType>(VTy) ||
163             (Index + NumSubElts) <=
164                 (int)cast<FixedVectorType>(VTy)->getNumElements()) &&
165            "SK_InsertSubvector index out of range");
166 
167     InstructionCost Cost = 0;
168     // Subvector insertion cost is equal to the cost of extracting each
169     // element from the source subvector plus the cost of inserting it into
170     // the result vector.
171     for (int i = 0; i != NumSubElts; ++i) {
172       Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVTy,
173                                           CostKind, i, nullptr, nullptr);
174       Cost +=
175           thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, CostKind,
176                                       i + Index, nullptr, nullptr);
177     }
178     return Cost;
179   }
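
  // Rough worked example for the two helpers above (assuming unit costs for
  // inserts and extracts): extracting or inserting a <2 x i32> subvector at
  // index 2 of an <8 x i32> vector is modeled as 2 extracts plus 2 inserts,
  // i.e. a cost of 4.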
180 
181   /// Local query method delegates up to T which *must* implement this!
182   const TargetSubtargetInfo *getST() const {
183     return static_cast<const T *>(this)->getST();
184   }
185 
186   /// Local query method delegates up to T which *must* implement this!
187   const TargetLoweringBase *getTLI() const {
188     return static_cast<const T *>(this)->getTLI();
189   }
190 
191   static ISD::MemIndexedMode getISDIndexedMode(TTI::MemIndexedMode M) {
192     switch (M) {
193       case TTI::MIM_Unindexed:
194         return ISD::UNINDEXED;
195       case TTI::MIM_PreInc:
196         return ISD::PRE_INC;
197       case TTI::MIM_PreDec:
198         return ISD::PRE_DEC;
199       case TTI::MIM_PostInc:
200         return ISD::POST_INC;
201       case TTI::MIM_PostDec:
202         return ISD::POST_DEC;
203     }
204     llvm_unreachable("Unexpected MemIndexedMode");
205   }
206 
207   InstructionCost getCommonMaskedMemoryOpCost(unsigned Opcode, Type *DataTy,
208                                               Align Alignment,
209                                               bool VariableMask,
210                                               bool IsGatherScatter,
211                                               TTI::TargetCostKind CostKind) {
212     // We cannot scalarize scalable vectors, so return Invalid.
213     if (isa<ScalableVectorType>(DataTy))
214       return InstructionCost::getInvalid();
215 
216     auto *VT = cast<FixedVectorType>(DataTy);
217     // Assume the target does not have support for gather/scatter operations
218     // and provide a rough estimate.
219     //
220     // First, compute the cost of the individual memory operations.
221     InstructionCost AddrExtractCost =
222         IsGatherScatter
223             ? getVectorInstrCost(Instruction::ExtractElement,
224                                  FixedVectorType::get(
225                                      PointerType::get(VT->getElementType(), 0),
226                                      VT->getNumElements()),
227                                  CostKind, -1, nullptr, nullptr)
228             : 0;
229     InstructionCost LoadCost =
230         VT->getNumElements() *
231         (AddrExtractCost +
232          getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind));
233 
234     // Next, compute the cost of packing the result in a vector.
235     InstructionCost PackingCost =
236         getScalarizationOverhead(VT, Opcode != Instruction::Store,
237                                  Opcode == Instruction::Store, CostKind);
238 
239     InstructionCost ConditionalCost = 0;
240     if (VariableMask) {
241       // Compute the cost of conditionally executing the memory operations with
242       // variable masks. This includes extracting the individual conditions,
243       // the branches, and the PHIs to combine the results.
244       // NOTE: Estimating the cost of conditionally executing the memory
245       // operations accurately is quite difficult and the current solution
246       // provides a very rough estimate only.
247       ConditionalCost =
248           VT->getNumElements() *
249           (getVectorInstrCost(
250                Instruction::ExtractElement,
251                FixedVectorType::get(Type::getInt1Ty(DataTy->getContext()),
252                                     VT->getNumElements()),
253                CostKind, -1, nullptr, nullptr) +
254            getCFInstrCost(Instruction::Br, CostKind) +
255            getCFInstrCost(Instruction::PHI, CostKind));
256     }
257 
258     return LoadCost + PackingCost + ConditionalCost;
259   }
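
  // Rough worked example (assuming, purely for illustration, a cost of 1 for
  // each scalar load/store, insert/extract, branch and PHI): a gather of
  // <4 x i32> with a variable mask would be modeled as
  //   LoadCost        = 4 * (1 address extract + 1 scalar load)  = 8
  //   PackingCost     = 4 inserts to build the result vector     = 4
  //   ConditionalCost = 4 * (1 mask extract + 1 branch + 1 PHI)  = 12
  // for a total estimate of 24.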
260 
261 protected:
262   explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL)
263       : BaseT(DL) {}
264   virtual ~BasicTTIImplBase() = default;
265 
266   using TargetTransformInfoImplBase::DL;
267 
268 public:
269   /// \name Scalar TTI Implementations
270   /// @{
271   bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth,
272                                       unsigned AddressSpace, Align Alignment,
273                                       unsigned *Fast) const {
274     EVT E = EVT::getIntegerVT(Context, BitWidth);
275     return getTLI()->allowsMisalignedMemoryAccesses(
276         E, AddressSpace, Alignment, MachineMemOperand::MONone, Fast);
277   }
278 
279   bool hasBranchDivergence() { return false; }
280 
281   bool useGPUDivergenceAnalysis() { return false; }
282 
283   bool isSourceOfDivergence(const Value *V) { return false; }
284 
285   bool isAlwaysUniform(const Value *V) { return false; }
286 
287   unsigned getFlatAddressSpace() {
288     // Return an invalid address space.
289     return -1;
290   }
291 
292   bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
293                                   Intrinsic::ID IID) const {
294     return false;
295   }
296 
297   bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const {
298     return getTLI()->getTargetMachine().isNoopAddrSpaceCast(FromAS, ToAS);
299   }
300 
301   unsigned getAssumedAddrSpace(const Value *V) const {
302     return getTLI()->getTargetMachine().getAssumedAddrSpace(V);
303   }
304 
305   bool isSingleThreaded() const {
306     return getTLI()->getTargetMachine().Options.ThreadModel ==
307            ThreadModel::Single;
308   }
309 
310   std::pair<const Value *, unsigned>
311   getPredicatedAddrSpace(const Value *V) const {
312     return getTLI()->getTargetMachine().getPredicatedAddrSpace(V);
313   }
314 
315   Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
316                                           Value *NewV) const {
317     return nullptr;
318   }
319 
320   bool isLegalAddImmediate(int64_t imm) {
321     return getTLI()->isLegalAddImmediate(imm);
322   }
323 
324   bool isLegalICmpImmediate(int64_t imm) {
325     return getTLI()->isLegalICmpImmediate(imm);
326   }
327 
328   bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
329                              bool HasBaseReg, int64_t Scale,
330                              unsigned AddrSpace, Instruction *I = nullptr) {
331     TargetLoweringBase::AddrMode AM;
332     AM.BaseGV = BaseGV;
333     AM.BaseOffs = BaseOffset;
334     AM.HasBaseReg = HasBaseReg;
335     AM.Scale = Scale;
336     return getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace, I);
337   }
338 
339   unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy,
340                              Type *ScalarValTy) const {
341     auto &&IsSupportedByTarget = [this, ScalarMemTy, ScalarValTy](unsigned VF) {
342       auto *SrcTy = FixedVectorType::get(ScalarMemTy, VF / 2);
343       EVT VT = getTLI()->getValueType(DL, SrcTy);
344       if (getTLI()->isOperationLegal(ISD::STORE, VT) ||
345           getTLI()->isOperationCustom(ISD::STORE, VT))
346         return true;
347 
348       EVT ValVT =
349           getTLI()->getValueType(DL, FixedVectorType::get(ScalarValTy, VF / 2));
350       EVT LegalizedVT =
351           getTLI()->getTypeToTransformTo(ScalarMemTy->getContext(), VT);
352       return getTLI()->isTruncStoreLegal(LegalizedVT, ValVT);
353     };
354     while (VF > 2 && IsSupportedByTarget(VF))
355       VF /= 2;
356     return VF;
357   }
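
  // Rough worked example: on a hypothetical target where stores of 4 or more
  // elements are legal but a 2-element store is not (and the truncating-store
  // fallback in the lambda does not apply either), starting from VF = 16 the
  // loop above halves VF while a store of VF / 2 elements is still supported:
  // 16 -> 8 -> 4, then stops because a 2-element store is unsupported, and
  // returns 4.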
358 
359   bool isIndexedLoadLegal(TTI::MemIndexedMode M, Type *Ty,
360                           const DataLayout &DL) const {
361     EVT VT = getTLI()->getValueType(DL, Ty);
362     return getTLI()->isIndexedLoadLegal(getISDIndexedMode(M), VT);
363   }
364 
365   bool isIndexedStoreLegal(TTI::MemIndexedMode M, Type *Ty,
366                            const DataLayout &DL) const {
367     EVT VT = getTLI()->getValueType(DL, Ty);
368     return getTLI()->isIndexedStoreLegal(getISDIndexedMode(M), VT);
369   }
370 
371   bool isLSRCostLess(TTI::LSRCost C1, TTI::LSRCost C2) {
372     return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
373   }
374 
375   bool isNumRegsMajorCostOfLSR() {
376     return TargetTransformInfoImplBase::isNumRegsMajorCostOfLSR();
377   }
378 
379   bool isProfitableLSRChainElement(Instruction *I) {
380     return TargetTransformInfoImplBase::isProfitableLSRChainElement(I);
381   }
382 
383   InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
384                                        int64_t BaseOffset, bool HasBaseReg,
385                                        int64_t Scale, unsigned AddrSpace) {
386     TargetLoweringBase::AddrMode AM;
387     AM.BaseGV = BaseGV;
388     AM.BaseOffs = BaseOffset;
389     AM.HasBaseReg = HasBaseReg;
390     AM.Scale = Scale;
391     if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
392       return 0;
393     return -1;
394   }
395 
396   bool isTruncateFree(Type *Ty1, Type *Ty2) {
397     return getTLI()->isTruncateFree(Ty1, Ty2);
398   }
399 
400   bool isProfitableToHoist(Instruction *I) {
401     return getTLI()->isProfitableToHoist(I);
402   }
403 
404   bool useAA() const { return getST()->useAA(); }
405 
406   bool isTypeLegal(Type *Ty) {
407     EVT VT = getTLI()->getValueType(DL, Ty);
408     return getTLI()->isTypeLegal(VT);
409   }
410 
411   unsigned getRegUsageForType(Type *Ty) {
412     EVT ETy = getTLI()->getValueType(DL, Ty);
413     return getTLI()->getNumRegisters(Ty->getContext(), ETy);
414   }
415 
416   InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr,
417                              ArrayRef<const Value *> Operands,
418                              TTI::TargetCostKind CostKind) {
419     return BaseT::getGEPCost(PointeeType, Ptr, Operands, CostKind);
420   }
421 
422   unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
423                                             unsigned &JumpTableSize,
424                                             ProfileSummaryInfo *PSI,
425                                             BlockFrequencyInfo *BFI) {
426     /// Try to find the estimated number of clusters. Note that the number of
427     /// clusters identified in this function could be different from the actual
428     /// numbers found in lowering. This function ignores switches that are
429     /// lowered with a mix of jump table / bit test / BTree. It was initially
430     /// intended for estimating the cost of a switch in the inline cost
431     /// heuristic, but it is a generic cost model usable in other places as
432     /// well (e.g., in loop unrolling).
433     unsigned N = SI.getNumCases();
434     const TargetLoweringBase *TLI = getTLI();
435     const DataLayout &DL = this->getDataLayout();
436 
437     JumpTableSize = 0;
438     bool IsJTAllowed = TLI->areJTsAllowed(SI.getParent()->getParent());
439 
440     // Early exit if neither a jump table nor a bit test is viable.
441     if (N < 1 || (!IsJTAllowed && DL.getIndexSizeInBits(0u) < N))
442       return N;
443 
444     APInt MaxCaseVal = SI.case_begin()->getCaseValue()->getValue();
445     APInt MinCaseVal = MaxCaseVal;
446     for (auto CI : SI.cases()) {
447       const APInt &CaseVal = CI.getCaseValue()->getValue();
448       if (CaseVal.sgt(MaxCaseVal))
449         MaxCaseVal = CaseVal;
450       if (CaseVal.slt(MinCaseVal))
451         MinCaseVal = CaseVal;
452     }
453 
454     // Check if suitable for a bit test
455     if (N <= DL.getIndexSizeInBits(0u)) {
456       SmallPtrSet<const BasicBlock *, 4> Dests;
457       for (auto I : SI.cases())
458         Dests.insert(I.getCaseSuccessor());
459 
460       if (TLI->isSuitableForBitTests(Dests.size(), N, MinCaseVal, MaxCaseVal,
461                                      DL))
462         return 1;
463     }
464 
465     // Check if suitable for a jump table.
466     if (IsJTAllowed) {
467       if (N < 2 || N < TLI->getMinimumJumpTableEntries())
468         return N;
469       uint64_t Range =
470           (MaxCaseVal - MinCaseVal)
471               .getLimitedValue(std::numeric_limits<uint64_t>::max() - 1) + 1;
472       // Check whether a range of clusters is dense enough for a jump table
473       if (TLI->isSuitableForJumpTable(&SI, N, Range, PSI, BFI)) {
474         JumpTableSize = Range;
475         return 1;
476       }
477     }
478     return N;
479   }
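
  // Rough worked example: for a switch with 100 cases whose values span the
  // range [0, 127], on a hypothetical target that allows jump tables and whose
  // isSuitableForJumpTable hook deems that range dense enough, this returns a
  // single cluster and sets JumpTableSize to 128. If neither a bit test nor a
  // jump table applies, it simply returns the number of cases (100).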
480 
481   bool shouldBuildLookupTables() {
482     const TargetLoweringBase *TLI = getTLI();
483     return TLI->isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) ||
484            TLI->isOperationLegalOrCustom(ISD::BRIND, MVT::Other);
485   }
486 
487   bool shouldBuildRelLookupTables() const {
488     const TargetMachine &TM = getTLI()->getTargetMachine();
489     // In non-PIC mode, do not generate a relative lookup table.
490     if (!TM.isPositionIndependent())
491       return false;
492 
493     /// Relative lookup table entries consist of 32-bit offsets.
494     /// Do not generate relative lookup tables for large code models
495     /// in 64-bit architectures where 32-bit offsets might not be enough.
496     if (TM.getCodeModel() == CodeModel::Medium ||
497         TM.getCodeModel() == CodeModel::Large)
498       return false;
499 
500     Triple TargetTriple = TM.getTargetTriple();
501     if (!TargetTriple.isArch64Bit())
502       return false;
503 
504     // TODO: Triggers issues on aarch64 on darwin, so temporarily disable it
505     // there.
506     if (TargetTriple.getArch() == Triple::aarch64 && TargetTriple.isOSDarwin())
507       return false;
508 
509     return true;
510   }
511 
512   bool haveFastSqrt(Type *Ty) {
513     const TargetLoweringBase *TLI = getTLI();
514     EVT VT = TLI->getValueType(DL, Ty);
515     return TLI->isTypeLegal(VT) &&
516            TLI->isOperationLegalOrCustom(ISD::FSQRT, VT);
517   }
518 
519   bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
520     return true;
521   }
522 
523   InstructionCost getFPOpCost(Type *Ty) {
524     // Check whether FADD is available, as a proxy for floating-point in
525     // general.
526     const TargetLoweringBase *TLI = getTLI();
527     EVT VT = TLI->getValueType(DL, Ty);
528     if (TLI->isOperationLegalOrCustomOrPromote(ISD::FADD, VT))
529       return TargetTransformInfo::TCC_Basic;
530     return TargetTransformInfo::TCC_Expensive;
531   }
532 
533   unsigned getInliningThresholdMultiplier() { return 1; }
534   unsigned adjustInliningThreshold(const CallBase *CB) { return 0; }
535 
536   int getInlinerVectorBonusPercent() { return 150; }
537 
538   void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
539                                TTI::UnrollingPreferences &UP,
540                                OptimizationRemarkEmitter *ORE) {
541     // This unrolling functionality is target-independent, but to provide some
542     // motivation for its intended use, consider x86:
543 
544     // According to the Intel 64 and IA-32 Architectures Optimization Reference
545     // Manual, Intel Core models and later have a loop stream detector (and
546     // associated uop queue) that can benefit from partial unrolling.
547     // The relevant requirements are:
548     //  - The loop must have no more than 4 (8 for Nehalem and later) branches
549     //    taken, and none of them may be calls.
550     //  - The loop can have no more than 18 (28 for Nehalem and later) uops.
551 
552     // According to the Software Optimization Guide for AMD Family 15h
553     // Processors, models 30h-4fh (Steamroller and later) have a loop predictor
554     // and loop buffer which can benefit from partial unrolling.
555     // The relevant requirements are:
556     //  - The loop must have fewer than 16 branches
557     //  - The loop must have less than 40 uops in all executed loop branches
558 
559     // The number of taken branches in a loop is hard to estimate here, and
560     // benchmarking has revealed that it is better not to be conservative when
561     // estimating the branch count. As a result, we'll ignore the branch limits
562     // until someone finds a case where it matters in practice.
563 
564     unsigned MaxOps;
565     const TargetSubtargetInfo *ST = getST();
566     if (PartialUnrollingThreshold.getNumOccurrences() > 0)
567       MaxOps = PartialUnrollingThreshold;
568     else if (ST->getSchedModel().LoopMicroOpBufferSize > 0)
569       MaxOps = ST->getSchedModel().LoopMicroOpBufferSize;
570     else
571       return;
572 
573     // Scan the loop: don't unroll loops with calls.
574     for (BasicBlock *BB : L->blocks()) {
575       for (Instruction &I : *BB) {
576         if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
577           if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
578             if (!thisT()->isLoweredToCall(F))
579               continue;
580           }
581 
582           if (ORE) {
583             ORE->emit([&]() {
584               return OptimizationRemark("TTI", "DontUnroll", L->getStartLoc(),
585                                         L->getHeader())
586                      << "advising against unrolling the loop because it "
587                         "contains a "
588                      << ore::NV("Call", &I);
589             });
590           }
591           return;
592         }
593       }
594     }
595 
596     // Enable runtime and partial unrolling up to the specified size.
597     // Enable using trip count upper bound to unroll loops.
598     UP.Partial = UP.Runtime = UP.UpperBound = true;
599     UP.PartialThreshold = MaxOps;
600 
601     // Avoid unrolling when optimizing for size.
602     UP.OptSizeThreshold = 0;
603     UP.PartialOptSizeThreshold = 0;
604 
605     // Set the number of instructions optimized when a "back edge"
606     // becomes a "fall through" to the default value of 2.
607     UP.BEInsns = 2;
608   }
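
  // Rough illustration: on a hypothetical subtarget that reports a
  // LoopMicroOpBufferSize of 28, a loop whose body contains no calls that are
  // actually lowered to calls gets partial/runtime unrolling enabled with
  // UP.PartialThreshold = 28 (unless the PartialUnrollingThreshold option
  // declared above overrides the value).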
609 
610   void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
611                              TTI::PeelingPreferences &PP) {
612     PP.PeelCount = 0;
613     PP.AllowPeeling = true;
614     PP.AllowLoopNestsPeeling = false;
615     PP.PeelProfiledIterations = true;
616   }
617 
618   bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
619                                 AssumptionCache &AC,
620                                 TargetLibraryInfo *LibInfo,
621                                 HardwareLoopInfo &HWLoopInfo) {
622     return BaseT::isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
623   }
624 
625   bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
626                                    AssumptionCache &AC, TargetLibraryInfo *TLI,
627                                    DominatorTree *DT,
628                                    LoopVectorizationLegality *LVL,
629                                    InterleavedAccessInfo *IAI) {
630     return BaseT::preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LVL, IAI);
631   }
632 
633   PredicationStyle emitGetActiveLaneMask() {
634     return BaseT::emitGetActiveLaneMask();
635   }
636 
637   std::optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
638                                                IntrinsicInst &II) {
639     return BaseT::instCombineIntrinsic(IC, II);
640   }
641 
642   std::optional<Value *>
643   simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II,
644                                    APInt DemandedMask, KnownBits &Known,
645                                    bool &KnownBitsComputed) {
646     return BaseT::simplifyDemandedUseBitsIntrinsic(IC, II, DemandedMask, Known,
647                                                    KnownBitsComputed);
648   }
649 
650   std::optional<Value *> simplifyDemandedVectorEltsIntrinsic(
651       InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
652       APInt &UndefElts2, APInt &UndefElts3,
653       std::function<void(Instruction *, unsigned, APInt, APInt &)>
654           SimplifyAndSetOp) {
655     return BaseT::simplifyDemandedVectorEltsIntrinsic(
656         IC, II, DemandedElts, UndefElts, UndefElts2, UndefElts3,
657         SimplifyAndSetOp);
658   }
659 
660   virtual std::optional<unsigned>
661   getCacheSize(TargetTransformInfo::CacheLevel Level) const {
662     return std::optional<unsigned>(
663         getST()->getCacheSize(static_cast<unsigned>(Level)));
664   }
665 
666   virtual std::optional<unsigned>
667   getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const {
668     std::optional<unsigned> TargetResult =
669         getST()->getCacheAssociativity(static_cast<unsigned>(Level));
670 
671     if (TargetResult)
672       return TargetResult;
673 
674     return BaseT::getCacheAssociativity(Level);
675   }
676 
677   virtual unsigned getCacheLineSize() const {
678     return getST()->getCacheLineSize();
679   }
680 
681   virtual unsigned getPrefetchDistance() const {
682     return getST()->getPrefetchDistance();
683   }
684 
685   virtual unsigned getMinPrefetchStride(unsigned NumMemAccesses,
686                                         unsigned NumStridedMemAccesses,
687                                         unsigned NumPrefetches,
688                                         bool HasCall) const {
689     return getST()->getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses,
690                                          NumPrefetches, HasCall);
691   }
692 
693   virtual unsigned getMaxPrefetchIterationsAhead() const {
694     return getST()->getMaxPrefetchIterationsAhead();
695   }
696 
697   virtual bool enableWritePrefetching() const {
698     return getST()->enableWritePrefetching();
699   }
700 
701   virtual bool shouldPrefetchAddressSpace(unsigned AS) const {
702     return getST()->shouldPrefetchAddressSpace(AS);
703   }
704 
705   /// @}
706 
707   /// \name Vector TTI Implementations
708   /// @{
709 
710   TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
711     return TypeSize::getFixed(32);
712   }
713 
714   std::optional<unsigned> getMaxVScale() const { return std::nullopt; }
715   std::optional<unsigned> getVScaleForTuning() const { return std::nullopt; }
716 
717   /// Estimate the overhead of scalarizing an instruction. Insert and Extract
718   /// are set if the demanded result elements need to be inserted and/or
719   /// extracted from vectors.
720   InstructionCost getScalarizationOverhead(VectorType *InTy,
721                                            const APInt &DemandedElts,
722                                            bool Insert, bool Extract,
723                                            TTI::TargetCostKind CostKind) {
724     /// FIXME: a bitfield is not a reasonable abstraction for talking about
725     /// which elements are needed from a scalable vector
726     if (isa<ScalableVectorType>(InTy))
727       return InstructionCost::getInvalid();
728     auto *Ty = cast<FixedVectorType>(InTy);
729 
730     assert(DemandedElts.getBitWidth() == Ty->getNumElements() &&
731            "Vector size mismatch");
732 
733     InstructionCost Cost = 0;
734 
735     for (int i = 0, e = Ty->getNumElements(); i < e; ++i) {
736       if (!DemandedElts[i])
737         continue;
738       if (Insert)
739         Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, Ty,
740                                             CostKind, i, nullptr, nullptr);
741       if (Extract)
742         Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty,
743                                             CostKind, i, nullptr, nullptr);
744     }
745 
746     return Cost;
747   }
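
  // Rough worked example (assuming unit insert/extract costs): for a
  // <4 x i32> vector with DemandedElts = 0b0101 (elements 0 and 2) and both
  // Insert and Extract set, the loop above counts 2 inserts plus 2 extracts,
  // i.e. a cost of 4.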
748 
749   /// Helper wrapper for the DemandedElts variant of getScalarizationOverhead.
750   InstructionCost getScalarizationOverhead(VectorType *InTy, bool Insert,
751                                            bool Extract,
752                                            TTI::TargetCostKind CostKind) {
753     if (isa<ScalableVectorType>(InTy))
754       return InstructionCost::getInvalid();
755     auto *Ty = cast<FixedVectorType>(InTy);
756 
757     APInt DemandedElts = APInt::getAllOnes(Ty->getNumElements());
758     return thisT()->getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
759                                              CostKind);
760   }
761 
762   /// Estimate the overhead of scalarizing an instruction's unique
763   /// non-constant operands. The (potentially vector) types to use for each
764   /// argument are passed via Tys.
765   InstructionCost
766   getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
767                                    ArrayRef<Type *> Tys,
768                                    TTI::TargetCostKind CostKind) {
769     assert(Args.size() == Tys.size() && "Expected matching Args and Tys");
770 
771     InstructionCost Cost = 0;
772     SmallPtrSet<const Value*, 4> UniqueOperands;
773     for (int I = 0, E = Args.size(); I != E; I++) {
774       // Disregard things like metadata arguments.
775       const Value *A = Args[I];
776       Type *Ty = Tys[I];
777       if (!Ty->isIntOrIntVectorTy() && !Ty->isFPOrFPVectorTy() &&
778           !Ty->isPtrOrPtrVectorTy())
779         continue;
780 
781       if (!isa<Constant>(A) && UniqueOperands.insert(A).second) {
782         if (auto *VecTy = dyn_cast<VectorType>(Ty))
783           Cost += getScalarizationOverhead(VecTy, /*Insert*/ false,
784                                            /*Extract*/ true, CostKind);
785       }
786     }
787 
788     return Cost;
789   }
790 
791   /// Estimate the overhead of scalarizing the inputs and outputs of an
792   /// instruction, with return type RetTy and arguments Args of type Tys. If
793   /// Args are unknown (empty), then the cost associated with one argument is
794   /// added as a heuristic.
795   InstructionCost getScalarizationOverhead(VectorType *RetTy,
796                                            ArrayRef<const Value *> Args,
797                                            ArrayRef<Type *> Tys,
798                                            TTI::TargetCostKind CostKind) {
799     InstructionCost Cost = getScalarizationOverhead(
800         RetTy, /*Insert*/ true, /*Extract*/ false, CostKind);
801     if (!Args.empty())
802       Cost += getOperandsScalarizationOverhead(Args, Tys, CostKind);
803     else
804       // When no information on arguments is provided, we add the cost
805       // associated with one argument as a heuristic.
806       Cost += getScalarizationOverhead(RetTy, /*Insert*/ false,
807                                        /*Extract*/ true, CostKind);
808 
809     return Cost;
810   }
811 
812   /// Estimate the cost of type-legalization and the legalized type.
813   std::pair<InstructionCost, MVT> getTypeLegalizationCost(Type *Ty) const {
814     LLVMContext &C = Ty->getContext();
815     EVT MTy = getTLI()->getValueType(DL, Ty);
816 
817     InstructionCost Cost = 1;
818     // We keep legalizing the type until we find a legal kind. We assume that
819     // the only operation that costs anything is the split. After splitting
820     // we need to handle two types.
821     while (true) {
822       TargetLoweringBase::LegalizeKind LK = getTLI()->getTypeConversion(C, MTy);
823 
824       if (LK.first == TargetLoweringBase::TypeScalarizeScalableVector) {
825         // Ensure we return a sensible simple VT here, since many callers of
826         // this function require it.
827         MVT VT = MTy.isSimple() ? MTy.getSimpleVT() : MVT::i64;
828         return std::make_pair(InstructionCost::getInvalid(), VT);
829       }
830 
831       if (LK.first == TargetLoweringBase::TypeLegal)
832         return std::make_pair(Cost, MTy.getSimpleVT());
833 
834       if (LK.first == TargetLoweringBase::TypeSplitVector ||
835           LK.first == TargetLoweringBase::TypeExpandInteger)
836         Cost *= 2;
837 
838       // Do not loop with f128 type.
839       if (MTy == LK.second)
840         return std::make_pair(Cost, MTy.getSimpleVT());
841 
842       // Keep legalizing the type.
843       MTy = LK.second;
844     }
845   }
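
  // Rough worked example: on a hypothetical target whose widest legal vector
  // is 128 bits, an <8 x i64> type (512 bits) is split twice before it becomes
  // a legal <2 x i64>, so each split doubles the cost and this returns
  // {Cost = 4, MVT = v2i64}.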
846 
847   unsigned getMaxInterleaveFactor(unsigned VF) { return 1; }
848 
849   InstructionCost getArithmeticInstrCost(
850       unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
851       TTI::OperandValueInfo Opd1Info = {TTI::OK_AnyValue, TTI::OP_None},
852       TTI::OperandValueInfo Opd2Info = {TTI::OK_AnyValue, TTI::OP_None},
853       ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
854       const Instruction *CxtI = nullptr) {
855     // Check if any of the operands are vector operands.
856     const TargetLoweringBase *TLI = getTLI();
857     int ISD = TLI->InstructionOpcodeToISD(Opcode);
858     assert(ISD && "Invalid opcode");
859 
860     // TODO: Handle more cost kinds.
861     if (CostKind != TTI::TCK_RecipThroughput)
862       return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind,
863                                            Opd1Info, Opd2Info,
864                                            Args, CxtI);
865 
866     std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
867 
868     bool IsFloat = Ty->isFPOrFPVectorTy();
869     // Assume that floating point arithmetic operations cost twice as much as
870     // integer operations.
871     InstructionCost OpCost = (IsFloat ? 2 : 1);
872 
873     if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
874       // The operation is legal. Assume it costs 1.
875       // TODO: Once we have extract/insert subvector cost we need to use them.
876       return LT.first * OpCost;
877     }
878 
879     if (!TLI->isOperationExpand(ISD, LT.second)) {
880       // If the operation is custom lowered, then assume that the code is twice
881       // as expensive.
882       return LT.first * 2 * OpCost;
883     }
884 
885     // An 'Expand' of URem and SRem is special because it may default
886     // to expanding the operation into a sequence of sub-operations
887     // i.e. X % Y -> X-(X/Y)*Y.
888     if (ISD == ISD::UREM || ISD == ISD::SREM) {
889       bool IsSigned = ISD == ISD::SREM;
890       if (TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIVREM : ISD::UDIVREM,
891                                         LT.second) ||
892           TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIV : ISD::UDIV,
893                                         LT.second)) {
894         unsigned DivOpc = IsSigned ? Instruction::SDiv : Instruction::UDiv;
895         InstructionCost DivCost = thisT()->getArithmeticInstrCost(
896             DivOpc, Ty, CostKind, Opd1Info, Opd2Info);
897         InstructionCost MulCost =
898             thisT()->getArithmeticInstrCost(Instruction::Mul, Ty, CostKind);
899         InstructionCost SubCost =
900             thisT()->getArithmeticInstrCost(Instruction::Sub, Ty, CostKind);
901         return DivCost + MulCost + SubCost;
902       }
903     }
904 
905     // We cannot scalarize scalable vectors, so return Invalid.
906     if (isa<ScalableVectorType>(Ty))
907       return InstructionCost::getInvalid();
908 
909     // Else, assume that we need to scalarize this op.
910     // TODO: If one of the types gets legalized by splitting, handle this
911     // similarly to what getCastInstrCost() does.
912     if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
913       InstructionCost Cost = thisT()->getArithmeticInstrCost(
914           Opcode, VTy->getScalarType(), CostKind, Opd1Info, Opd2Info,
915           Args, CxtI);
916       // Return the cost of multiple scalar invocations plus the cost of
917       // inserting and extracting the values.
918       SmallVector<Type *> Tys(Args.size(), Ty);
919       return getScalarizationOverhead(VTy, Args, Tys, CostKind) +
920              VTy->getNumElements() * Cost;
921     }
922 
923     // We don't know anything about this scalar instruction.
924     return OpCost;
925   }
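
  // Rough illustration of the scalarization path above: for a binary operation
  // on <4 x i32> that the target must expand, the estimate is the cost of
  // building the result vector and extracting the operand elements
  // (getScalarizationOverhead) plus 4 times the scalar operation cost.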
926 
927   TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind,
928                                               ArrayRef<int> Mask) const {
929     int Limit = Mask.size() * 2;
930     if (Mask.empty() ||
931         // Extra check required by isSingleSourceMaskImpl function (called by
932         // ShuffleVectorInst::isSingleSourceMask).
933         any_of(Mask, [Limit](int I) { return I >= Limit; }))
934       return Kind;
935     int Index;
936     switch (Kind) {
937     case TTI::SK_PermuteSingleSrc:
938       if (ShuffleVectorInst::isReverseMask(Mask))
939         return TTI::SK_Reverse;
940       if (ShuffleVectorInst::isZeroEltSplatMask(Mask))
941         return TTI::SK_Broadcast;
942       break;
943     case TTI::SK_PermuteTwoSrc:
944       if (ShuffleVectorInst::isSelectMask(Mask))
945         return TTI::SK_Select;
946       if (ShuffleVectorInst::isTransposeMask(Mask))
947         return TTI::SK_Transpose;
948       if (ShuffleVectorInst::isSpliceMask(Mask, Index))
949         return TTI::SK_Splice;
950       break;
951     case TTI::SK_Select:
952     case TTI::SK_Reverse:
953     case TTI::SK_Broadcast:
954     case TTI::SK_Transpose:
955     case TTI::SK_InsertSubvector:
956     case TTI::SK_ExtractSubvector:
957     case TTI::SK_Splice:
958       break;
959     }
960     return Kind;
961   }
962 
963   InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
964                                  ArrayRef<int> Mask,
965                                  TTI::TargetCostKind CostKind, int Index,
966                                  VectorType *SubTp,
967                                  ArrayRef<const Value *> Args = std::nullopt) {
968 
969     switch (improveShuffleKindFromMask(Kind, Mask)) {
970     case TTI::SK_Broadcast:
971       if (auto *FVT = dyn_cast<FixedVectorType>(Tp))
972         return getBroadcastShuffleOverhead(FVT, CostKind);
973       return InstructionCost::getInvalid();
974     case TTI::SK_Select:
975     case TTI::SK_Splice:
976     case TTI::SK_Reverse:
977     case TTI::SK_Transpose:
978     case TTI::SK_PermuteSingleSrc:
979     case TTI::SK_PermuteTwoSrc:
980       if (auto *FVT = dyn_cast<FixedVectorType>(Tp))
981         return getPermuteShuffleOverhead(FVT, CostKind);
982       return InstructionCost::getInvalid();
983     case TTI::SK_ExtractSubvector:
984       return getExtractSubvectorOverhead(Tp, CostKind, Index,
985                                          cast<FixedVectorType>(SubTp));
986     case TTI::SK_InsertSubvector:
987       return getInsertSubvectorOverhead(Tp, CostKind, Index,
988                                         cast<FixedVectorType>(SubTp));
989     }
990     llvm_unreachable("Unknown TTI::ShuffleKind");
991   }
992 
993   InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
994                                    TTI::CastContextHint CCH,
995                                    TTI::TargetCostKind CostKind,
996                                    const Instruction *I = nullptr) {
997     if (BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I) == 0)
998       return 0;
999 
1000     const TargetLoweringBase *TLI = getTLI();
1001     int ISD = TLI->InstructionOpcodeToISD(Opcode);
1002     assert(ISD && "Invalid opcode");
1003     std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
1004     std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(Dst);
1005 
1006     TypeSize SrcSize = SrcLT.second.getSizeInBits();
1007     TypeSize DstSize = DstLT.second.getSizeInBits();
1008     bool IntOrPtrSrc = Src->isIntegerTy() || Src->isPointerTy();
1009     bool IntOrPtrDst = Dst->isIntegerTy() || Dst->isPointerTy();
1010 
1011     switch (Opcode) {
1012     default:
1013       break;
1014     case Instruction::Trunc:
1015       // Check for NOOP conversions.
1016       if (TLI->isTruncateFree(SrcLT.second, DstLT.second))
1017         return 0;
1018       [[fallthrough]];
1019     case Instruction::BitCast:
1020       // Bitcasts between types that are legalized to the same type are free,
1021       // and an int to/from ptr cast of the same size is assumed free as well.
1022       if (SrcLT.first == DstLT.first && IntOrPtrSrc == IntOrPtrDst &&
1023           SrcSize == DstSize)
1024         return 0;
1025       break;
1026     case Instruction::FPExt:
1027       if (I && getTLI()->isExtFree(I))
1028         return 0;
1029       break;
1030     case Instruction::ZExt:
1031       if (TLI->isZExtFree(SrcLT.second, DstLT.second))
1032         return 0;
1033       [[fallthrough]];
1034     case Instruction::SExt:
1035       if (I && getTLI()->isExtFree(I))
1036         return 0;
1037 
1038       // If this is a zext/sext of a load, return 0 if the corresponding
1039       // extending load exists on target and the result type is legal.
1040       if (CCH == TTI::CastContextHint::Normal) {
1041         EVT ExtVT = EVT::getEVT(Dst);
1042         EVT LoadVT = EVT::getEVT(Src);
1043         unsigned LType =
1044           ((Opcode == Instruction::ZExt) ? ISD::ZEXTLOAD : ISD::SEXTLOAD);
1045         if (DstLT.first == SrcLT.first &&
1046             TLI->isLoadExtLegal(LType, ExtVT, LoadVT))
1047           return 0;
1048       }
1049       break;
1050     case Instruction::AddrSpaceCast:
1051       if (TLI->isFreeAddrSpaceCast(Src->getPointerAddressSpace(),
1052                                    Dst->getPointerAddressSpace()))
1053         return 0;
1054       break;
1055     }
1056 
1057     auto *SrcVTy = dyn_cast<VectorType>(Src);
1058     auto *DstVTy = dyn_cast<VectorType>(Dst);
1059 
1060     // If the cast is marked as legal (or promote) then assume low cost.
1061     if (SrcLT.first == DstLT.first &&
1062         TLI->isOperationLegalOrPromote(ISD, DstLT.second))
1063       return SrcLT.first;
1064 
1065     // Handle scalar conversions.
1066     if (!SrcVTy && !DstVTy) {
1067       // Just check the op cost. If the operation is legal then assume it costs
1068       // 1.
1069       if (!TLI->isOperationExpand(ISD, DstLT.second))
1070         return 1;
1071 
1072       // Assume that illegal scalar instructions are expensive.
1073       return 4;
1074     }
1075 
1076     // Check vector-to-vector casts.
1077     if (DstVTy && SrcVTy) {
1078       // If the cast is between same-sized registers, then the check is simple.
1079       if (SrcLT.first == DstLT.first && SrcSize == DstSize) {
1080 
1081         // Assume that Zext is done using AND.
1082         if (Opcode == Instruction::ZExt)
1083           return SrcLT.first;
1084 
1085         // Assume that sext is done using SHL and SRA.
1086         if (Opcode == Instruction::SExt)
1087           return SrcLT.first * 2;
1088 
1089         // Just check the op cost. If the operation is legal then assume it
1090         // costs 1 and multiply by the type-legalization overhead.
1092         if (!TLI->isOperationExpand(ISD, DstLT.second))
1093           return SrcLT.first * 1;
1094       }
1095 
1096       // If we are legalizing by splitting, query the concrete TTI for the cost
1097       // of casting the original vector twice. We also need to factor in the
1098       // cost of the split itself. Count that as 1, to be consistent with
1099       // getTypeLegalizationCost().
1100       bool SplitSrc =
1101           TLI->getTypeAction(Src->getContext(), TLI->getValueType(DL, Src)) ==
1102           TargetLowering::TypeSplitVector;
1103       bool SplitDst =
1104           TLI->getTypeAction(Dst->getContext(), TLI->getValueType(DL, Dst)) ==
1105           TargetLowering::TypeSplitVector;
1106       if ((SplitSrc || SplitDst) && SrcVTy->getElementCount().isVector() &&
1107           DstVTy->getElementCount().isVector()) {
1108         Type *SplitDstTy = VectorType::getHalfElementsVectorType(DstVTy);
1109         Type *SplitSrcTy = VectorType::getHalfElementsVectorType(SrcVTy);
1110         T *TTI = static_cast<T *>(this);
1111         // If both types need to be split then the split is free.
1112         InstructionCost SplitCost =
1113             (!SplitSrc || !SplitDst) ? TTI->getVectorSplitCost() : 0;
1114         return SplitCost +
1115                (2 * TTI->getCastInstrCost(Opcode, SplitDstTy, SplitSrcTy, CCH,
1116                                           CostKind, I));
1117       }
1118 
1119       // Scalarization cost is Invalid, can't assume any num elements.
1120       if (isa<ScalableVectorType>(DstVTy))
1121         return InstructionCost::getInvalid();
1122 
1123       // In other cases where the source or destination are illegal, assume
1124       // the operation will get scalarized.
1125       unsigned Num = cast<FixedVectorType>(DstVTy)->getNumElements();
1126       InstructionCost Cost = thisT()->getCastInstrCost(
1127           Opcode, Dst->getScalarType(), Src->getScalarType(), CCH, CostKind, I);
1128 
1129       // Return the cost of multiple scalar invocations plus the cost of
1130       // inserting and extracting the values.
1131       return getScalarizationOverhead(DstVTy, /*Insert*/ true, /*Extract*/ true,
1132                                       CostKind) +
1133              Num * Cost;
1134     }
1135 
1136     // We already handled vector-to-vector and scalar-to-scalar conversions.
1137     // This is where we handle bitcasts between vectors and scalars. We need
1138     // to assume that the conversion is scalarized in one way or another.
1140     if (Opcode == Instruction::BitCast) {
1141       // Illegal bitcasts are done by storing and loading from a stack slot.
1142       return (SrcVTy ? getScalarizationOverhead(SrcVTy, /*Insert*/ false,
1143                                                 /*Extract*/ true, CostKind)
1144                      : 0) +
1145              (DstVTy ? getScalarizationOverhead(DstVTy, /*Insert*/ true,
1146                                                 /*Extract*/ false, CostKind)
1147                      : 0);
1148     }
1149 
1150     llvm_unreachable("Unhandled cast");
1151   }
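
  // Rough illustration of the vector-splitting path above: on a hypothetical
  // target with 128-bit vectors, a sext from <8 x i16> (legal) to <8 x i32>
  // (which must be split into two <4 x i32> halves) is costed as one vector
  // split plus twice the cost of the <4 x i16> -> <4 x i32> cast.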
1152 
1153   InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst,
1154                                            VectorType *VecTy, unsigned Index) {
1155     TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
1156     return thisT()->getVectorInstrCost(Instruction::ExtractElement, VecTy,
1157                                        CostKind, Index, nullptr, nullptr) +
1158            thisT()->getCastInstrCost(Opcode, Dst, VecTy->getElementType(),
1159                                      TTI::CastContextHint::None, CostKind);
1160   }
1161 
1162   InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
1163                                  const Instruction *I = nullptr) {
1164     return BaseT::getCFInstrCost(Opcode, CostKind, I);
1165   }
1166 
1167   InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
1168                                      CmpInst::Predicate VecPred,
1169                                      TTI::TargetCostKind CostKind,
1170                                      const Instruction *I = nullptr) {
1171     const TargetLoweringBase *TLI = getTLI();
1172     int ISD = TLI->InstructionOpcodeToISD(Opcode);
1173     assert(ISD && "Invalid opcode");
1174 
1175     // TODO: Handle other cost kinds.
1176     if (CostKind != TTI::TCK_RecipThroughput)
1177       return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1178                                        I);
1179 
1180     // Selects on vectors are actually vector selects.
1181     if (ISD == ISD::SELECT) {
1182       assert(CondTy && "CondTy must exist");
1183       if (CondTy->isVectorTy())
1184         ISD = ISD::VSELECT;
1185     }
1186     std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1187 
1188     if (!(ValTy->isVectorTy() && !LT.second.isVector()) &&
1189         !TLI->isOperationExpand(ISD, LT.second)) {
1190       // The operation is legal. Assume it costs 1. Multiply
1191       // by the type-legalization overhead.
1192       return LT.first * 1;
1193     }
1194 
1195     // Otherwise, assume that the operation is scalarized.
1196     // TODO: If one of the types gets legalized by splitting, handle this
1197     // similarly to what getCastInstrCost() does.
1198     if (auto *ValVTy = dyn_cast<VectorType>(ValTy)) {
1199       if (isa<ScalableVectorType>(ValTy))
1200         return InstructionCost::getInvalid();
1201 
1202       unsigned Num = cast<FixedVectorType>(ValVTy)->getNumElements();
1203       if (CondTy)
1204         CondTy = CondTy->getScalarType();
1205       InstructionCost Cost = thisT()->getCmpSelInstrCost(
1206           Opcode, ValVTy->getScalarType(), CondTy, VecPred, CostKind, I);
1207 
1208       // Return the cost of multiple scalar invocations plus the cost of
1209       // inserting and extracting the values.
1210       return getScalarizationOverhead(ValVTy, /*Insert*/ true,
1211                                       /*Extract*/ false, CostKind) +
1212              Num * Cost;
1213     }
1214 
1215     // Unknown scalar opcode.
1216     return 1;
1217   }
1218 
1219   InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
1220                                      TTI::TargetCostKind CostKind,
1221                                      unsigned Index, Value *Op0, Value *Op1) {
1222     return getRegUsageForType(Val->getScalarType());
1223   }
1224 
1225   InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
1226                                      TTI::TargetCostKind CostKind,
1227                                      unsigned Index) {
1228     Value *Op0 = nullptr;
1229     Value *Op1 = nullptr;
1230     if (auto *IE = dyn_cast<InsertElementInst>(&I)) {
1231       Op0 = IE->getOperand(0);
1232       Op1 = IE->getOperand(1);
1233     }
1234     return thisT()->getVectorInstrCost(I.getOpcode(), Val, CostKind, Index, Op0,
1235                                        Op1);
1236   }
1237 
1238   InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
1239                                             int VF,
1240                                             const APInt &DemandedDstElts,
1241                                             TTI::TargetCostKind CostKind) {
1242     assert(DemandedDstElts.getBitWidth() == (unsigned)VF * ReplicationFactor &&
1243            "Unexpected size of DemandedDstElts.");
1244 
1245     InstructionCost Cost;
1246 
1247     auto *SrcVT = FixedVectorType::get(EltTy, VF);
1248     auto *ReplicatedVT = FixedVectorType::get(EltTy, VF * ReplicationFactor);
1249 
1250     // The mask shuffling cost is the cost of extracting all the elements of
1251     // the mask and inserting each of them Factor times into the wide vector:
1252     //
1253     // E.g. an interleaved group with factor 3:
1254     //    %mask = icmp ult <8 x i32> %vec1, %vec2
1255     //    %interleaved.mask = shufflevector <8 x i1> %mask, <8 x i1> undef,
1256     //        <24 x i32> <0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7>
1257     // The cost is estimated as extracting all mask elements from the <8xi1>
1258     // mask vector and inserting them factor times into the <24xi1> shuffled
1259     // mask vector.
1260     APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedDstElts, VF);
1261     Cost += thisT()->getScalarizationOverhead(SrcVT, DemandedSrcElts,
1262                                               /*Insert*/ false,
1263                                               /*Extract*/ true, CostKind);
1264     Cost += thisT()->getScalarizationOverhead(ReplicatedVT, DemandedDstElts,
1265                                               /*Insert*/ true,
1266                                               /*Extract*/ false, CostKind);
1267 
1268     return Cost;
1269   }
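
  // Rough worked example (assuming unit insert/extract costs): for the
  // factor-3, VF=8 case in the comment above with all destination elements
  // demanded, this models 8 extracts from the <8 x i1> mask plus 24 inserts
  // into the <24 x i1> replicated mask, i.e. a cost of 32.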
1270 
1271   InstructionCost
1272   getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,
1273                   unsigned AddressSpace, TTI::TargetCostKind CostKind,
1274                   TTI::OperandValueInfo OpInfo = {TTI::OK_AnyValue, TTI::OP_None},
1275                   const Instruction *I = nullptr) {
1276     assert(!Src->isVoidTy() && "Invalid type");
1277     // Assume types, such as structs, are expensive.
1278     if (getTLI()->getValueType(DL, Src,  true) == MVT::Other)
1279       return 4;
1280     std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
1281 
1282     // Assume that all loads of legal types cost 1.
1283     InstructionCost Cost = LT.first;
1284     if (CostKind != TTI::TCK_RecipThroughput)
1285       return Cost;
1286 
1287     const DataLayout &DL = this->getDataLayout();
1288     if (Src->isVectorTy() &&
1289         // In practice it's not currently possible to have a change in lane
1290         // length for extending loads or truncating stores so both types should
1291         // have the same scalable property.
1292         TypeSize::isKnownLT(DL.getTypeStoreSizeInBits(Src),
1293                             LT.second.getSizeInBits())) {
1294       // This is a vector load that legalizes to a larger type than the vector
1295       // itself. Unless the corresponding extending load or truncating store is
1296       // legal, this will scalarize.
1297       TargetLowering::LegalizeAction LA = TargetLowering::Expand;
1298       EVT MemVT = getTLI()->getValueType(DL, Src);
1299       if (Opcode == Instruction::Store)
1300         LA = getTLI()->getTruncStoreAction(LT.second, MemVT);
1301       else
1302         LA = getTLI()->getLoadExtAction(ISD::EXTLOAD, LT.second, MemVT);
1303 
1304       if (LA != TargetLowering::Legal && LA != TargetLowering::Custom) {
1305         // This is a vector load/store for some illegal type that is scalarized.
1306         // We must account for the cost of building or decomposing the vector.
1307         Cost += getScalarizationOverhead(
1308             cast<VectorType>(Src), Opcode != Instruction::Store,
1309             Opcode == Instruction::Store, CostKind);
1310       }
1311     }
1312 
1313     return Cost;
1314   }
1315 
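  /// Estimate the cost of a masked load or store. The base implementation
  /// simply defers to the common masked-memory helper, conservatively treating
  /// the mask as variable and the access as contiguous (non-gather/scatter).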
1316   InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy,
1317                                         Align Alignment, unsigned AddressSpace,
1318                                         TTI::TargetCostKind CostKind) {
1319     return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, true, false,
1320                                        CostKind);
1321   }
1322 
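  /// Estimate the cost of a gather load or scatter store. The base
  /// implementation ignores \p Ptr and \p I and defers to the common
  /// masked-memory helper with gather/scatter addressing.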
1323   InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
1324                                          const Value *Ptr, bool VariableMask,
1325                                          Align Alignment,
1326                                          TTI::TargetCostKind CostKind,
1327                                          const Instruction *I = nullptr) {
1328     return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, VariableMask,
1329                                        true, CostKind);
1330   }
1331 
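  /// Estimate the cost of an interleaved access group with \p Factor members,
  /// of which only the members listed in \p Indices are used. The estimate is
  /// built from (1) the underlying (possibly masked) memory op, scaled down by
  /// the fraction of legalized instructions actually used, (2) the
  /// interleave/de-interleave shuffles, modeled as scalarization overhead, and
  /// (3) when masking is used, the cost of replicating and combining the masks.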
1332   InstructionCost getInterleavedMemoryOpCost(
1333       unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1334       Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1335       bool UseMaskForCond = false, bool UseMaskForGaps = false) {
1336 
1337     // We cannot scalarize scalable vectors, so return Invalid.
1338     if (isa<ScalableVectorType>(VecTy))
1339       return InstructionCost::getInvalid();
1340 
1341     auto *VT = cast<FixedVectorType>(VecTy);
1342 
1343     unsigned NumElts = VT->getNumElements();
1344     assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");
1345 
1346     unsigned NumSubElts = NumElts / Factor;
1347     auto *SubVT = FixedVectorType::get(VT->getElementType(), NumSubElts);
1348 
1349     // First, the cost of the load/store operation itself.
1350     InstructionCost Cost;
1351     if (UseMaskForCond || UseMaskForGaps)
1352       Cost = thisT()->getMaskedMemoryOpCost(Opcode, VecTy, Alignment,
1353                                             AddressSpace, CostKind);
1354     else
1355       Cost = thisT()->getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace,
1356                                       CostKind);
1357 
1358     // Legalize the vector type, and get the legalized and unlegalized type
1359     // sizes.
1360     MVT VecTyLT = getTypeLegalizationCost(VecTy).second;
1361     unsigned VecTySize = thisT()->getDataLayout().getTypeStoreSize(VecTy);
1362     unsigned VecTyLTSize = VecTyLT.getStoreSize();
1363 
1364     // Scale the cost of the memory operation by the fraction of legalized
1365     // instructions that will actually be used. We shouldn't account for the
1366     // cost of dead instructions since they will be removed.
1367     //
1368     // E.g., An interleaved load of factor 8:
1369     //       %vec = load <16 x i64>, <16 x i64>* %ptr
1370     //       %v0 = shufflevector %vec, undef, <0, 8>
1371     //
1372     // If <16 x i64> is legalized to 8 v2i64 loads, only 2 of the loads will be
1373     // used (those corresponding to elements [0:1] and [8:9] of the unlegalized
1374     // type). The other loads are unused.
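    //
    // In that example only 2 of the 8 legalized loads are used, so below
    // UsedInsts.count() = 2 and NumLegalInsts = 8, and a memory op cost of 8
    // would be scaled down to ceil(2 * 8 / 8) = 2.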
1375     //
1376     // TODO: Note that legalization can turn masked loads/stores into unmasked
1377     // (legalized) loads/stores. This can be reflected in the cost.
1378     if (Cost.isValid() && VecTySize > VecTyLTSize) {
1379       // The number of loads of a legal type it will take to represent a load
1380       // of the unlegalized vector type.
1381       unsigned NumLegalInsts = divideCeil(VecTySize, VecTyLTSize);
1382 
1383       // The number of elements of the unlegalized type that correspond to a
1384       // single legal instruction.
1385       unsigned NumEltsPerLegalInst = divideCeil(NumElts, NumLegalInsts);
1386 
1387       // Determine which legal instructions will be used.
1388       BitVector UsedInsts(NumLegalInsts, false);
1389       for (unsigned Index : Indices)
1390         for (unsigned Elt = 0; Elt < NumSubElts; ++Elt)
1391           UsedInsts.set((Index + Elt * Factor) / NumEltsPerLegalInst);
1392 
1393       // Scale the cost of the load by the fraction of legal instructions that
1394       // will be used.
1395       Cost = divideCeil(UsedInsts.count() * *Cost.getValue(), NumLegalInsts);
1396     }
1397 
1398     // Next, add the cost of the interleave operation.
1399     assert(Indices.size() <= Factor &&
1400            "Interleaved memory op has too many members");
1401 
1402     const APInt DemandedAllSubElts = APInt::getAllOnes(NumSubElts);
1403     const APInt DemandedAllResultElts = APInt::getAllOnes(NumElts);
1404 
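    // DemandedLoadStoreElts marks the lanes of the wide vector that are actually
    // accessed for the requested member indices, e.g. Factor = 2, NumSubElts = 4
    // and Indices = {0} demands lanes {0, 2, 4, 6}.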
1405     APInt DemandedLoadStoreElts = APInt::getZero(NumElts);
1406     for (unsigned Index : Indices) {
1407       assert(Index < Factor && "Invalid index for interleaved memory op");
1408       for (unsigned Elm = 0; Elm < NumSubElts; Elm++)
1409         DemandedLoadStoreElts.setBit(Index + Elm * Factor);
1410     }
1411 
1412     if (Opcode == Instruction::Load) {
1413       // The interleave cost is similar to extracting the sub-vectors' elements
1414       // from the wide vector and inserting them into the sub-vectors.
1415       //
1416       // E.g. An interleaved load of factor 2 (with one member of index 0):
1417       //      %vec = load <8 x i32>, <8 x i32>* %ptr
1418       //      %v0 = shuffle %vec, undef, <0, 2, 4, 6>         ; Index 0
1419       // The cost is estimated as extracting elements 0, 2, 4 and 6 from the
1420       // <8 x i32> vector and inserting them into a <4 x i32> vector.
1421       InstructionCost InsSubCost = thisT()->getScalarizationOverhead(
1422           SubVT, DemandedAllSubElts,
1423           /*Insert*/ true, /*Extract*/ false, CostKind);
1424       Cost += Indices.size() * InsSubCost;
1425       Cost += thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts,
1426                                                 /*Insert*/ false,
1427                                                 /*Extract*/ true, CostKind);
1428     } else {
1429       // The interleave cost is that of extracting elements from the sub-vectors
1430       // and inserting them into the wide vector.
1431       //
1432       // E.g. An interleaved store of factor 3 with 2 members at indices 0,1:
1433       // (using VF=4):
1434       //    %v0_v1 = shuffle %v0, %v1, <0,4,undef,1,5,undef,2,6,undef,3,7,undef>
1435       //    %gaps.mask = <true, true, false, true, true, false,
1436       //                  true, true, false, true, true, false>
1437       //    call llvm.masked.store <12 x i32> %v0_v1, <12 x i32>* %ptr,
1438       //                           i32 Align, <12 x i1> %gaps.mask
1439       // The cost is estimated as extracting all elements (of the actual members,
1440       // excluding gaps) from both <4 x i32> vectors and inserting them into the
1441       // <12 x i32> vector.
1442       InstructionCost ExtSubCost = thisT()->getScalarizationOverhead(
1443           SubVT, DemandedAllSubElts,
1444           /*Insert*/ false, /*Extract*/ true, CostKind);
1445       Cost += ExtSubCost * Indices.size();
1446       Cost += thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts,
1447                                                 /*Insert*/ true,
1448                                                 /*Extract*/ false, CostKind);
1449     }
1450 
1451     if (!UseMaskForCond)
1452       return Cost;
1453 
1454     Type *I8Type = Type::getInt8Ty(VT->getContext());
1455 
1456     Cost += thisT()->getReplicationShuffleCost(
1457         I8Type, Factor, NumSubElts,
1458         UseMaskForGaps ? DemandedLoadStoreElts : DemandedAllResultElts,
1459         CostKind);
1460 
1461     // The Gaps mask is invariant and created outside the loop, therefore the
1462     // cost of creating it is not accounted for here. However if we have both
1463     // a MaskForGaps and some other mask that guards the execution of the
1464     // memory access, we need to account for the cost of And-ing the two masks
1465     // inside the loop.
1466     if (UseMaskForGaps) {
1467       auto *MaskVT = FixedVectorType::get(I8Type, NumElts);
1468       Cost += thisT()->getArithmeticInstrCost(BinaryOperator::And, MaskVT,
1469                                               CostKind);
1470     }
1471 
1472     return Cost;
1473   }
1474 
1475   /// Get intrinsic cost based on arguments.
1476   InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1477                                         TTI::TargetCostKind CostKind) {
1478     // Check for generically free intrinsics.
1479     if (BaseT::getIntrinsicInstrCost(ICA, CostKind) == 0)
1480       return 0;
1481 
1482     // Assume that target intrinsics are cheap.
1483     Intrinsic::ID IID = ICA.getID();
1484     if (Function::isTargetIntrinsic(IID))
1485       return TargetTransformInfo::TCC_Basic;
1486 
1487     if (ICA.isTypeBasedOnly())
1488       return getTypeBasedIntrinsicInstrCost(ICA, CostKind);
1489 
1490     Type *RetTy = ICA.getReturnType();
1491 
1492     ElementCount RetVF =
1493         (RetTy->isVectorTy() ? cast<VectorType>(RetTy)->getElementCount()
1494                              : ElementCount::getFixed(1));
1495     const IntrinsicInst *I = ICA.getInst();
1496     const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
1497     FastMathFlags FMF = ICA.getFlags();
1498     switch (IID) {
1499     default:
1500       break;
1501 
1502     case Intrinsic::powi:
1503       if (auto *RHSC = dyn_cast<ConstantInt>(Args[1])) {
1504         bool ShouldOptForSize = I->getParent()->getParent()->hasOptSize();
1505         if (getTLI()->isBeneficialToExpandPowI(RHSC->getSExtValue(),
1506                                                ShouldOptForSize)) {
1507           // The cost is modeled on the expansion performed by ExpandPowI in
1508           // SelectionDAGBuilder.
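          // E.g. for powi(x, 11): 11 = 0b1011, so ActiveBits = 4 and PopCount = 3,
          // giving 4 + 3 - 2 = 5 fmuls (3 squarings plus 2 extra multiplies).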
1509           APInt Exponent = RHSC->getValue().abs();
1510           unsigned ActiveBits = Exponent.getActiveBits();
1511           unsigned PopCount = Exponent.countPopulation();
1512           InstructionCost Cost = (ActiveBits + PopCount - 2) *
1513                                  thisT()->getArithmeticInstrCost(
1514                                      Instruction::FMul, RetTy, CostKind);
1515           if (RHSC->getSExtValue() < 0)
1516             Cost += thisT()->getArithmeticInstrCost(Instruction::FDiv, RetTy,
1517                                                     CostKind);
1518           return Cost;
1519         }
1520       }
1521       break;
1522     case Intrinsic::cttz:
1523       // FIXME: If necessary, this should go in target-specific overrides.
1524       if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCttz(RetTy))
1525         return TargetTransformInfo::TCC_Basic;
1526       break;
1527 
1528     case Intrinsic::ctlz:
1529       // FIXME: If necessary, this should go in target-specific overrides.
1530       if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCtlz(RetTy))
1531         return TargetTransformInfo::TCC_Basic;
1532       break;
1533 
1534     case Intrinsic::memcpy:
1535       return thisT()->getMemcpyCost(ICA.getInst());
1536 
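    // The operand order below follows the intrinsic signatures:
    //   llvm.masked.scatter(value, ptrs, alignment, mask)
    //   llvm.masked.gather(ptrs, alignment, mask, passthru)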
1537     case Intrinsic::masked_scatter: {
1538       const Value *Mask = Args[3];
1539       bool VarMask = !isa<Constant>(Mask);
1540       Align Alignment = cast<ConstantInt>(Args[2])->getAlignValue();
1541       return thisT()->getGatherScatterOpCost(Instruction::Store,
1542                                              ICA.getArgTypes()[0], Args[1],
1543                                              VarMask, Alignment, CostKind, I);
1544     }
1545     case Intrinsic::masked_gather: {
1546       const Value *Mask = Args[2];
1547       bool VarMask = !isa<Constant>(Mask);
1548       Align Alignment = cast<ConstantInt>(Args[1])->getAlignValue();
1549       return thisT()->getGatherScatterOpCost(Instruction::Load, RetTy, Args[0],
1550                                              VarMask, Alignment, CostKind, I);
1551     }
1552     case Intrinsic::experimental_stepvector: {
1553       if (isa<ScalableVectorType>(RetTy))
1554         return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1555       // The cost of materialising a constant integer vector.
1556       return TargetTransformInfo::TCC_Basic;
1557     }
1558     case Intrinsic::vector_extract: {
1559       // FIXME: Handle case where a scalable vector is extracted from a scalable
1560       // vector
1561       if (isa<ScalableVectorType>(RetTy))
1562         return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1563       unsigned Index = cast<ConstantInt>(Args[1])->getZExtValue();
1564       return thisT()->getShuffleCost(
1565           TTI::SK_ExtractSubvector, cast<VectorType>(Args[0]->getType()),
1566           std::nullopt, CostKind, Index, cast<VectorType>(RetTy));
1567     }
1568     case Intrinsic::vector_insert: {
1569       // FIXME: Handle case where a scalable vector is inserted into a scalable
1570       // vector
1571       if (isa<ScalableVectorType>(Args[1]->getType()))
1572         return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1573       unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue();
1574       return thisT()->getShuffleCost(
1575           TTI::SK_InsertSubvector, cast<VectorType>(Args[0]->getType()),
1576           std::nullopt, CostKind, Index, cast<VectorType>(Args[1]->getType()));
1577     }
1578     case Intrinsic::experimental_vector_reverse: {
1579       return thisT()->getShuffleCost(
1580           TTI::SK_Reverse, cast<VectorType>(Args[0]->getType()), std::nullopt,
1581           CostKind, 0, cast<VectorType>(RetTy));
1582     }
1583     case Intrinsic::experimental_vector_splice: {
1584       unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue();
1585       return thisT()->getShuffleCost(
1586           TTI::SK_Splice, cast<VectorType>(Args[0]->getType()), std::nullopt,
1587           CostKind, Index, cast<VectorType>(RetTy));
1588     }
1589     case Intrinsic::vector_reduce_add:
1590     case Intrinsic::vector_reduce_mul:
1591     case Intrinsic::vector_reduce_and:
1592     case Intrinsic::vector_reduce_or:
1593     case Intrinsic::vector_reduce_xor:
1594     case Intrinsic::vector_reduce_smax:
1595     case Intrinsic::vector_reduce_smin:
1596     case Intrinsic::vector_reduce_fmax:
1597     case Intrinsic::vector_reduce_fmin:
1598     case Intrinsic::vector_reduce_umax:
1599     case Intrinsic::vector_reduce_umin: {
1600       IntrinsicCostAttributes Attrs(IID, RetTy, Args[0]->getType(), FMF, I, 1);
1601       return getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1602     }
1603     case Intrinsic::vector_reduce_fadd:
1604     case Intrinsic::vector_reduce_fmul: {
1605       IntrinsicCostAttributes Attrs(
1606           IID, RetTy, {Args[0]->getType(), Args[1]->getType()}, FMF, I, 1);
1607       return getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1608     }
1609     case Intrinsic::fshl:
1610     case Intrinsic::fshr: {
1611       const Value *X = Args[0];
1612       const Value *Y = Args[1];
1613       const Value *Z = Args[2];
1614       const TTI::OperandValueInfo OpInfoX = TTI::getOperandInfo(X);
1615       const TTI::OperandValueInfo OpInfoY = TTI::getOperandInfo(Y);
1616       const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(Z);
1617       const TTI::OperandValueInfo OpInfoBW =
1618         {TTI::OK_UniformConstantValue,
1619          isPowerOf2_32(RetTy->getScalarSizeInBits()) ? TTI::OP_PowerOf2
1620          : TTI::OP_None};
1621 
1622       // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
1623       // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
1624       InstructionCost Cost = 0;
1625       Cost +=
1626           thisT()->getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind);
1627       Cost +=
1628           thisT()->getArithmeticInstrCost(BinaryOperator::Sub, RetTy, CostKind);
1629       Cost += thisT()->getArithmeticInstrCost(
1630           BinaryOperator::Shl, RetTy, CostKind, OpInfoX,
1631           {OpInfoZ.Kind, TTI::OP_None});
1632       Cost += thisT()->getArithmeticInstrCost(
1633           BinaryOperator::LShr, RetTy, CostKind, OpInfoY,
1634           {OpInfoZ.Kind, TTI::OP_None});
1635       // Non-constant shift amounts require a modulo.
1636       if (!OpInfoZ.isConstant())
1637         Cost += thisT()->getArithmeticInstrCost(BinaryOperator::URem, RetTy,
1638                                                 CostKind, OpInfoZ, OpInfoBW);
1639       // For non-rotates (X != Y) we must add shift-by-zero handling costs.
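      // (A shift amount of zero would make BW - (Z % BW) an out-of-range shift,
      // so the expansion needs an extra compare against zero and a select of the
      // unshifted operand.)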
1640       if (X != Y) {
1641         Type *CondTy = RetTy->getWithNewBitWidth(1);
1642         Cost +=
1643             thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
1644                                         CmpInst::ICMP_EQ, CostKind);
1645         Cost +=
1646             thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1647                                         CmpInst::ICMP_EQ, CostKind);
1648       }
1649       return Cost;
1650     }
1651     case Intrinsic::get_active_lane_mask: {
1652       EVT ResVT = getTLI()->getValueType(DL, RetTy, true);
1653       EVT ArgType = getTLI()->getValueType(DL, ICA.getArgTypes()[0], true);
1654 
1655       // If we're not expanding the intrinsic then we assume this is cheap
1656       // to implement.
1657       if (!getTLI()->shouldExpandGetActiveLaneMask(ResVT, ArgType)) {
1658         return getTypeLegalizationCost(RetTy).first;
1659       }
1660 
1661       // Create the expanded types that will be used to calculate the uadd_sat
1662       // operation.
1663       Type *ExpRetTy = VectorType::get(
1664           ICA.getArgTypes()[0], cast<VectorType>(RetTy)->getElementCount());
1665       IntrinsicCostAttributes Attrs(Intrinsic::uadd_sat, ExpRetTy, {}, FMF);
1666       InstructionCost Cost =
1667           thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1668       Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, ExpRetTy, RetTy,
1669                                           CmpInst::ICMP_ULT, CostKind);
1670       return Cost;
1671     }
1672     }
1673 
1674     // Assume that we need to scalarize this intrinsic.
1675     // Compute the scalarization overhead based on Args for a vector
1676     // intrinsic.
1677     InstructionCost ScalarizationCost = InstructionCost::getInvalid();
1678     if (RetVF.isVector() && !RetVF.isScalable()) {
1679       ScalarizationCost = 0;
1680       if (!RetTy->isVoidTy())
1681         ScalarizationCost += getScalarizationOverhead(
1682             cast<VectorType>(RetTy),
1683             /*Insert*/ true, /*Extract*/ false, CostKind);
1684       ScalarizationCost +=
1685           getOperandsScalarizationOverhead(Args, ICA.getArgTypes(), CostKind);
1686     }
1687 
1688     IntrinsicCostAttributes Attrs(IID, RetTy, ICA.getArgTypes(), FMF, I,
1689                                   ScalarizationCost);
1690     return thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1691   }
1692 
1693   /// Get intrinsic cost based on argument types.
1694   /// If ScalarizationCostPassed is std::numeric_limits<unsigned>::max(), the
1695   /// cost of scalarizing the arguments and the return value will be computed
1696   /// based on types.
1697   InstructionCost
1698   getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1699                                  TTI::TargetCostKind CostKind) {
1700     Intrinsic::ID IID = ICA.getID();
1701     Type *RetTy = ICA.getReturnType();
1702     const SmallVectorImpl<Type *> &Tys = ICA.getArgTypes();
1703     FastMathFlags FMF = ICA.getFlags();
1704     InstructionCost ScalarizationCostPassed = ICA.getScalarizationCost();
1705     bool SkipScalarizationCost = ICA.skipScalarizationCost();
1706 
1707     VectorType *VecOpTy = nullptr;
1708     if (!Tys.empty()) {
1709       // The vector reduction operand is operand 0 except for fadd/fmul.
1710       // Their operand 0 is a scalar start value, so the vector op is operand 1.
1711       unsigned VecTyIndex = 0;
1712       if (IID == Intrinsic::vector_reduce_fadd ||
1713           IID == Intrinsic::vector_reduce_fmul)
1714         VecTyIndex = 1;
1715       assert(Tys.size() > VecTyIndex && "Unexpected IntrinsicCostAttributes");
1716       VecOpTy = dyn_cast<VectorType>(Tys[VecTyIndex]);
1717     }
1718 
1719     // Library call cost - other than size, make it expensive.
1720     unsigned SingleCallCost = CostKind == TTI::TCK_CodeSize ? 1 : 10;
1721     unsigned ISD = 0;
1722     switch (IID) {
1723     default: {
1724       // Scalable vectors cannot be scalarized, so return Invalid.
1725       if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) {
1726             return isa<ScalableVectorType>(Ty);
1727           }))
1728         return InstructionCost::getInvalid();
1729 
1730       // Assume that we need to scalarize this intrinsic.
1731       InstructionCost ScalarizationCost =
1732           SkipScalarizationCost ? ScalarizationCostPassed : 0;
1733       unsigned ScalarCalls = 1;
1734       Type *ScalarRetTy = RetTy;
1735       if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
1736         if (!SkipScalarizationCost)
1737           ScalarizationCost = getScalarizationOverhead(
1738               RetVTy, /*Insert*/ true, /*Extract*/ false, CostKind);
1739         ScalarCalls = std::max(ScalarCalls,
1740                                cast<FixedVectorType>(RetVTy)->getNumElements());
1741         ScalarRetTy = RetTy->getScalarType();
1742       }
1743       SmallVector<Type *, 4> ScalarTys;
1744       for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
1745         Type *Ty = Tys[i];
1746         if (auto *VTy = dyn_cast<VectorType>(Ty)) {
1747           if (!SkipScalarizationCost)
1748             ScalarizationCost += getScalarizationOverhead(
1749                 VTy, /*Insert*/ false, /*Extract*/ true, CostKind);
1750           ScalarCalls = std::max(ScalarCalls,
1751                                  cast<FixedVectorType>(VTy)->getNumElements());
1752           Ty = Ty->getScalarType();
1753         }
1754         ScalarTys.push_back(Ty);
1755       }
1756       if (ScalarCalls == 1)
1757         return 1; // Return cost of a scalar intrinsic. Assume it to be cheap.
1758 
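      // Otherwise the estimate is ScalarCalls copies of the scalarized intrinsic
      // plus the insert/extract overhead of scalarizing the operands and result.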
1759       IntrinsicCostAttributes ScalarAttrs(IID, ScalarRetTy, ScalarTys, FMF);
1760       InstructionCost ScalarCost =
1761           thisT()->getIntrinsicInstrCost(ScalarAttrs, CostKind);
1762 
1763       return ScalarCalls * ScalarCost + ScalarizationCost;
1764     }
1765     // Look for intrinsics that can be lowered directly or turned into a scalar
1766     // intrinsic call.
1767     case Intrinsic::sqrt:
1768       ISD = ISD::FSQRT;
1769       break;
1770     case Intrinsic::sin:
1771       ISD = ISD::FSIN;
1772       break;
1773     case Intrinsic::cos:
1774       ISD = ISD::FCOS;
1775       break;
1776     case Intrinsic::exp:
1777       ISD = ISD::FEXP;
1778       break;
1779     case Intrinsic::exp2:
1780       ISD = ISD::FEXP2;
1781       break;
1782     case Intrinsic::log:
1783       ISD = ISD::FLOG;
1784       break;
1785     case Intrinsic::log10:
1786       ISD = ISD::FLOG10;
1787       break;
1788     case Intrinsic::log2:
1789       ISD = ISD::FLOG2;
1790       break;
1791     case Intrinsic::fabs:
1792       ISD = ISD::FABS;
1793       break;
1794     case Intrinsic::canonicalize:
1795       ISD = ISD::FCANONICALIZE;
1796       break;
1797     case Intrinsic::minnum:
1798       ISD = ISD::FMINNUM;
1799       break;
1800     case Intrinsic::maxnum:
1801       ISD = ISD::FMAXNUM;
1802       break;
1803     case Intrinsic::minimum:
1804       ISD = ISD::FMINIMUM;
1805       break;
1806     case Intrinsic::maximum:
1807       ISD = ISD::FMAXIMUM;
1808       break;
1809     case Intrinsic::copysign:
1810       ISD = ISD::FCOPYSIGN;
1811       break;
1812     case Intrinsic::floor:
1813       ISD = ISD::FFLOOR;
1814       break;
1815     case Intrinsic::ceil:
1816       ISD = ISD::FCEIL;
1817       break;
1818     case Intrinsic::trunc:
1819       ISD = ISD::FTRUNC;
1820       break;
1821     case Intrinsic::nearbyint:
1822       ISD = ISD::FNEARBYINT;
1823       break;
1824     case Intrinsic::rint:
1825       ISD = ISD::FRINT;
1826       break;
1827     case Intrinsic::round:
1828       ISD = ISD::FROUND;
1829       break;
1830     case Intrinsic::roundeven:
1831       ISD = ISD::FROUNDEVEN;
1832       break;
1833     case Intrinsic::pow:
1834       ISD = ISD::FPOW;
1835       break;
1836     case Intrinsic::fma:
1837       ISD = ISD::FMA;
1838       break;
1839     case Intrinsic::fmuladd:
1840       ISD = ISD::FMA;
1841       break;
1842     case Intrinsic::experimental_constrained_fmuladd:
1843       ISD = ISD::STRICT_FMA;
1844       break;
1845     // FIXME: We should return 0 whenever getIntrinsicCost == TCC_Free.
1846     case Intrinsic::lifetime_start:
1847     case Intrinsic::lifetime_end:
1848     case Intrinsic::sideeffect:
1849     case Intrinsic::pseudoprobe:
1850     case Intrinsic::arithmetic_fence:
1851       return 0;
1852     case Intrinsic::masked_store: {
1853       Type *Ty = Tys[0];
1854       Align TyAlign = thisT()->DL.getABITypeAlign(Ty);
1855       return thisT()->getMaskedMemoryOpCost(Instruction::Store, Ty, TyAlign, 0,
1856                                             CostKind);
1857     }
1858     case Intrinsic::masked_load: {
1859       Type *Ty = RetTy;
1860       Align TyAlign = thisT()->DL.getABITypeAlign(Ty);
1861       return thisT()->getMaskedMemoryOpCost(Instruction::Load, Ty, TyAlign, 0,
1862                                             CostKind);
1863     }
1864     case Intrinsic::vector_reduce_add:
1865       return thisT()->getArithmeticReductionCost(Instruction::Add, VecOpTy,
1866                                                  std::nullopt, CostKind);
1867     case Intrinsic::vector_reduce_mul:
1868       return thisT()->getArithmeticReductionCost(Instruction::Mul, VecOpTy,
1869                                                  std::nullopt, CostKind);
1870     case Intrinsic::vector_reduce_and:
1871       return thisT()->getArithmeticReductionCost(Instruction::And, VecOpTy,
1872                                                  std::nullopt, CostKind);
1873     case Intrinsic::vector_reduce_or:
1874       return thisT()->getArithmeticReductionCost(Instruction::Or, VecOpTy,
1875                                                  std::nullopt, CostKind);
1876     case Intrinsic::vector_reduce_xor:
1877       return thisT()->getArithmeticReductionCost(Instruction::Xor, VecOpTy,
1878                                                  std::nullopt, CostKind);
1879     case Intrinsic::vector_reduce_fadd:
1880       return thisT()->getArithmeticReductionCost(Instruction::FAdd, VecOpTy,
1881                                                  FMF, CostKind);
1882     case Intrinsic::vector_reduce_fmul:
1883       return thisT()->getArithmeticReductionCost(Instruction::FMul, VecOpTy,
1884                                                  FMF, CostKind);
1885     case Intrinsic::vector_reduce_smax:
1886     case Intrinsic::vector_reduce_smin:
1887     case Intrinsic::vector_reduce_fmax:
1888     case Intrinsic::vector_reduce_fmin:
1889       return thisT()->getMinMaxReductionCost(
1890           VecOpTy, cast<VectorType>(CmpInst::makeCmpResultType(VecOpTy)),
1891           /*IsUnsigned=*/false, CostKind);
1892     case Intrinsic::vector_reduce_umax:
1893     case Intrinsic::vector_reduce_umin:
1894       return thisT()->getMinMaxReductionCost(
1895           VecOpTy, cast<VectorType>(CmpInst::makeCmpResultType(VecOpTy)),
1896           /*IsUnsigned=*/true, CostKind);
1897     case Intrinsic::abs: {
1898       // abs(X) = select(icmp(X,0),X,sub(0,X))
1899       Type *CondTy = RetTy->getWithNewBitWidth(1);
1900       CmpInst::Predicate Pred = CmpInst::ICMP_SGT;
1901       InstructionCost Cost = 0;
1902       Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
1903                                           Pred, CostKind);
1904       Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1905                                           Pred, CostKind);
1906       // TODO: Should we add an OperandValueProperties::OP_Zero property?
1907       Cost += thisT()->getArithmeticInstrCost(
1908          BinaryOperator::Sub, RetTy, CostKind, {TTI::OK_UniformConstantValue, TTI::OP_None});
1909       return Cost;
1910     }
1911     case Intrinsic::smax:
1912     case Intrinsic::smin:
1913     case Intrinsic::umax:
1914     case Intrinsic::umin: {
1915       // minmax(X,Y) = select(icmp(X,Y),X,Y)
1916       Type *CondTy = RetTy->getWithNewBitWidth(1);
1917       bool IsUnsigned = IID == Intrinsic::umax || IID == Intrinsic::umin;
1918       CmpInst::Predicate Pred =
1919           IsUnsigned ? CmpInst::ICMP_UGT : CmpInst::ICMP_SGT;
1920       InstructionCost Cost = 0;
1921       Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
1922                                           Pred, CostKind);
1923       Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1924                                           Pred, CostKind);
1925       return Cost;
1926     }
1927     case Intrinsic::sadd_sat:
1928     case Intrinsic::ssub_sat: {
1929       Type *CondTy = RetTy->getWithNewBitWidth(1);
1930 
1931       Type *OpTy = StructType::create({RetTy, CondTy});
1932       Intrinsic::ID OverflowOp = IID == Intrinsic::sadd_sat
1933                                      ? Intrinsic::sadd_with_overflow
1934                                      : Intrinsic::ssub_with_overflow;
1935       CmpInst::Predicate Pred = CmpInst::ICMP_SGT;
1936 
1937       // SatMax -> Overflow && SumDiff < 0
1938       // SatMin -> Overflow && SumDiff >= 0
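      // i.e. one select picks between SatMax and SatMin based on the sign of
      // SumDiff, and a second selects that constant or SumDiff based on Overflow.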
1939       InstructionCost Cost = 0;
1940       IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
1941                                     nullptr, ScalarizationCostPassed);
1942       Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
1943       Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
1944                                           Pred, CostKind);
1945       Cost += 2 * thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy,
1946                                               CondTy, Pred, CostKind);
1947       return Cost;
1948     }
1949     case Intrinsic::uadd_sat:
1950     case Intrinsic::usub_sat: {
1951       Type *CondTy = RetTy->getWithNewBitWidth(1);
1952 
1953       Type *OpTy = StructType::create({RetTy, CondTy});
1954       Intrinsic::ID OverflowOp = IID == Intrinsic::uadd_sat
1955                                      ? Intrinsic::uadd_with_overflow
1956                                      : Intrinsic::usub_with_overflow;
1957 
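      // Unsigned saturation needs only a single select on the overflow flag:
      // all-ones for uadd.sat, zero for usub.sat.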
1958       InstructionCost Cost = 0;
1959       IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
1960                                     nullptr, ScalarizationCostPassed);
1961       Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
1962       Cost +=
1963           thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1964                                       CmpInst::BAD_ICMP_PREDICATE, CostKind);
1965       return Cost;
1966     }
1967     case Intrinsic::smul_fix:
1968     case Intrinsic::umul_fix: {
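      // Fixed-point multiply is modeled as widening both operands, multiplying
      // in the wider type, and recombining the high and low halves of the result
      // with truncates, a shift pair and an or.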
1969       unsigned ExtSize = RetTy->getScalarSizeInBits() * 2;
1970       Type *ExtTy = RetTy->getWithNewBitWidth(ExtSize);
1971 
1972       unsigned ExtOp =
1973           IID == Intrinsic::smul_fix ? Instruction::SExt : Instruction::ZExt;
1974       TTI::CastContextHint CCH = TTI::CastContextHint::None;
1975 
1976       InstructionCost Cost = 0;
1977       Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, RetTy, CCH, CostKind);
1978       Cost +=
1979           thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
1980       Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, RetTy, ExtTy,
1981                                             CCH, CostKind);
1982       Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, RetTy,
1983                                               CostKind,
1984                                               {TTI::OK_AnyValue, TTI::OP_None},
1985                                               {TTI::OK_UniformConstantValue, TTI::OP_None});
1986       Cost += thisT()->getArithmeticInstrCost(Instruction::Shl, RetTy, CostKind,
1987                                               {TTI::OK_AnyValue, TTI::OP_None},
1988                                               {TTI::OK_UniformConstantValue, TTI::OP_None});
1989       Cost += thisT()->getArithmeticInstrCost(Instruction::Or, RetTy, CostKind);
1990       return Cost;
1991     }
1992     case Intrinsic::sadd_with_overflow:
1993     case Intrinsic::ssub_with_overflow: {
1994       Type *SumTy = RetTy->getContainedType(0);
1995       Type *OverflowTy = RetTy->getContainedType(1);
1996       unsigned Opcode = IID == Intrinsic::sadd_with_overflow
1997                             ? BinaryOperator::Add
1998                             : BinaryOperator::Sub;
1999 
2000       //   Add:
2001       //   Overflow -> (Result < LHS) ^ (RHS < 0)
2002       //   Sub:
2003       //   Overflow -> (Result < LHS) ^ (RHS > 0)
2004       InstructionCost Cost = 0;
2005       Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind);
2006       Cost += 2 * thisT()->getCmpSelInstrCost(
2007                       Instruction::ICmp, SumTy, OverflowTy,
2008                       CmpInst::ICMP_SGT, CostKind);
2009       Cost += thisT()->getArithmeticInstrCost(BinaryOperator::Xor, OverflowTy,
2010                                               CostKind);
2011       return Cost;
2012     }
2013     case Intrinsic::uadd_with_overflow:
2014     case Intrinsic::usub_with_overflow: {
2015       Type *SumTy = RetTy->getContainedType(0);
2016       Type *OverflowTy = RetTy->getContainedType(1);
2017       unsigned Opcode = IID == Intrinsic::uadd_with_overflow
2018                             ? BinaryOperator::Add
2019                             : BinaryOperator::Sub;
2020       CmpInst::Predicate Pred = IID == Intrinsic::uadd_with_overflow
2021                                     ? CmpInst::ICMP_ULT
2022                                     : CmpInst::ICMP_UGT;
2023 
2024       InstructionCost Cost = 0;
2025       Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind);
2026       Cost +=
2027           thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SumTy, OverflowTy,
2028                                       Pred, CostKind);
2029       return Cost;
2030     }
2031     case Intrinsic::smul_with_overflow:
2032     case Intrinsic::umul_with_overflow: {
2033       Type *MulTy = RetTy->getContainedType(0);
2034       Type *OverflowTy = RetTy->getContainedType(1);
2035       unsigned ExtSize = MulTy->getScalarSizeInBits() * 2;
2036       Type *ExtTy = MulTy->getWithNewBitWidth(ExtSize);
2037       bool IsSigned = IID == Intrinsic::smul_with_overflow;
2038 
2039       unsigned ExtOp = IsSigned ? Instruction::SExt : Instruction::ZExt;
2040       TTI::CastContextHint CCH = TTI::CastContextHint::None;
2041 
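      // Modeled as a widening multiply; overflow is detected by extracting the
      // high half of the product and comparing it against the expected extension
      // of the low half (all zeros, or its sign bits for the signed case).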
2042       InstructionCost Cost = 0;
2043       Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, MulTy, CCH, CostKind);
2044       Cost +=
2045           thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
2046       Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, MulTy, ExtTy,
2047                                             CCH, CostKind);
2048       Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, ExtTy,
2049                                               CostKind,
2050                                               {TTI::OK_AnyValue, TTI::OP_None},
2051                                               {TTI::OK_UniformConstantValue, TTI::OP_None});
2052 
2053       if (IsSigned)
2054         Cost += thisT()->getArithmeticInstrCost(Instruction::AShr, MulTy,
2055                                                 CostKind,
2056                                                 {TTI::OK_AnyValue, TTI::OP_None},
2057                                                 {TTI::OK_UniformConstantValue, TTI::OP_None});
2058 
2059       Cost += thisT()->getCmpSelInstrCost(
2060           BinaryOperator::ICmp, MulTy, OverflowTy, CmpInst::ICMP_NE, CostKind);
2061       return Cost;
2062     }
2063     case Intrinsic::fptosi_sat:
2064     case Intrinsic::fptoui_sat: {
2065       if (Tys.empty())
2066         break;
2067       Type *FromTy = Tys[0];
2068       bool IsSigned = IID == Intrinsic::fptosi_sat;
2069 
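      // Modeled as clamping the input with minnum/maxnum, converting with
      // fptosi/fptoui, and (for the signed case) an extra fcmp uno + select to
      // handle NaN inputs.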
2070       InstructionCost Cost = 0;
2071       IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FromTy,
2072                                      {FromTy, FromTy});
2073       Cost += thisT()->getIntrinsicInstrCost(Attrs1, CostKind);
2074       IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FromTy,
2075                                      {FromTy, FromTy});
2076       Cost += thisT()->getIntrinsicInstrCost(Attrs2, CostKind);
2077       Cost += thisT()->getCastInstrCost(
2078           IsSigned ? Instruction::FPToSI : Instruction::FPToUI, RetTy, FromTy,
2079           TTI::CastContextHint::None, CostKind);
2080       if (IsSigned) {
2081         Type *CondTy = RetTy->getWithNewBitWidth(1);
2082         Cost += thisT()->getCmpSelInstrCost(
2083             BinaryOperator::FCmp, FromTy, CondTy, CmpInst::FCMP_UNO, CostKind);
2084         Cost += thisT()->getCmpSelInstrCost(
2085             BinaryOperator::Select, RetTy, CondTy, CmpInst::FCMP_UNO, CostKind);
2086       }
2087       return Cost;
2088     }
2089     case Intrinsic::ctpop:
2090       ISD = ISD::CTPOP;
2091       // In case of legalization use TCC_Expensive. This is cheaper than a
2092       // library call but still not a cheap instruction.
2093       SingleCallCost = TargetTransformInfo::TCC_Expensive;
2094       break;
2095     case Intrinsic::ctlz:
2096       ISD = ISD::CTLZ;
2097       break;
2098     case Intrinsic::cttz:
2099       ISD = ISD::CTTZ;
2100       break;
2101     case Intrinsic::bswap:
2102       ISD = ISD::BSWAP;
2103       break;
2104     case Intrinsic::bitreverse:
2105       ISD = ISD::BITREVERSE;
2106       break;
2107     }
2108 
2109     const TargetLoweringBase *TLI = getTLI();
2110     std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);
2111 
2112     if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
2113       if (IID == Intrinsic::fabs && LT.second.isFloatingPoint() &&
2114           TLI->isFAbsFree(LT.second)) {
2115         return 0;
2116       }
2117 
2118       // The operation is legal. Assume it costs 1.
2119       // If the type is split to multiple registers, assume that there is some
2120       // overhead to this.
2121       // TODO: Once we have extract/insert subvector cost we need to use them.
2122       if (LT.first > 1)
2123         return (LT.first * 2);
2124       else
2125         return (LT.first * 1);
2126     } else if (!TLI->isOperationExpand(ISD, LT.second)) {
2127       // If the operation is custom lowered then assume
2128       // that the code is twice as expensive.
2129       return (LT.first * 2);
2130     }
2131 
2132     // If we can't lower fmuladd into an FMA estimate the cost as a floating
2133     // point mul followed by an add.
2134     if (IID == Intrinsic::fmuladd)
2135       return thisT()->getArithmeticInstrCost(BinaryOperator::FMul, RetTy,
2136                                              CostKind) +
2137              thisT()->getArithmeticInstrCost(BinaryOperator::FAdd, RetTy,
2138                                              CostKind);
2139     if (IID == Intrinsic::experimental_constrained_fmuladd) {
2140       IntrinsicCostAttributes FMulAttrs(
2141         Intrinsic::experimental_constrained_fmul, RetTy, Tys);
2142       IntrinsicCostAttributes FAddAttrs(
2143         Intrinsic::experimental_constrained_fadd, RetTy, Tys);
2144       return thisT()->getIntrinsicInstrCost(FMulAttrs, CostKind) +
2145              thisT()->getIntrinsicInstrCost(FAddAttrs, CostKind);
2146     }
2147 
2148     // Else, assume that we need to scalarize this intrinsic. For math builtins
2149     // this will emit a costly libcall, adding call overhead and spills. Make it
2150     // very expensive.
2151     if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
2152       // Scalable vectors cannot be scalarized, so return Invalid.
2153       if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) {
2154             return isa<ScalableVectorType>(Ty);
2155           }))
2156         return InstructionCost::getInvalid();
2157 
2158       InstructionCost ScalarizationCost =
2159           SkipScalarizationCost
2160               ? ScalarizationCostPassed
2161               : getScalarizationOverhead(RetVTy, /*Insert*/ true,
2162                                          /*Extract*/ false, CostKind);
2163 
2164       unsigned ScalarCalls = cast<FixedVectorType>(RetVTy)->getNumElements();
2165       SmallVector<Type *, 4> ScalarTys;
2166       for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
2167         Type *Ty = Tys[i];
2168         if (Ty->isVectorTy())
2169           Ty = Ty->getScalarType();
2170         ScalarTys.push_back(Ty);
2171       }
2172       IntrinsicCostAttributes Attrs(IID, RetTy->getScalarType(), ScalarTys, FMF);
2173       InstructionCost ScalarCost =
2174           thisT()->getIntrinsicInstrCost(Attrs, CostKind);
2175       for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
2176         if (auto *VTy = dyn_cast<VectorType>(Tys[i])) {
2177           if (!ICA.skipScalarizationCost())
2178             ScalarizationCost += getScalarizationOverhead(
2179                 VTy, /*Insert*/ false, /*Extract*/ true, CostKind);
2180           ScalarCalls = std::max(ScalarCalls,
2181                                  cast<FixedVectorType>(VTy)->getNumElements());
2182         }
2183       }
2184       return ScalarCalls * ScalarCost + ScalarizationCost;
2185     }
2186 
2187     // This is going to be turned into a library call, make it expensive.
2188     return SingleCallCost;
2189   }
2190 
2191   /// Compute a cost of the given call instruction.
2192   ///
2193   /// Compute the cost of calling function F with return type RetTy and
2194   /// argument types Tys. F might be nullptr, in this case the cost of an
2195   /// arbitrary call with the specified signature will be returned.
2196   /// This is used, for instance, when we estimate the cost of calling a vector
2197   /// counterpart of the given function.
2198   /// \param F Called function, might be nullptr.
2199   /// \param RetTy Return value type.
2200   /// \param Tys Argument types.
2201   /// \returns The cost of Call instruction.
2202   InstructionCost getCallInstrCost(Function *F, Type *RetTy,
2203                                    ArrayRef<Type *> Tys,
2204                                    TTI::TargetCostKind CostKind) {
2205     return 10;
2206   }
2207 
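  /// Return the number of parts the legalized form of \p Tp is split into,
  /// taken from the first component of the type-legalization cost (0 if that
  /// cost is invalid).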
2208   unsigned getNumberOfParts(Type *Tp) {
2209     std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
2210     return LT.first.isValid() ? *LT.first.getValue() : 0;
2211   }
2212 
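  /// By default address computations are assumed to be free, e.g. folded into
  /// the addressing mode of the memory access.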
2213   InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *,
2214                                             const SCEV *) {
2215     return 0;
2216   }
2217 
2218   /// Try to calculate arithmetic and shuffle op costs for reduction intrinsics.
2219   /// We're assuming that reduction operations are performed in the following way:
2220   ///
2221   /// %val1 = shufflevector<n x t> %val, <n x t> %undef,
2222   /// <n x i32> <i32 n/2, i32 n/2 + 1, ..., i32 n - 1, i32 undef, ..., i32 undef>
2223   ///            \----------------v-------------/  \----------v------------/
2224   ///                            n/2 elements               n/2 elements
2225   /// %red1 = op <n x t> %val, <n x t> %val1
2226   /// After this operation we have a vector %red1 where only the first n/2
2227   /// elements are meaningful, the second n/2 elements are undefined and can be
2228   /// dropped. All other operations are actually working with the vector of
2229   /// length n/2, not n, though the real vector length is still n.
2230   /// %val2 = shufflevector<n x t> %red1, <n x t> %undef,
2231   /// <n x i32> <i32 n/4, i32 n/4 + 1, ..., i32 n/2 - 1, i32 undef, ..., i32 undef>
2232   ///            \----------------v-------------/  \----------v------------/
2233   ///                            n/4 elements               3*n/4 elements
2234   /// %red2 = op <n x t> %red1, <n x t> %val2  - working with the vector of
2235   /// length n/2, the resulting vector has length n/4 etc.
2236   ///
2237   /// The cost model should take into account that the actual length of the
2238   /// vector is reduced on each iteration.
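  ///
  /// E.g. an 8-lane reduction on a target whose widest legal vector has 4 lanes
  /// is costed as one extract-subvector shuffle plus one 4-lane op to reach the
  /// legal width, then Log2(4) = 2 further shuffle + op levels at that width,
  /// plus a final extractelement of lane 0.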
2239   InstructionCost getTreeReductionCost(unsigned Opcode, VectorType *Ty,
2240                                        TTI::TargetCostKind CostKind) {
2241     // Targets must implement a default value for the scalable case, since
2242     // we don't know how many lanes the vector has.
2243     if (isa<ScalableVectorType>(Ty))
2244       return InstructionCost::getInvalid();
2245 
2246     Type *ScalarTy = Ty->getElementType();
2247     unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
2248     if ((Opcode == Instruction::Or || Opcode == Instruction::And) &&
2249         ScalarTy == IntegerType::getInt1Ty(Ty->getContext()) &&
2250         NumVecElts >= 2) {
2251       // Or reduction for i1 is represented as:
2252       // %val = bitcast <ReduxWidth x i1> to iReduxWidth
2253       // %res = cmp ne iReduxWidth %val, 0
2254       // And reduction for i1 is represented as:
2255       // %val = bitcast <ReduxWidth x i1> to iReduxWidth
2256       // %res = cmp eq iReduxWidth %val, 11111
2257       Type *ValTy = IntegerType::get(Ty->getContext(), NumVecElts);
2258       return thisT()->getCastInstrCost(Instruction::BitCast, ValTy, Ty,
2259                                        TTI::CastContextHint::None, CostKind) +
2260              thisT()->getCmpSelInstrCost(Instruction::ICmp, ValTy,
2261                                          CmpInst::makeCmpResultType(ValTy),
2262                                          CmpInst::BAD_ICMP_PREDICATE, CostKind);
2263     }
2264     unsigned NumReduxLevels = Log2_32(NumVecElts);
2265     InstructionCost ArithCost = 0;
2266     InstructionCost ShuffleCost = 0;
2267     std::pair<InstructionCost, MVT> LT = thisT()->getTypeLegalizationCost(Ty);
2268     unsigned LongVectorCount = 0;
2269     unsigned MVTLen =
2270         LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
2271     while (NumVecElts > MVTLen) {
2272       NumVecElts /= 2;
2273       VectorType *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
2274       ShuffleCost +=
2275           thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt,
2276                                   CostKind, NumVecElts, SubTy);
2277       ArithCost += thisT()->getArithmeticInstrCost(Opcode, SubTy, CostKind);
2278       Ty = SubTy;
2279       ++LongVectorCount;
2280     }
2281 
2282     NumReduxLevels -= LongVectorCount;
2283 
2284     // The minimal length of the vector is limited by the real length of vector
2285     // operations performed on the current platform. That's why several final
2286     // reduction operations are performed on the vectors with the same
2287     // architecture-dependent length.
2288 
2289     // By default reductions need one shuffle per reduction level.
2290     ShuffleCost +=
2291         NumReduxLevels * thisT()->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty,
2292                                                  std::nullopt, CostKind, 0, Ty);
2293     ArithCost +=
2294         NumReduxLevels * thisT()->getArithmeticInstrCost(Opcode, Ty, CostKind);
2295     return ShuffleCost + ArithCost +
2296            thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty,
2297                                        CostKind, 0, nullptr, nullptr);
2298   }
2299 
2300   /// Try to calculate the cost of performing strict (in-order) reductions,
2301   /// which involves doing a sequence of floating point additions in lane
2302   /// order, starting with an initial value. For example, consider a scalar
2303   /// initial value 'InitVal' of type float and a vector of type <4 x float>:
2304   ///
2305   ///   Vector = <float %v0, float %v1, float %v2, float %v3>
2306   ///
2307   ///   %add1 = %InitVal + %v0
2308   ///   %add2 = %add1 + %v1
2309   ///   %add3 = %add2 + %v2
2310   ///   %add4 = %add3 + %v3
2311   ///
2312   /// As a simple estimate we can say the cost of such a reduction is 4 times
2313   /// the cost of a scalar FP addition. We can only estimate the costs for
2314   /// fixed-width vectors here because for scalable vectors we do not know the
2315   /// runtime number of operations.
2316   InstructionCost getOrderedReductionCost(unsigned Opcode, VectorType *Ty,
2317                                           TTI::TargetCostKind CostKind) {
2318     // Targets must implement a default value for the scalable case, since
2319     // we don't know how many lanes the vector has.
2320     if (isa<ScalableVectorType>(Ty))
2321       return InstructionCost::getInvalid();
2322 
2323     auto *VTy = cast<FixedVectorType>(Ty);
2324     InstructionCost ExtractCost = getScalarizationOverhead(
2325         VTy, /*Insert=*/false, /*Extract=*/true, CostKind);
2326     InstructionCost ArithCost = thisT()->getArithmeticInstrCost(
2327         Opcode, VTy->getElementType(), CostKind);
2328     ArithCost *= VTy->getNumElements();
2329 
2330     return ExtractCost + ArithCost;
2331   }
2332 
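  /// Dispatch between the strict, in-order reduction cost and the tree-wise
  /// reduction cost, depending on whether the fast-math flags allow
  /// reassociation.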
2333   InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
2334                                              std::optional<FastMathFlags> FMF,
2335                                              TTI::TargetCostKind CostKind) {
2336     if (TTI::requiresOrderedReduction(FMF))
2337       return getOrderedReductionCost(Opcode, Ty, CostKind);
2338     return getTreeReductionCost(Opcode, Ty, CostKind);
2339   }
2340 
2341   /// Try to calculate op costs for min/max reduction operations.
2342   /// \param CondTy Conditional type for the Select instruction.
2343   InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
2344                                          bool IsUnsigned,
2345                                          TTI::TargetCostKind CostKind) {
2346     // Targets must implement a default value for the scalable case, since
2347     // we don't know how many lanes the vector has.
2348     if (isa<ScalableVectorType>(Ty))
2349       return InstructionCost::getInvalid();
2350 
2351     Type *ScalarTy = Ty->getElementType();
2352     Type *ScalarCondTy = CondTy->getElementType();
2353     unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
2354     unsigned NumReduxLevels = Log2_32(NumVecElts);
2355     unsigned CmpOpcode;
2356     if (Ty->isFPOrFPVectorTy()) {
2357       CmpOpcode = Instruction::FCmp;
2358     } else {
2359       assert(Ty->isIntOrIntVectorTy() &&
2360              "expecting floating point or integer type for min/max reduction");
2361       CmpOpcode = Instruction::ICmp;
2362     }
2363     InstructionCost MinMaxCost = 0;
2364     InstructionCost ShuffleCost = 0;
2365     std::pair<InstructionCost, MVT> LT = thisT()->getTypeLegalizationCost(Ty);
2366     unsigned LongVectorCount = 0;
2367     unsigned MVTLen =
2368         LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
2369     while (NumVecElts > MVTLen) {
2370       NumVecElts /= 2;
2371       auto *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
2372       CondTy = FixedVectorType::get(ScalarCondTy, NumVecElts);
2373 
2374       ShuffleCost +=
2375           thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt,
2376                                   CostKind, NumVecElts, SubTy);
2377       MinMaxCost +=
2378           thisT()->getCmpSelInstrCost(CmpOpcode, SubTy, CondTy,
2379                                       CmpInst::BAD_ICMP_PREDICATE, CostKind) +
2380           thisT()->getCmpSelInstrCost(Instruction::Select, SubTy, CondTy,
2381                                       CmpInst::BAD_ICMP_PREDICATE, CostKind);
2382       Ty = SubTy;
2383       ++LongVectorCount;
2384     }
2385 
2386     NumReduxLevels -= LongVectorCount;
2387 
2388     // The minimal length of the vector is limited by the real length of vector
2389     // operations performed on the current platform. That's why several final
2390     // reduction operations are performed on the vectors with the same
2391     // architecture-dependent length.
2392     ShuffleCost +=
2393         NumReduxLevels * thisT()->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty,
2394                                                  std::nullopt, CostKind, 0, Ty);
2395     MinMaxCost +=
2396         NumReduxLevels *
2397         (thisT()->getCmpSelInstrCost(CmpOpcode, Ty, CondTy,
2398                                      CmpInst::BAD_ICMP_PREDICATE, CostKind) +
2399          thisT()->getCmpSelInstrCost(Instruction::Select, Ty, CondTy,
2400                                      CmpInst::BAD_ICMP_PREDICATE, CostKind));
2401     // The last min/max should be in vector registers and we counted it above.
2402     // So just need a single extractelement.
2403     return ShuffleCost + MinMaxCost +
2404            thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty,
2405                                        CostKind, 0, nullptr, nullptr);
2406   }
2407 
2408   InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned,
2409                                            Type *ResTy, VectorType *Ty,
2410                                            std::optional<FastMathFlags> FMF,
2411                                            TTI::TargetCostKind CostKind) {
2412     // Without any native support, this is equivalent to the cost of
2413     // vecreduce.opcode(ext(Ty A)).
2414     VectorType *ExtTy = VectorType::get(ResTy, Ty);
2415     InstructionCost RedCost =
2416         thisT()->getArithmeticReductionCost(Opcode, ExtTy, FMF, CostKind);
2417     InstructionCost ExtCost = thisT()->getCastInstrCost(
2418         IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty,
2419         TTI::CastContextHint::None, CostKind);
2420 
2421     return RedCost + ExtCost;
2422   }
2423 
2424   InstructionCost getMulAccReductionCost(bool IsUnsigned, Type *ResTy,
2425                                          VectorType *Ty,
2426                                          TTI::TargetCostKind CostKind) {
2427     // Without any native support, this is equivalent to the cost of
2428     // vecreduce.add(mul(ext(Ty A), ext(Ty B))) or
2429     // vecreduce.add(mul(A, B)).
2430     VectorType *ExtTy = VectorType::get(ResTy, Ty);
2431     InstructionCost RedCost = thisT()->getArithmeticReductionCost(
2432         Instruction::Add, ExtTy, std::nullopt, CostKind);
2433     InstructionCost ExtCost = thisT()->getCastInstrCost(
2434         IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty,
2435         TTI::CastContextHint::None, CostKind);
2436 
2437     InstructionCost MulCost =
2438         thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
2439 
2440     return RedCost + MulCost + 2 * ExtCost;
2441   }
2442 
2443   InstructionCost getVectorSplitCost() { return 1; }
2444 
2445   /// @}
2446 };
2447 
2448 /// Concrete BasicTTIImpl that can be used if no further customization
2449 /// is needed.
2450 class BasicTTIImpl : public BasicTTIImplBase<BasicTTIImpl> {
2451   using BaseT = BasicTTIImplBase<BasicTTIImpl>;
2452 
2453   friend class BasicTTIImplBase<BasicTTIImpl>;
2454 
2455   const TargetSubtargetInfo *ST;
2456   const TargetLoweringBase *TLI;
2457 
2458   const TargetSubtargetInfo *getST() const { return ST; }
2459   const TargetLoweringBase *getTLI() const { return TLI; }
2460 
2461 public:
2462   explicit BasicTTIImpl(const TargetMachine *TM, const Function &F);
2463 };
2464 
2465 } // end namespace llvm
2466 
2467 #endif // LLVM_CODEGEN_BASICTTIIMPL_H
2468