1 //===- BasicTTIImpl.h -------------------------------------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// This file provides a helper that implements much of the TTI interface in
11 /// terms of the target-independent code generator and TargetLowering
12 /// interfaces.
13 //
14 //===----------------------------------------------------------------------===//
15 
16 #ifndef LLVM_CODEGEN_BASICTTIIMPL_H
17 #define LLVM_CODEGEN_BASICTTIIMPL_H
18 
19 #include "llvm/ADT/APInt.h"
20 #include "llvm/ADT/ArrayRef.h"
21 #include "llvm/ADT/BitVector.h"
22 #include "llvm/ADT/SmallPtrSet.h"
23 #include "llvm/ADT/SmallVector.h"
24 #include "llvm/Analysis/LoopInfo.h"
25 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
26 #include "llvm/Analysis/TargetTransformInfo.h"
27 #include "llvm/Analysis/TargetTransformInfoImpl.h"
28 #include "llvm/CodeGen/ISDOpcodes.h"
29 #include "llvm/CodeGen/MachineValueType.h"
30 #include "llvm/CodeGen/TargetLowering.h"
31 #include "llvm/CodeGen/TargetSubtargetInfo.h"
32 #include "llvm/CodeGen/ValueTypes.h"
33 #include "llvm/IR/BasicBlock.h"
34 #include "llvm/IR/Constant.h"
35 #include "llvm/IR/Constants.h"
36 #include "llvm/IR/DataLayout.h"
37 #include "llvm/IR/DerivedTypes.h"
38 #include "llvm/IR/InstrTypes.h"
39 #include "llvm/IR/Instruction.h"
40 #include "llvm/IR/Instructions.h"
41 #include "llvm/IR/Intrinsics.h"
42 #include "llvm/IR/Operator.h"
43 #include "llvm/IR/Type.h"
44 #include "llvm/IR/Value.h"
45 #include "llvm/Support/Casting.h"
46 #include "llvm/Support/CommandLine.h"
47 #include "llvm/Support/ErrorHandling.h"
48 #include "llvm/Support/MathExtras.h"
49 #include "llvm/Target/TargetMachine.h"
50 #include "llvm/Target/TargetOptions.h"
51 #include <algorithm>
52 #include <cassert>
53 #include <cstdint>
54 #include <limits>
55 #include <optional>
56 #include <utility>
57 
58 namespace llvm {
59 
60 class Function;
61 class GlobalValue;
62 class LLVMContext;
63 class ScalarEvolution;
64 class SCEV;
65 class TargetMachine;
66 
67 extern cl::opt<unsigned> PartialUnrollingThreshold;
68 
69 /// Base class which can be used to help build a TTI implementation.
70 ///
71 /// This class provides as much implementation of the TTI interface as is
72 /// possible using the target independent parts of the code generator.
73 ///
74 /// In order to subclass it, your class must implement a getST() method to
75 /// return the subtarget, and a getTLI() method to return the target lowering.
76 /// We need these methods implemented in the derived class so that this class
77 /// doesn't have to duplicate storage for them.
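///
/// For illustration only, a minimal sketch of a hypothetical target's TTI
/// built on top of this helper (the MyTarget* names are assumptions, not
/// existing LLVM classes) could look like:
/// \code
///   class MyTargetTTIImpl : public BasicTTIImplBase<MyTargetTTIImpl> {
///     const MyTargetSubtarget *ST;
///     const MyTargetLowering *TLI;
///
///   public:
///     explicit MyTargetTTIImpl(const MyTargetMachine *TM, const Function &F)
///         : BasicTTIImplBase(TM, F.getParent()->getDataLayout()),
///           ST(TM->getSubtargetImpl(F)), TLI(ST->getTargetLowering()) {}
///
///     // Required by BasicTTIImplBase.
///     const MyTargetSubtarget *getST() const { return ST; }
///     const MyTargetLowering *getTLI() const { return TLI; }
///   };
/// \endcode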
78 template <typename T>
79 class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
80 private:
81   using BaseT = TargetTransformInfoImplCRTPBase<T>;
82   using TTI = TargetTransformInfo;
83 
84   /// Helper function to access this as a T.
85   T *thisT() { return static_cast<T *>(this); }
86 
  /// Estimate the cost of a broadcast shuffle as an extract and a sequence of
  /// insert operations.
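  ///
  /// For example, broadcasting lane 0 of a <4 x float> value,
  /// \code
  ///   %b = shufflevector <4 x float> %v, <4 x float> poison,
  ///                      <4 x i32> zeroinitializer
  /// \endcode
  /// is modeled as one extractelement plus four insertelements, so with unit
  /// per-element costs the estimate would be 5.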
89   InstructionCost getBroadcastShuffleOverhead(FixedVectorType *VTy,
90                                               TTI::TargetCostKind CostKind) {
91     InstructionCost Cost = 0;
92     // Broadcast cost is equal to the cost of extracting the zero'th element
93     // plus the cost of inserting it into every element of the result vector.
94     Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
95                                         CostKind, 0, nullptr, nullptr);
96 
97     for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
98       Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy,
99                                           CostKind, i, nullptr, nullptr);
100     }
101     return Cost;
102   }
103 
  /// Estimate the cost of a shuffle as a sequence of extract and insert
  /// operations.
106   InstructionCost getPermuteShuffleOverhead(FixedVectorType *VTy,
107                                             TTI::TargetCostKind CostKind) {
108     InstructionCost Cost = 0;
    // Shuffle cost is equal to the cost of extracting elements from its
    // arguments plus the cost of inserting them into the result vector.

    // e.g. a <4 x float> shuffle with mask <0,5,2,7>: we extract from index 0
    // of the first vector, index 1 of the second vector, index 2 of the first
    // vector and index 3 of the second vector, and insert them at indices
    // <0,1,2,3> of the result vector.
116     for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
117       Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy,
118                                           CostKind, i, nullptr, nullptr);
119       Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
120                                           CostKind, i, nullptr, nullptr);
121     }
122     return Cost;
123   }
124 
  /// Estimate the cost of subvector extraction as a sequence of extract and
  /// insert operations.
127   InstructionCost getExtractSubvectorOverhead(VectorType *VTy,
128                                               TTI::TargetCostKind CostKind,
129                                               int Index,
130                                               FixedVectorType *SubVTy) {
131     assert(VTy && SubVTy &&
132            "Can only extract subvectors from vectors");
133     int NumSubElts = SubVTy->getNumElements();
134     assert((!isa<FixedVectorType>(VTy) ||
135             (Index + NumSubElts) <=
136                 (int)cast<FixedVectorType>(VTy)->getNumElements()) &&
137            "SK_ExtractSubvector index out of range");
138 
139     InstructionCost Cost = 0;
    // Subvector extraction cost is equal to the cost of extracting elements
    // from the source vector plus the cost of inserting them into the result
    // subvector.
143     for (int i = 0; i != NumSubElts; ++i) {
144       Cost +=
145           thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
146                                       CostKind, i + Index, nullptr, nullptr);
147       Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, SubVTy,
148                                           CostKind, i, nullptr, nullptr);
149     }
150     return Cost;
151   }
152 
  /// Estimate the cost of subvector insertion as a sequence of extract and
  /// insert operations.
155   InstructionCost getInsertSubvectorOverhead(VectorType *VTy,
156                                              TTI::TargetCostKind CostKind,
157                                              int Index,
158                                              FixedVectorType *SubVTy) {
159     assert(VTy && SubVTy &&
160            "Can only insert subvectors into vectors");
161     int NumSubElts = SubVTy->getNumElements();
162     assert((!isa<FixedVectorType>(VTy) ||
163             (Index + NumSubElts) <=
164                 (int)cast<FixedVectorType>(VTy)->getNumElements()) &&
165            "SK_InsertSubvector index out of range");
166 
167     InstructionCost Cost = 0;
    // Subvector insertion cost is equal to the cost of extracting elements
    // from the subvector plus the cost of inserting them into the result
    // vector.
171     for (int i = 0; i != NumSubElts; ++i) {
172       Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVTy,
173                                           CostKind, i, nullptr, nullptr);
174       Cost +=
175           thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, CostKind,
176                                       i + Index, nullptr, nullptr);
177     }
178     return Cost;
179   }
180 
181   /// Local query method delegates up to T which *must* implement this!
182   const TargetSubtargetInfo *getST() const {
183     return static_cast<const T *>(this)->getST();
184   }
185 
186   /// Local query method delegates up to T which *must* implement this!
187   const TargetLoweringBase *getTLI() const {
188     return static_cast<const T *>(this)->getTLI();
189   }
190 
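  /// Map TTI's indexed addressing kinds onto their SelectionDAG equivalents,
  /// e.g. TTI::MIM_PreInc becomes ISD::PRE_INC, which models a pre-incremented
  /// access such as AArch64's "ldr x0, [x1, #8]!" that updates the base
  /// register before the load.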
191   static ISD::MemIndexedMode getISDIndexedMode(TTI::MemIndexedMode M) {
192     switch (M) {
193       case TTI::MIM_Unindexed:
194         return ISD::UNINDEXED;
195       case TTI::MIM_PreInc:
196         return ISD::PRE_INC;
197       case TTI::MIM_PreDec:
198         return ISD::PRE_DEC;
199       case TTI::MIM_PostInc:
200         return ISD::POST_INC;
201       case TTI::MIM_PostDec:
202         return ISD::POST_DEC;
203     }
204     llvm_unreachable("Unexpected MemIndexedMode");
205   }
206 
207   InstructionCost getCommonMaskedMemoryOpCost(unsigned Opcode, Type *DataTy,
208                                               Align Alignment,
209                                               bool VariableMask,
210                                               bool IsGatherScatter,
211                                               TTI::TargetCostKind CostKind) {
212     // We cannot scalarize scalable vectors, so return Invalid.
213     if (isa<ScalableVectorType>(DataTy))
214       return InstructionCost::getInvalid();
215 
216     auto *VT = cast<FixedVectorType>(DataTy);
217     // Assume the target does not have support for gather/scatter operations
218     // and provide a rough estimate.
219     //
220     // First, compute the cost of the individual memory operations.
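    // For example, under this model a gather from a <4 x i32> vector is
    // treated as four address extracts plus four scalar loads, followed by the
    // packing cost and, when the mask is variable, the per-lane
    // extract/branch/PHI costs added below.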
221     InstructionCost AddrExtractCost =
222         IsGatherScatter
223             ? getVectorInstrCost(Instruction::ExtractElement,
224                                  FixedVectorType::get(
225                                      PointerType::get(VT->getElementType(), 0),
226                                      VT->getNumElements()),
227                                  CostKind, -1, nullptr, nullptr)
228             : 0;
229     InstructionCost LoadCost =
230         VT->getNumElements() *
231         (AddrExtractCost +
232          getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind));
233 
234     // Next, compute the cost of packing the result in a vector.
235     InstructionCost PackingCost =
236         getScalarizationOverhead(VT, Opcode != Instruction::Store,
237                                  Opcode == Instruction::Store, CostKind);
238 
239     InstructionCost ConditionalCost = 0;
240     if (VariableMask) {
      // Compute the cost of conditionally executing the memory operations with
      // variable masks. This includes extracting the individual conditions,
      // the branches, and the PHIs to combine the results.
244       // NOTE: Estimating the cost of conditionally executing the memory
245       // operations accurately is quite difficult and the current solution
246       // provides a very rough estimate only.
247       ConditionalCost =
248           VT->getNumElements() *
249           (getVectorInstrCost(
250                Instruction::ExtractElement,
251                FixedVectorType::get(Type::getInt1Ty(DataTy->getContext()),
252                                     VT->getNumElements()),
253                CostKind, -1, nullptr, nullptr) +
254            getCFInstrCost(Instruction::Br, CostKind) +
255            getCFInstrCost(Instruction::PHI, CostKind));
256     }
257 
258     return LoadCost + PackingCost + ConditionalCost;
259   }
260 
261 protected:
262   explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL)
263       : BaseT(DL) {}
264   virtual ~BasicTTIImplBase() = default;
265 
266   using TargetTransformInfoImplBase::DL;
267 
268 public:
269   /// \name Scalar TTI Implementations
270   /// @{
271   bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth,
272                                       unsigned AddressSpace, Align Alignment,
273                                       unsigned *Fast) const {
274     EVT E = EVT::getIntegerVT(Context, BitWidth);
275     return getTLI()->allowsMisalignedMemoryAccesses(
276         E, AddressSpace, Alignment, MachineMemOperand::MONone, Fast);
277   }
278 
279   bool hasBranchDivergence(const Function *F = nullptr) { return false; }
280 
281   bool isSourceOfDivergence(const Value *V) { return false; }
282 
283   bool isAlwaysUniform(const Value *V) { return false; }
284 
285   bool isValidAddrSpaceCast(unsigned FromAS, unsigned ToAS) const {
286     return false;
287   }
288 
289   bool addrspacesMayAlias(unsigned AS0, unsigned AS1) const {
290     return true;
291   }
292 
293   unsigned getFlatAddressSpace() {
294     // Return an invalid address space.
295     return -1;
296   }
297 
298   bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
299                                   Intrinsic::ID IID) const {
300     return false;
301   }
302 
303   bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const {
304     return getTLI()->getTargetMachine().isNoopAddrSpaceCast(FromAS, ToAS);
305   }
306 
307   unsigned getAssumedAddrSpace(const Value *V) const {
308     return getTLI()->getTargetMachine().getAssumedAddrSpace(V);
309   }
310 
311   bool isSingleThreaded() const {
312     return getTLI()->getTargetMachine().Options.ThreadModel ==
313            ThreadModel::Single;
314   }
315 
316   std::pair<const Value *, unsigned>
317   getPredicatedAddrSpace(const Value *V) const {
318     return getTLI()->getTargetMachine().getPredicatedAddrSpace(V);
319   }
320 
321   Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
322                                           Value *NewV) const {
323     return nullptr;
324   }
325 
326   bool isLegalAddImmediate(int64_t imm) {
327     return getTLI()->isLegalAddImmediate(imm);
328   }
329 
330   bool isLegalICmpImmediate(int64_t imm) {
331     return getTLI()->isLegalICmpImmediate(imm);
332   }
333 
334   bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
335                              bool HasBaseReg, int64_t Scale,
336                              unsigned AddrSpace, Instruction *I = nullptr) {
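    // The queried addressing mode has the canonical form
    //   BaseGV + BaseOffs + (HasBaseReg ? BaseReg : 0) + Scale * ScaledReg
    // e.g. a target supporting "base + index*4 + imm" addressing would accept
    // HasBaseReg == true, Scale == 4 and a small BaseOffset.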
337     TargetLoweringBase::AddrMode AM;
338     AM.BaseGV = BaseGV;
339     AM.BaseOffs = BaseOffset;
340     AM.HasBaseReg = HasBaseReg;
341     AM.Scale = Scale;
342     return getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace, I);
343   }
344 
345   unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy,
346                              Type *ScalarValTy) const {
347     auto &&IsSupportedByTarget = [this, ScalarMemTy, ScalarValTy](unsigned VF) {
348       auto *SrcTy = FixedVectorType::get(ScalarMemTy, VF / 2);
349       EVT VT = getTLI()->getValueType(DL, SrcTy);
350       if (getTLI()->isOperationLegal(ISD::STORE, VT) ||
351           getTLI()->isOperationCustom(ISD::STORE, VT))
352         return true;
353 
354       EVT ValVT =
355           getTLI()->getValueType(DL, FixedVectorType::get(ScalarValTy, VF / 2));
356       EVT LegalizedVT =
357           getTLI()->getTypeToTransformTo(ScalarMemTy->getContext(), VT);
358       return getTLI()->isTruncStoreLegal(LegalizedVT, ValVT);
359     };
360     while (VF > 2 && IsSupportedByTarget(VF))
361       VF /= 2;
362     return VF;
363   }
364 
365   bool isIndexedLoadLegal(TTI::MemIndexedMode M, Type *Ty,
366                           const DataLayout &DL) const {
367     EVT VT = getTLI()->getValueType(DL, Ty);
368     return getTLI()->isIndexedLoadLegal(getISDIndexedMode(M), VT);
369   }
370 
371   bool isIndexedStoreLegal(TTI::MemIndexedMode M, Type *Ty,
372                            const DataLayout &DL) const {
373     EVT VT = getTLI()->getValueType(DL, Ty);
374     return getTLI()->isIndexedStoreLegal(getISDIndexedMode(M), VT);
375   }
376 
377   bool isLSRCostLess(TTI::LSRCost C1, TTI::LSRCost C2) {
378     return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
379   }
380 
381   bool isNumRegsMajorCostOfLSR() {
382     return TargetTransformInfoImplBase::isNumRegsMajorCostOfLSR();
383   }
384 
385   bool isProfitableLSRChainElement(Instruction *I) {
386     return TargetTransformInfoImplBase::isProfitableLSRChainElement(I);
387   }
388 
389   InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
390                                        int64_t BaseOffset, bool HasBaseReg,
391                                        int64_t Scale, unsigned AddrSpace) {
392     TargetLoweringBase::AddrMode AM;
393     AM.BaseGV = BaseGV;
394     AM.BaseOffs = BaseOffset;
395     AM.HasBaseReg = HasBaseReg;
396     AM.Scale = Scale;
397     if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
398       return 0;
399     return -1;
400   }
401 
402   bool isTruncateFree(Type *Ty1, Type *Ty2) {
403     return getTLI()->isTruncateFree(Ty1, Ty2);
404   }
405 
406   bool isProfitableToHoist(Instruction *I) {
407     return getTLI()->isProfitableToHoist(I);
408   }
409 
410   bool useAA() const { return getST()->useAA(); }
411 
412   bool isTypeLegal(Type *Ty) {
413     EVT VT = getTLI()->getValueType(DL, Ty);
414     return getTLI()->isTypeLegal(VT);
415   }
416 
417   unsigned getRegUsageForType(Type *Ty) {
418     EVT ETy = getTLI()->getValueType(DL, Ty);
419     return getTLI()->getNumRegisters(Ty->getContext(), ETy);
420   }
421 
422   InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr,
423                              ArrayRef<const Value *> Operands, Type *AccessType,
424                              TTI::TargetCostKind CostKind) {
425     return BaseT::getGEPCost(PointeeType, Ptr, Operands, AccessType, CostKind);
426   }
427 
428   unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
429                                             unsigned &JumpTableSize,
430                                             ProfileSummaryInfo *PSI,
431                                             BlockFrequencyInfo *BFI) {
    /// Try to find the estimated number of clusters. Note that the number of
    /// clusters identified in this function could be different from the actual
    /// numbers found in lowering. This function ignores switches that are
    /// lowered with a mix of jump table / bit test / BTree. It was initially
    /// intended for estimating the cost of a switch in the inline cost
    /// heuristic, but it is a generic cost model usable in other places as
    /// well (e.g., in loop unrolling).
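    ///
    /// For example, a switch over i32 with the ten contiguous cases 0..9, each
    /// jumping to a distinct block, typically counts as one cluster: the case
    /// range is 10, which is dense enough for a jump table, so (assuming jump
    /// tables are allowed and the minimum jump table size is met) the function
    /// returns 1 with JumpTableSize == 10.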
439     unsigned N = SI.getNumCases();
440     const TargetLoweringBase *TLI = getTLI();
441     const DataLayout &DL = this->getDataLayout();
442 
443     JumpTableSize = 0;
444     bool IsJTAllowed = TLI->areJTsAllowed(SI.getParent()->getParent());
445 
446     // Early exit if both a jump table and bit test are not allowed.
447     if (N < 1 || (!IsJTAllowed && DL.getIndexSizeInBits(0u) < N))
448       return N;
449 
450     APInt MaxCaseVal = SI.case_begin()->getCaseValue()->getValue();
451     APInt MinCaseVal = MaxCaseVal;
452     for (auto CI : SI.cases()) {
453       const APInt &CaseVal = CI.getCaseValue()->getValue();
454       if (CaseVal.sgt(MaxCaseVal))
455         MaxCaseVal = CaseVal;
456       if (CaseVal.slt(MinCaseVal))
457         MinCaseVal = CaseVal;
458     }
459 
460     // Check if suitable for a bit test
461     if (N <= DL.getIndexSizeInBits(0u)) {
462       SmallPtrSet<const BasicBlock *, 4> Dests;
463       for (auto I : SI.cases())
464         Dests.insert(I.getCaseSuccessor());
465 
466       if (TLI->isSuitableForBitTests(Dests.size(), N, MinCaseVal, MaxCaseVal,
467                                      DL))
468         return 1;
469     }
470 
471     // Check if suitable for a jump table.
472     if (IsJTAllowed) {
473       if (N < 2 || N < TLI->getMinimumJumpTableEntries())
474         return N;
475       uint64_t Range =
476           (MaxCaseVal - MinCaseVal)
477               .getLimitedValue(std::numeric_limits<uint64_t>::max() - 1) + 1;
478       // Check whether a range of clusters is dense enough for a jump table
479       if (TLI->isSuitableForJumpTable(&SI, N, Range, PSI, BFI)) {
480         JumpTableSize = Range;
481         return 1;
482       }
483     }
484     return N;
485   }
486 
487   bool shouldBuildLookupTables() {
488     const TargetLoweringBase *TLI = getTLI();
489     return TLI->isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) ||
490            TLI->isOperationLegalOrCustom(ISD::BRIND, MVT::Other);
491   }
492 
493   bool shouldBuildRelLookupTables() const {
494     const TargetMachine &TM = getTLI()->getTargetMachine();
495     // If non-PIC mode, do not generate a relative lookup table.
496     if (!TM.isPositionIndependent())
497       return false;
498 
    /// Relative lookup table entries consist of 32-bit offsets.
    /// Do not generate relative lookup tables for large code models
    /// in 64-bit architectures where 32-bit offsets might not be enough.
502     if (TM.getCodeModel() == CodeModel::Medium ||
503         TM.getCodeModel() == CodeModel::Large)
504       return false;
505 
506     Triple TargetTriple = TM.getTargetTriple();
507     if (!TargetTriple.isArch64Bit())
508       return false;
509 
510     // TODO: Triggers issues on aarch64 on darwin, so temporarily disable it
511     // there.
512     if (TargetTriple.getArch() == Triple::aarch64 && TargetTriple.isOSDarwin())
513       return false;
514 
515     return true;
516   }
517 
518   bool haveFastSqrt(Type *Ty) {
519     const TargetLoweringBase *TLI = getTLI();
520     EVT VT = TLI->getValueType(DL, Ty);
521     return TLI->isTypeLegal(VT) &&
522            TLI->isOperationLegalOrCustom(ISD::FSQRT, VT);
523   }
524 
525   bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
526     return true;
527   }
528 
529   InstructionCost getFPOpCost(Type *Ty) {
530     // Check whether FADD is available, as a proxy for floating-point in
531     // general.
532     const TargetLoweringBase *TLI = getTLI();
533     EVT VT = TLI->getValueType(DL, Ty);
534     if (TLI->isOperationLegalOrCustomOrPromote(ISD::FADD, VT))
535       return TargetTransformInfo::TCC_Basic;
536     return TargetTransformInfo::TCC_Expensive;
537   }
538 
539   unsigned getInliningThresholdMultiplier() const { return 1; }
540   unsigned adjustInliningThreshold(const CallBase *CB) { return 0; }
541   unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const {
542     return 0;
543   }
544 
545   int getInlinerVectorBonusPercent() const { return 150; }
546 
547   void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
548                                TTI::UnrollingPreferences &UP,
549                                OptimizationRemarkEmitter *ORE) {
    // This unrolling functionality is target independent, but to provide some
    // motivation for its intended use, consider x86:
552 
553     // According to the Intel 64 and IA-32 Architectures Optimization Reference
554     // Manual, Intel Core models and later have a loop stream detector (and
555     // associated uop queue) that can benefit from partial unrolling.
556     // The relevant requirements are:
557     //  - The loop must have no more than 4 (8 for Nehalem and later) branches
558     //    taken, and none of them may be calls.
559     //  - The loop can have no more than 18 (28 for Nehalem and later) uops.
560 
561     // According to the Software Optimization Guide for AMD Family 15h
562     // Processors, models 30h-4fh (Steamroller and later) have a loop predictor
563     // and loop buffer which can benefit from partial unrolling.
564     // The relevant requirements are:
565     //  - The loop must have fewer than 16 branches
566     //  - The loop must have less than 40 uops in all executed loop branches
567 
568     // The number of taken branches in a loop is hard to estimate here, and
569     // benchmarking has revealed that it is better not to be conservative when
570     // estimating the branch count. As a result, we'll ignore the branch limits
571     // until someone finds a case where it matters in practice.
572 
573     unsigned MaxOps;
574     const TargetSubtargetInfo *ST = getST();
575     if (PartialUnrollingThreshold.getNumOccurrences() > 0)
576       MaxOps = PartialUnrollingThreshold;
577     else if (ST->getSchedModel().LoopMicroOpBufferSize > 0)
578       MaxOps = ST->getSchedModel().LoopMicroOpBufferSize;
579     else
580       return;
581 
582     // Scan the loop: don't unroll loops with calls.
583     for (BasicBlock *BB : L->blocks()) {
584       for (Instruction &I : *BB) {
585         if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
586           if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
587             if (!thisT()->isLoweredToCall(F))
588               continue;
589           }
590 
591           if (ORE) {
592             ORE->emit([&]() {
593               return OptimizationRemark("TTI", "DontUnroll", L->getStartLoc(),
594                                         L->getHeader())
595                      << "advising against unrolling the loop because it "
596                         "contains a "
597                      << ore::NV("Call", &I);
598             });
599           }
600           return;
601         }
602       }
603     }
604 
605     // Enable runtime and partial unrolling up to the specified size.
606     // Enable using trip count upper bound to unroll loops.
607     UP.Partial = UP.Runtime = UP.UpperBound = true;
608     UP.PartialThreshold = MaxOps;
609 
610     // Avoid unrolling when optimizing for size.
611     UP.OptSizeThreshold = 0;
612     UP.PartialOptSizeThreshold = 0;
613 
    // Set BEInsns, the number of instructions optimized away when the back
    // edge becomes a fall-through, to its default value of 2.
616     UP.BEInsns = 2;
617   }
618 
619   void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
620                              TTI::PeelingPreferences &PP) {
621     PP.PeelCount = 0;
622     PP.AllowPeeling = true;
623     PP.AllowLoopNestsPeeling = false;
624     PP.PeelProfiledIterations = true;
625   }
626 
627   bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
628                                 AssumptionCache &AC,
629                                 TargetLibraryInfo *LibInfo,
630                                 HardwareLoopInfo &HWLoopInfo) {
631     return BaseT::isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
632   }
633 
634   bool preferPredicateOverEpilogue(TailFoldingInfo *TFI) {
635     return BaseT::preferPredicateOverEpilogue(TFI);
636   }
637 
638   TailFoldingStyle
639   getPreferredTailFoldingStyle(bool IVUpdateMayOverflow = true) {
640     return BaseT::getPreferredTailFoldingStyle(IVUpdateMayOverflow);
641   }
642 
643   std::optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
644                                                IntrinsicInst &II) {
645     return BaseT::instCombineIntrinsic(IC, II);
646   }
647 
648   std::optional<Value *>
649   simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II,
650                                    APInt DemandedMask, KnownBits &Known,
651                                    bool &KnownBitsComputed) {
652     return BaseT::simplifyDemandedUseBitsIntrinsic(IC, II, DemandedMask, Known,
653                                                    KnownBitsComputed);
654   }
655 
656   std::optional<Value *> simplifyDemandedVectorEltsIntrinsic(
657       InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
658       APInt &UndefElts2, APInt &UndefElts3,
659       std::function<void(Instruction *, unsigned, APInt, APInt &)>
660           SimplifyAndSetOp) {
661     return BaseT::simplifyDemandedVectorEltsIntrinsic(
662         IC, II, DemandedElts, UndefElts, UndefElts2, UndefElts3,
663         SimplifyAndSetOp);
664   }
665 
666   virtual std::optional<unsigned>
667   getCacheSize(TargetTransformInfo::CacheLevel Level) const {
668     return std::optional<unsigned>(
669         getST()->getCacheSize(static_cast<unsigned>(Level)));
670   }
671 
672   virtual std::optional<unsigned>
673   getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const {
674     std::optional<unsigned> TargetResult =
675         getST()->getCacheAssociativity(static_cast<unsigned>(Level));
676 
677     if (TargetResult)
678       return TargetResult;
679 
680     return BaseT::getCacheAssociativity(Level);
681   }
682 
683   virtual unsigned getCacheLineSize() const {
684     return getST()->getCacheLineSize();
685   }
686 
687   virtual unsigned getPrefetchDistance() const {
688     return getST()->getPrefetchDistance();
689   }
690 
691   virtual unsigned getMinPrefetchStride(unsigned NumMemAccesses,
692                                         unsigned NumStridedMemAccesses,
693                                         unsigned NumPrefetches,
694                                         bool HasCall) const {
695     return getST()->getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses,
696                                          NumPrefetches, HasCall);
697   }
698 
699   virtual unsigned getMaxPrefetchIterationsAhead() const {
700     return getST()->getMaxPrefetchIterationsAhead();
701   }
702 
703   virtual bool enableWritePrefetching() const {
704     return getST()->enableWritePrefetching();
705   }
706 
707   virtual bool shouldPrefetchAddressSpace(unsigned AS) const {
708     return getST()->shouldPrefetchAddressSpace(AS);
709   }
710 
711   /// @}
712 
713   /// \name Vector TTI Implementations
714   /// @{
715 
716   TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
717     return TypeSize::getFixed(32);
718   }
719 
720   std::optional<unsigned> getMaxVScale() const { return std::nullopt; }
721   std::optional<unsigned> getVScaleForTuning() const { return std::nullopt; }
722   bool isVScaleKnownToBeAPowerOfTwo() const { return false; }
723 
724   /// Estimate the overhead of scalarizing an instruction. Insert and Extract
725   /// are set if the demanded result elements need to be inserted and/or
726   /// extracted from vectors.
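  ///
  /// For example, for a <4 x i32> type with DemandedElts == 0b0101,
  /// Insert == true and Extract == false, the estimate is the cost of two
  /// insertelement operations (for lanes 0 and 2).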
727   InstructionCost getScalarizationOverhead(VectorType *InTy,
728                                            const APInt &DemandedElts,
729                                            bool Insert, bool Extract,
730                                            TTI::TargetCostKind CostKind) {
731     /// FIXME: a bitfield is not a reasonable abstraction for talking about
732     /// which elements are needed from a scalable vector
733     if (isa<ScalableVectorType>(InTy))
734       return InstructionCost::getInvalid();
735     auto *Ty = cast<FixedVectorType>(InTy);
736 
737     assert(DemandedElts.getBitWidth() == Ty->getNumElements() &&
738            "Vector size mismatch");
739 
740     InstructionCost Cost = 0;
741 
742     for (int i = 0, e = Ty->getNumElements(); i < e; ++i) {
743       if (!DemandedElts[i])
744         continue;
745       if (Insert)
746         Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, Ty,
747                                             CostKind, i, nullptr, nullptr);
748       if (Extract)
749         Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty,
750                                             CostKind, i, nullptr, nullptr);
751     }
752 
753     return Cost;
754   }
755 
756   /// Helper wrapper for the DemandedElts variant of getScalarizationOverhead.
757   InstructionCost getScalarizationOverhead(VectorType *InTy, bool Insert,
758                                            bool Extract,
759                                            TTI::TargetCostKind CostKind) {
760     if (isa<ScalableVectorType>(InTy))
761       return InstructionCost::getInvalid();
762     auto *Ty = cast<FixedVectorType>(InTy);
763 
764     APInt DemandedElts = APInt::getAllOnes(Ty->getNumElements());
765     return thisT()->getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
766                                              CostKind);
767   }
768 
  /// Estimate the overhead of scalarizing an instruction's unique
  /// non-constant operands. The (potentially vector) types to use for each
  /// argument are passed via Tys.
772   InstructionCost
773   getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
774                                    ArrayRef<Type *> Tys,
775                                    TTI::TargetCostKind CostKind) {
776     assert(Args.size() == Tys.size() && "Expected matching Args and Tys");
777 
778     InstructionCost Cost = 0;
779     SmallPtrSet<const Value*, 4> UniqueOperands;
780     for (int I = 0, E = Args.size(); I != E; I++) {
781       // Disregard things like metadata arguments.
782       const Value *A = Args[I];
783       Type *Ty = Tys[I];
784       if (!Ty->isIntOrIntVectorTy() && !Ty->isFPOrFPVectorTy() &&
785           !Ty->isPtrOrPtrVectorTy())
786         continue;
787 
788       if (!isa<Constant>(A) && UniqueOperands.insert(A).second) {
789         if (auto *VecTy = dyn_cast<VectorType>(Ty))
790           Cost += getScalarizationOverhead(VecTy, /*Insert*/ false,
791                                            /*Extract*/ true, CostKind);
792       }
793     }
794 
795     return Cost;
796   }
797 
798   /// Estimate the overhead of scalarizing the inputs and outputs of an
799   /// instruction, with return type RetTy and arguments Args of type Tys. If
800   /// Args are unknown (empty), then the cost associated with one argument is
801   /// added as a heuristic.
802   InstructionCost getScalarizationOverhead(VectorType *RetTy,
803                                            ArrayRef<const Value *> Args,
804                                            ArrayRef<Type *> Tys,
805                                            TTI::TargetCostKind CostKind) {
806     InstructionCost Cost = getScalarizationOverhead(
807         RetTy, /*Insert*/ true, /*Extract*/ false, CostKind);
808     if (!Args.empty())
809       Cost += getOperandsScalarizationOverhead(Args, Tys, CostKind);
810     else
811       // When no information on arguments is provided, we add the cost
812       // associated with one argument as a heuristic.
813       Cost += getScalarizationOverhead(RetTy, /*Insert*/ false,
814                                        /*Extract*/ true, CostKind);
815 
816     return Cost;
817   }
818 
819   /// Estimate the cost of type-legalization and the legalized type.
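  ///
  /// For example, on a hypothetical target whose widest legal vector type is
  /// v4i32, legalizing <16 x i32> takes two rounds of splitting, each of which
  /// doubles the cost, so this returns {4, MVT::v4i32}.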
820   std::pair<InstructionCost, MVT> getTypeLegalizationCost(Type *Ty) const {
821     LLVMContext &C = Ty->getContext();
822     EVT MTy = getTLI()->getValueType(DL, Ty);
823 
824     InstructionCost Cost = 1;
825     // We keep legalizing the type until we find a legal kind. We assume that
826     // the only operation that costs anything is the split. After splitting
827     // we need to handle two types.
828     while (true) {
829       TargetLoweringBase::LegalizeKind LK = getTLI()->getTypeConversion(C, MTy);
830 
831       if (LK.first == TargetLoweringBase::TypeScalarizeScalableVector) {
832         // Ensure we return a sensible simple VT here, since many callers of
833         // this function require it.
834         MVT VT = MTy.isSimple() ? MTy.getSimpleVT() : MVT::i64;
835         return std::make_pair(InstructionCost::getInvalid(), VT);
836       }
837 
838       if (LK.first == TargetLoweringBase::TypeLegal)
839         return std::make_pair(Cost, MTy.getSimpleVT());
840 
841       if (LK.first == TargetLoweringBase::TypeSplitVector ||
842           LK.first == TargetLoweringBase::TypeExpandInteger)
843         Cost *= 2;
844 
845       // Do not loop with f128 type.
846       if (MTy == LK.second)
847         return std::make_pair(Cost, MTy.getSimpleVT());
848 
849       // Keep legalizing the type.
850       MTy = LK.second;
851     }
852   }
853 
854   unsigned getMaxInterleaveFactor(ElementCount VF) { return 1; }
855 
856   InstructionCost getArithmeticInstrCost(
857       unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
858       TTI::OperandValueInfo Opd1Info = {TTI::OK_AnyValue, TTI::OP_None},
859       TTI::OperandValueInfo Opd2Info = {TTI::OK_AnyValue, TTI::OP_None},
860       ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
861       const Instruction *CxtI = nullptr) {
862     // Check if any of the operands are vector operands.
863     const TargetLoweringBase *TLI = getTLI();
864     int ISD = TLI->InstructionOpcodeToISD(Opcode);
865     assert(ISD && "Invalid opcode");
866 
867     // TODO: Handle more cost kinds.
868     if (CostKind != TTI::TCK_RecipThroughput)
869       return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind,
870                                            Opd1Info, Opd2Info,
871                                            Args, CxtI);
872 
873     std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
874 
875     bool IsFloat = Ty->isFPOrFPVectorTy();
876     // Assume that floating point arithmetic operations cost twice as much as
877     // integer operations.
878     InstructionCost OpCost = (IsFloat ? 2 : 1);
879 
880     if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
881       // The operation is legal. Assume it costs 1.
882       // TODO: Once we have extract/insert subvector cost we need to use them.
883       return LT.first * OpCost;
884     }
885 
886     if (!TLI->isOperationExpand(ISD, LT.second)) {
887       // If the operation is custom lowered, then assume that the code is twice
888       // as expensive.
889       return LT.first * 2 * OpCost;
890     }
891 
892     // An 'Expand' of URem and SRem is special because it may default
893     // to expanding the operation into a sequence of sub-operations
894     // i.e. X % Y -> X-(X/Y)*Y.
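    // For example, if vector SDIV is legal for the type but SREM is not, an
    // SRem is costed below as SDiv + Mul + Sub on that type rather than as a
    // full scalarization.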
895     if (ISD == ISD::UREM || ISD == ISD::SREM) {
896       bool IsSigned = ISD == ISD::SREM;
897       if (TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIVREM : ISD::UDIVREM,
898                                         LT.second) ||
899           TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIV : ISD::UDIV,
900                                         LT.second)) {
901         unsigned DivOpc = IsSigned ? Instruction::SDiv : Instruction::UDiv;
902         InstructionCost DivCost = thisT()->getArithmeticInstrCost(
903             DivOpc, Ty, CostKind, Opd1Info, Opd2Info);
904         InstructionCost MulCost =
905             thisT()->getArithmeticInstrCost(Instruction::Mul, Ty, CostKind);
906         InstructionCost SubCost =
907             thisT()->getArithmeticInstrCost(Instruction::Sub, Ty, CostKind);
908         return DivCost + MulCost + SubCost;
909       }
910     }
911 
912     // We cannot scalarize scalable vectors, so return Invalid.
913     if (isa<ScalableVectorType>(Ty))
914       return InstructionCost::getInvalid();
915 
916     // Else, assume that we need to scalarize this op.
917     // TODO: If one of the types get legalized by splitting, handle this
918     // similarly to what getCastInstrCost() does.
919     if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
920       InstructionCost Cost = thisT()->getArithmeticInstrCost(
921           Opcode, VTy->getScalarType(), CostKind, Opd1Info, Opd2Info,
922           Args, CxtI);
      // Return the cost of multiple scalar invocations plus the cost of
      // inserting and extracting the values.
925       SmallVector<Type *> Tys(Args.size(), Ty);
926       return getScalarizationOverhead(VTy, Args, Tys, CostKind) +
927              VTy->getNumElements() * Cost;
928     }
929 
930     // We don't know anything about this scalar instruction.
931     return OpCost;
932   }
933 
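  /// Refine a generic shuffle kind using the actual mask so that cheaper
  /// per-kind cost models can apply. For example, an SK_PermuteSingleSrc with
  /// mask <3,2,1,0> is reported as SK_Reverse, and an all-zero mask as
  /// SK_Broadcast.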
934   TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind,
935                                               ArrayRef<int> Mask) const {
936     int Limit = Mask.size() * 2;
937     if (Mask.empty() ||
938         // Extra check required by isSingleSourceMaskImpl function (called by
939         // ShuffleVectorInst::isSingleSourceMask).
940         any_of(Mask, [Limit](int I) { return I >= Limit; }))
941       return Kind;
942     int Index;
943     switch (Kind) {
944     case TTI::SK_PermuteSingleSrc:
945       if (ShuffleVectorInst::isReverseMask(Mask))
946         return TTI::SK_Reverse;
947       if (ShuffleVectorInst::isZeroEltSplatMask(Mask))
948         return TTI::SK_Broadcast;
949       break;
950     case TTI::SK_PermuteTwoSrc:
951       if (ShuffleVectorInst::isSelectMask(Mask))
952         return TTI::SK_Select;
953       if (ShuffleVectorInst::isTransposeMask(Mask))
954         return TTI::SK_Transpose;
955       if (ShuffleVectorInst::isSpliceMask(Mask, Index))
956         return TTI::SK_Splice;
957       break;
958     case TTI::SK_Select:
959     case TTI::SK_Reverse:
960     case TTI::SK_Broadcast:
961     case TTI::SK_Transpose:
962     case TTI::SK_InsertSubvector:
963     case TTI::SK_ExtractSubvector:
964     case TTI::SK_Splice:
965       break;
966     }
967     return Kind;
968   }
969 
970   InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
971                                  ArrayRef<int> Mask,
972                                  TTI::TargetCostKind CostKind, int Index,
973                                  VectorType *SubTp,
974                                  ArrayRef<const Value *> Args = std::nullopt) {
975 
976     switch (improveShuffleKindFromMask(Kind, Mask)) {
977     case TTI::SK_Broadcast:
978       if (auto *FVT = dyn_cast<FixedVectorType>(Tp))
979         return getBroadcastShuffleOverhead(FVT, CostKind);
980       return InstructionCost::getInvalid();
981     case TTI::SK_Select:
982     case TTI::SK_Splice:
983     case TTI::SK_Reverse:
984     case TTI::SK_Transpose:
985     case TTI::SK_PermuteSingleSrc:
986     case TTI::SK_PermuteTwoSrc:
987       if (auto *FVT = dyn_cast<FixedVectorType>(Tp))
988         return getPermuteShuffleOverhead(FVT, CostKind);
989       return InstructionCost::getInvalid();
990     case TTI::SK_ExtractSubvector:
991       return getExtractSubvectorOverhead(Tp, CostKind, Index,
992                                          cast<FixedVectorType>(SubTp));
993     case TTI::SK_InsertSubvector:
994       return getInsertSubvectorOverhead(Tp, CostKind, Index,
995                                         cast<FixedVectorType>(SubTp));
996     }
997     llvm_unreachable("Unknown TTI::ShuffleKind");
998   }
999 
1000   InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
1001                                    TTI::CastContextHint CCH,
1002                                    TTI::TargetCostKind CostKind,
1003                                    const Instruction *I = nullptr) {
1004     if (BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I) == 0)
1005       return 0;
1006 
1007     const TargetLoweringBase *TLI = getTLI();
1008     int ISD = TLI->InstructionOpcodeToISD(Opcode);
1009     assert(ISD && "Invalid opcode");
1010     std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
1011     std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(Dst);
1012 
1013     TypeSize SrcSize = SrcLT.second.getSizeInBits();
1014     TypeSize DstSize = DstLT.second.getSizeInBits();
1015     bool IntOrPtrSrc = Src->isIntegerTy() || Src->isPointerTy();
1016     bool IntOrPtrDst = Dst->isIntegerTy() || Dst->isPointerTy();
1017 
1018     switch (Opcode) {
1019     default:
1020       break;
1021     case Instruction::Trunc:
1022       // Check for NOOP conversions.
1023       if (TLI->isTruncateFree(SrcLT.second, DstLT.second))
1024         return 0;
1025       [[fallthrough]];
1026     case Instruction::BitCast:
      // Bitcasts between types that are legalized to the same type are free;
      // also assume an int to/from ptr cast of the same size is free.
1029       if (SrcLT.first == DstLT.first && IntOrPtrSrc == IntOrPtrDst &&
1030           SrcSize == DstSize)
1031         return 0;
1032       break;
1033     case Instruction::FPExt:
1034       if (I && getTLI()->isExtFree(I))
1035         return 0;
1036       break;
1037     case Instruction::ZExt:
1038       if (TLI->isZExtFree(SrcLT.second, DstLT.second))
1039         return 0;
1040       [[fallthrough]];
1041     case Instruction::SExt:
1042       if (I && getTLI()->isExtFree(I))
1043         return 0;
1044 
      // If this is a zext/sext of a load, return 0 if the corresponding
      // extending load exists on the target and the result type is legal.
1047       if (CCH == TTI::CastContextHint::Normal) {
1048         EVT ExtVT = EVT::getEVT(Dst);
1049         EVT LoadVT = EVT::getEVT(Src);
1050         unsigned LType =
1051           ((Opcode == Instruction::ZExt) ? ISD::ZEXTLOAD : ISD::SEXTLOAD);
1052         if (DstLT.first == SrcLT.first &&
1053             TLI->isLoadExtLegal(LType, ExtVT, LoadVT))
1054           return 0;
1055       }
1056       break;
1057     case Instruction::AddrSpaceCast:
1058       if (TLI->isFreeAddrSpaceCast(Src->getPointerAddressSpace(),
1059                                    Dst->getPointerAddressSpace()))
1060         return 0;
1061       break;
1062     }
1063 
1064     auto *SrcVTy = dyn_cast<VectorType>(Src);
1065     auto *DstVTy = dyn_cast<VectorType>(Dst);
1066 
1067     // If the cast is marked as legal (or promote) then assume low cost.
1068     if (SrcLT.first == DstLT.first &&
1069         TLI->isOperationLegalOrPromote(ISD, DstLT.second))
1070       return SrcLT.first;
1071 
1072     // Handle scalar conversions.
1073     if (!SrcVTy && !DstVTy) {
1074       // Just check the op cost. If the operation is legal then assume it costs
1075       // 1.
1076       if (!TLI->isOperationExpand(ISD, DstLT.second))
1077         return 1;
1078 
      // Assume that illegal scalar instructions are expensive.
1080       return 4;
1081     }
1082 
1083     // Check vector-to-vector casts.
1084     if (DstVTy && SrcVTy) {
1085       // If the cast is between same-sized registers, then the check is simple.
1086       if (SrcLT.first == DstLT.first && SrcSize == DstSize) {
1087 
1088         // Assume that Zext is done using AND.
1089         if (Opcode == Instruction::ZExt)
1090           return SrcLT.first;
1091 
1092         // Assume that sext is done using SHL and SRA.
1093         if (Opcode == Instruction::SExt)
1094           return SrcLT.first * 2;
1095 
        // Just check the op cost. If the operation is legal then assume it
        // costs 1 and multiply by the type-legalization overhead.
1099         if (!TLI->isOperationExpand(ISD, DstLT.second))
1100           return SrcLT.first * 1;
1101       }
1102 
1103       // If we are legalizing by splitting, query the concrete TTI for the cost
1104       // of casting the original vector twice. We also need to factor in the
1105       // cost of the split itself. Count that as 1, to be consistent with
1106       // getTypeLegalizationCost().
1107       bool SplitSrc =
1108           TLI->getTypeAction(Src->getContext(), TLI->getValueType(DL, Src)) ==
1109           TargetLowering::TypeSplitVector;
1110       bool SplitDst =
1111           TLI->getTypeAction(Dst->getContext(), TLI->getValueType(DL, Dst)) ==
1112           TargetLowering::TypeSplitVector;
1113       if ((SplitSrc || SplitDst) && SrcVTy->getElementCount().isVector() &&
1114           DstVTy->getElementCount().isVector()) {
1115         Type *SplitDstTy = VectorType::getHalfElementsVectorType(DstVTy);
1116         Type *SplitSrcTy = VectorType::getHalfElementsVectorType(SrcVTy);
1117         T *TTI = static_cast<T *>(this);
1118         // If both types need to be split then the split is free.
1119         InstructionCost SplitCost =
1120             (!SplitSrc || !SplitDst) ? TTI->getVectorSplitCost() : 0;
1121         return SplitCost +
1122                (2 * TTI->getCastInstrCost(Opcode, SplitDstTy, SplitSrcTy, CCH,
1123                                           CostKind, I));
1124       }
1125 
      // The scalarization cost is Invalid; we cannot assume any number of
      // elements for a scalable vector.
1127       if (isa<ScalableVectorType>(DstVTy))
1128         return InstructionCost::getInvalid();
1129 
      // In other cases where the source or destination type is illegal, assume
      // the operation will be scalarized.
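      // For example, a conversion such as <4 x float> -> <4 x i8> on a target
      // with no legal vector form of the operation is costed as four scalar
      // casts plus the insert/extract scalarization overhead computed below.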
1132       unsigned Num = cast<FixedVectorType>(DstVTy)->getNumElements();
1133       InstructionCost Cost = thisT()->getCastInstrCost(
1134           Opcode, Dst->getScalarType(), Src->getScalarType(), CCH, CostKind, I);
1135 
      // Return the cost of multiple scalar invocations plus the cost of
      // inserting and extracting the values.
1138       return getScalarizationOverhead(DstVTy, /*Insert*/ true, /*Extract*/ true,
1139                                       CostKind) +
1140              Num * Cost;
1141     }
1142 
    // We already handled vector-to-vector and scalar-to-scalar conversions.
    // This is where we handle bitcasts between vectors and scalars. We need to
    // assume that the conversion is scalarized in one way or another.
1147     if (Opcode == Instruction::BitCast) {
1148       // Illegal bitcasts are done by storing and loading from a stack slot.
1149       return (SrcVTy ? getScalarizationOverhead(SrcVTy, /*Insert*/ false,
1150                                                 /*Extract*/ true, CostKind)
1151                      : 0) +
1152              (DstVTy ? getScalarizationOverhead(DstVTy, /*Insert*/ true,
1153                                                 /*Extract*/ false, CostKind)
1154                      : 0);
1155     }
1156 
1157     llvm_unreachable("Unhandled cast");
1158   }
1159 
1160   InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst,
1161                                            VectorType *VecTy, unsigned Index) {
1162     TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
1163     return thisT()->getVectorInstrCost(Instruction::ExtractElement, VecTy,
1164                                        CostKind, Index, nullptr, nullptr) +
1165            thisT()->getCastInstrCost(Opcode, Dst, VecTy->getElementType(),
1166                                      TTI::CastContextHint::None, CostKind);
1167   }
1168 
1169   InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
1170                                  const Instruction *I = nullptr) {
1171     return BaseT::getCFInstrCost(Opcode, CostKind, I);
1172   }
1173 
1174   InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
1175                                      CmpInst::Predicate VecPred,
1176                                      TTI::TargetCostKind CostKind,
1177                                      const Instruction *I = nullptr) {
1178     const TargetLoweringBase *TLI = getTLI();
1179     int ISD = TLI->InstructionOpcodeToISD(Opcode);
1180     assert(ISD && "Invalid opcode");
1181 
1182     // TODO: Handle other cost kinds.
1183     if (CostKind != TTI::TCK_RecipThroughput)
1184       return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
1185                                        I);
1186 
1187     // Selects on vectors are actually vector selects.
1188     if (ISD == ISD::SELECT) {
1189       assert(CondTy && "CondTy must exist");
1190       if (CondTy->isVectorTy())
1191         ISD = ISD::VSELECT;
1192     }
1193     std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1194 
1195     if (!(ValTy->isVectorTy() && !LT.second.isVector()) &&
1196         !TLI->isOperationExpand(ISD, LT.second)) {
1197       // The operation is legal. Assume it costs 1. Multiply
1198       // by the type-legalization overhead.
1199       return LT.first * 1;
1200     }
1201 
    // Otherwise, assume that the cmp/select is scalarized.
1203     // TODO: If one of the types get legalized by splitting, handle this
1204     // similarly to what getCastInstrCost() does.
1205     if (auto *ValVTy = dyn_cast<VectorType>(ValTy)) {
1206       if (isa<ScalableVectorType>(ValTy))
1207         return InstructionCost::getInvalid();
1208 
1209       unsigned Num = cast<FixedVectorType>(ValVTy)->getNumElements();
1210       if (CondTy)
1211         CondTy = CondTy->getScalarType();
1212       InstructionCost Cost = thisT()->getCmpSelInstrCost(
1213           Opcode, ValVTy->getScalarType(), CondTy, VecPred, CostKind, I);
1214 
      // Return the cost of multiple scalar invocations plus the cost of
      // inserting and extracting the values.
1217       return getScalarizationOverhead(ValVTy, /*Insert*/ true,
1218                                       /*Extract*/ false, CostKind) +
1219              Num * Cost;
1220     }
1221 
1222     // Unknown scalar opcode.
1223     return 1;
1224   }
1225 
1226   InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
1227                                      TTI::TargetCostKind CostKind,
1228                                      unsigned Index, Value *Op0, Value *Op1) {
1229     return getRegUsageForType(Val->getScalarType());
1230   }
1231 
1232   InstructionCost getVectorInstrCost(const Instruction &I, Type *Val,
1233                                      TTI::TargetCostKind CostKind,
1234                                      unsigned Index) {
1235     Value *Op0 = nullptr;
1236     Value *Op1 = nullptr;
1237     if (auto *IE = dyn_cast<InsertElementInst>(&I)) {
1238       Op0 = IE->getOperand(0);
1239       Op1 = IE->getOperand(1);
1240     }
1241     return thisT()->getVectorInstrCost(I.getOpcode(), Val, CostKind, Index, Op0,
1242                                        Op1);
1243   }
1244 
1245   InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor,
1246                                             int VF,
1247                                             const APInt &DemandedDstElts,
1248                                             TTI::TargetCostKind CostKind) {
1249     assert(DemandedDstElts.getBitWidth() == (unsigned)VF * ReplicationFactor &&
1250            "Unexpected size of DemandedDstElts.");
1251 
1252     InstructionCost Cost;
1253 
1254     auto *SrcVT = FixedVectorType::get(EltTy, VF);
1255     auto *ReplicatedVT = FixedVectorType::get(EltTy, VF * ReplicationFactor);
1256 
    // The cost of shuffling the mask is that of extracting all the elements of
    // the mask and inserting each of them Factor times into the wide vector:
1259     //
1260     // E.g. an interleaved group with factor 3:
1261     //    %mask = icmp ult <8 x i32> %vec1, %vec2
1262     //    %interleaved.mask = shufflevector <8 x i1> %mask, <8 x i1> undef,
1263     //        <24 x i32> <0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7>
1264     // The cost is estimated as extract all mask elements from the <8xi1> mask
1265     // vector and insert them factor times into the <24xi1> shuffled mask
1266     // vector.
1267     APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedDstElts, VF);
1268     Cost += thisT()->getScalarizationOverhead(SrcVT, DemandedSrcElts,
1269                                               /*Insert*/ false,
1270                                               /*Extract*/ true, CostKind);
1271     Cost += thisT()->getScalarizationOverhead(ReplicatedVT, DemandedDstElts,
1272                                               /*Insert*/ true,
1273                                               /*Extract*/ false, CostKind);
1274 
1275     return Cost;
1276   }
1277 
1278   InstructionCost
1279   getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,
1280                   unsigned AddressSpace, TTI::TargetCostKind CostKind,
1281                   TTI::OperandValueInfo OpInfo = {TTI::OK_AnyValue, TTI::OP_None},
1282                   const Instruction *I = nullptr) {
1283     assert(!Src->isVoidTy() && "Invalid type");
1284     // Assume types, such as structs, are expensive.
    if (getTLI()->getValueType(DL, Src, true) == MVT::Other)
1286       return 4;
1287     std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
1288 
1289     // Assuming that all loads of legal types cost 1.
1290     InstructionCost Cost = LT.first;
1291     if (CostKind != TTI::TCK_RecipThroughput)
1292       return Cost;
1293 
1294     const DataLayout &DL = this->getDataLayout();
1295     if (Src->isVectorTy() &&
1296         // In practice it's not currently possible to have a change in lane
1297         // length for extending loads or truncating stores so both types should
1298         // have the same scalable property.
1299         TypeSize::isKnownLT(DL.getTypeStoreSizeInBits(Src),
1300                             LT.second.getSizeInBits())) {
      // This is a vector load that legalizes to a larger type than the vector
      // itself. Unless the corresponding extending load or truncating store
      // is legal, this will scalarize.
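      // For example, on a target where a <4 x i8> load legalizes to v4i32,
      // the load must be an extending load; if no legal or custom EXTLOAD
      // exists for that combination, the cost of rebuilding the vector from
      // scalars is added below. (Illustrative only; which types legalize
      // this way is target dependent.)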
1304       TargetLowering::LegalizeAction LA = TargetLowering::Expand;
1305       EVT MemVT = getTLI()->getValueType(DL, Src);
1306       if (Opcode == Instruction::Store)
1307         LA = getTLI()->getTruncStoreAction(LT.second, MemVT);
1308       else
1309         LA = getTLI()->getLoadExtAction(ISD::EXTLOAD, LT.second, MemVT);
1310 
1311       if (LA != TargetLowering::Legal && LA != TargetLowering::Custom) {
1312         // This is a vector load/store for some illegal type that is scalarized.
1313         // We must account for the cost of building or decomposing the vector.
1314         Cost += getScalarizationOverhead(
1315             cast<VectorType>(Src), Opcode != Instruction::Store,
1316             Opcode == Instruction::Store, CostKind);
1317       }
1318     }
1319 
1320     return Cost;
1321   }
1322 
1323   InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy,
1324                                         Align Alignment, unsigned AddressSpace,
1325                                         TTI::TargetCostKind CostKind) {
1326     return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, true, false,
1327                                        CostKind);
1328   }
1329 
1330   InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
1331                                          const Value *Ptr, bool VariableMask,
1332                                          Align Alignment,
1333                                          TTI::TargetCostKind CostKind,
1334                                          const Instruction *I = nullptr) {
1335     return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, VariableMask,
1336                                        true, CostKind);
1337   }
1338 
1339   InstructionCost getInterleavedMemoryOpCost(
1340       unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1341       Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1342       bool UseMaskForCond = false, bool UseMaskForGaps = false) {
1343 
1344     // We cannot scalarize scalable vectors, so return Invalid.
1345     if (isa<ScalableVectorType>(VecTy))
1346       return InstructionCost::getInvalid();
1347 
1348     auto *VT = cast<FixedVectorType>(VecTy);
1349 
1350     unsigned NumElts = VT->getNumElements();
1351     assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");
1352 
1353     unsigned NumSubElts = NumElts / Factor;
1354     auto *SubVT = FixedVectorType::get(VT->getElementType(), NumSubElts);
1355 
    // First, the cost of the load/store operation.
1357     InstructionCost Cost;
1358     if (UseMaskForCond || UseMaskForGaps)
1359       Cost = thisT()->getMaskedMemoryOpCost(Opcode, VecTy, Alignment,
1360                                             AddressSpace, CostKind);
1361     else
1362       Cost = thisT()->getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace,
1363                                       CostKind);
1364 
1365     // Legalize the vector type, and get the legalized and unlegalized type
1366     // sizes.
1367     MVT VecTyLT = getTypeLegalizationCost(VecTy).second;
1368     unsigned VecTySize = thisT()->getDataLayout().getTypeStoreSize(VecTy);
1369     unsigned VecTyLTSize = VecTyLT.getStoreSize();
1370 
1371     // Scale the cost of the memory operation by the fraction of legalized
1372     // instructions that will actually be used. We shouldn't account for the
1373     // cost of dead instructions since they will be removed.
1374     //
1375     // E.g., An interleaved load of factor 8:
1376     //       %vec = load <16 x i64>, <16 x i64>* %ptr
1377     //       %v0 = shufflevector %vec, undef, <0, 8>
1378     //
1379     // If <16 x i64> is legalized to 8 v2i64 loads, only 2 of the loads will be
1380     // used (those corresponding to elements [0:1] and [8:9] of the unlegalized
1381     // type). The other loads are unused.
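    // In that case UsedInsts.count() == 2 and NumLegalInsts == 8, so the
    // memory cost computed above is scaled by 2/8 below.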
1382     //
1383     // TODO: Note that legalization can turn masked loads/stores into unmasked
1384     // (legalized) loads/stores. This can be reflected in the cost.
1385     if (Cost.isValid() && VecTySize > VecTyLTSize) {
1386       // The number of loads of a legal type it will take to represent a load
1387       // of the unlegalized vector type.
1388       unsigned NumLegalInsts = divideCeil(VecTySize, VecTyLTSize);
1389 
1390       // The number of elements of the unlegalized type that correspond to a
1391       // single legal instruction.
1392       unsigned NumEltsPerLegalInst = divideCeil(NumElts, NumLegalInsts);
1393 
1394       // Determine which legal instructions will be used.
1395       BitVector UsedInsts(NumLegalInsts, false);
1396       for (unsigned Index : Indices)
1397         for (unsigned Elt = 0; Elt < NumSubElts; ++Elt)
1398           UsedInsts.set((Index + Elt * Factor) / NumEltsPerLegalInst);
1399 
1400       // Scale the cost of the load by the fraction of legal instructions that
1401       // will be used.
1402       Cost = divideCeil(UsedInsts.count() * *Cost.getValue(), NumLegalInsts);
1403     }
1404 
    // Then add the cost of the interleave operation.
1406     assert(Indices.size() <= Factor &&
1407            "Interleaved memory op has too many members");
1408 
1409     const APInt DemandedAllSubElts = APInt::getAllOnes(NumSubElts);
1410     const APInt DemandedAllResultElts = APInt::getAllOnes(NumElts);
1411 
1412     APInt DemandedLoadStoreElts = APInt::getZero(NumElts);
1413     for (unsigned Index : Indices) {
1414       assert(Index < Factor && "Invalid index for interleaved memory op");
1415       for (unsigned Elm = 0; Elm < NumSubElts; Elm++)
1416         DemandedLoadStoreElts.setBit(Index + Elm * Factor);
1417     }
1418 
1419     if (Opcode == Instruction::Load) {
      // The interleave cost is similar to extracting the sub-vectors'
      // elements from the wide vector and inserting them into the sub-vectors.
1422       //
1423       // E.g. An interleaved load of factor 2 (with one member of index 0):
1424       //      %vec = load <8 x i32>, <8 x i32>* %ptr
1425       //      %v0 = shuffle %vec, undef, <0, 2, 4, 6>         ; Index 0
      // The cost is estimated as extracting elements at indices 0, 2, 4, 6
      // from the <8 x i32> vector and inserting them into a <4 x i32> vector.
1428       InstructionCost InsSubCost = thisT()->getScalarizationOverhead(
1429           SubVT, DemandedAllSubElts,
1430           /*Insert*/ true, /*Extract*/ false, CostKind);
1431       Cost += Indices.size() * InsSubCost;
1432       Cost += thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts,
1433                                                 /*Insert*/ false,
1434                                                 /*Extract*/ true, CostKind);
1435     } else {
      // The interleave cost is that of extracting elements from the
      // sub-vectors and inserting them into the wide vector.
1438       //
1439       // E.g. An interleaved store of factor 3 with 2 members at indices 0,1:
1440       // (using VF=4):
1441       //    %v0_v1 = shuffle %v0, %v1, <0,4,undef,1,5,undef,2,6,undef,3,7,undef>
1442       //    %gaps.mask = <true, true, false, true, true, false,
1443       //                  true, true, false, true, true, false>
1444       //    call llvm.masked.store <12 x i32> %v0_v1, <12 x i32>* %ptr,
1445       //                           i32 Align, <12 x i1> %gaps.mask
      // The cost is estimated as extracting all elements (of the actual
      // members, excluding gaps) from both <4 x i32> vectors and inserting
      // them into the <12 x i32> vector.
1449       InstructionCost ExtSubCost = thisT()->getScalarizationOverhead(
1450           SubVT, DemandedAllSubElts,
1451           /*Insert*/ false, /*Extract*/ true, CostKind);
1452       Cost += ExtSubCost * Indices.size();
1453       Cost += thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts,
1454                                                 /*Insert*/ true,
1455                                                 /*Extract*/ false, CostKind);
1456     }
1457 
1458     if (!UseMaskForCond)
1459       return Cost;
1460 
1461     Type *I8Type = Type::getInt8Ty(VT->getContext());
1462 
1463     Cost += thisT()->getReplicationShuffleCost(
1464         I8Type, Factor, NumSubElts,
1465         UseMaskForGaps ? DemandedLoadStoreElts : DemandedAllResultElts,
1466         CostKind);
1467 
    // The Gaps mask is invariant and created outside the loop; therefore the
    // cost of creating it is not accounted for here. However, if we have both
    // a MaskForGaps and some other mask that guards the execution of the
    // memory access, we need to account for the cost of And-ing the two masks
    // inside the loop.
1473     if (UseMaskForGaps) {
1474       auto *MaskVT = FixedVectorType::get(I8Type, NumElts);
1475       Cost += thisT()->getArithmeticInstrCost(BinaryOperator::And, MaskVT,
1476                                               CostKind);
1477     }
1478 
1479     return Cost;
1480   }
1481 
1482   /// Get intrinsic cost based on arguments.
1483   InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1484                                         TTI::TargetCostKind CostKind) {
1485     // Check for generically free intrinsics.
1486     if (BaseT::getIntrinsicInstrCost(ICA, CostKind) == 0)
1487       return 0;
1488 
1489     // Assume that target intrinsics are cheap.
1490     Intrinsic::ID IID = ICA.getID();
1491     if (Function::isTargetIntrinsic(IID))
1492       return TargetTransformInfo::TCC_Basic;
1493 
1494     if (ICA.isTypeBasedOnly())
1495       return getTypeBasedIntrinsicInstrCost(ICA, CostKind);
1496 
1497     Type *RetTy = ICA.getReturnType();
1498 
1499     ElementCount RetVF =
1500         (RetTy->isVectorTy() ? cast<VectorType>(RetTy)->getElementCount()
1501                              : ElementCount::getFixed(1));
1502     const IntrinsicInst *I = ICA.getInst();
1503     const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
1504     FastMathFlags FMF = ICA.getFlags();
1505     switch (IID) {
1506     default:
1507       break;
1508 
1509     case Intrinsic::powi:
1510       if (auto *RHSC = dyn_cast<ConstantInt>(Args[1])) {
1511         bool ShouldOptForSize = I->getParent()->getParent()->hasOptSize();
1512         if (getTLI()->isBeneficialToExpandPowI(RHSC->getSExtValue(),
1513                                                ShouldOptForSize)) {
1514           // The cost is modeled on the expansion performed by ExpandPowI in
1515           // SelectionDAGBuilder.
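          // E.g. powi(X, 5): the exponent is 0b101 (ActiveBits = 3,
          // PopCount = 2), so the expansion needs 3 + 2 - 2 = 3 fmuls via
          // repeated squaring (X2 = X*X; X4 = X2*X2; X4*X), plus an fdiv
          // when the exponent is negative.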
1516           APInt Exponent = RHSC->getValue().abs();
1517           unsigned ActiveBits = Exponent.getActiveBits();
1518           unsigned PopCount = Exponent.popcount();
1519           InstructionCost Cost = (ActiveBits + PopCount - 2) *
1520                                  thisT()->getArithmeticInstrCost(
1521                                      Instruction::FMul, RetTy, CostKind);
1522           if (RHSC->isNegative())
1523             Cost += thisT()->getArithmeticInstrCost(Instruction::FDiv, RetTy,
1524                                                     CostKind);
1525           return Cost;
1526         }
1527       }
1528       break;
1529     case Intrinsic::cttz:
1530       // FIXME: If necessary, this should go in target-specific overrides.
1531       if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCttz(RetTy))
1532         return TargetTransformInfo::TCC_Basic;
1533       break;
1534 
1535     case Intrinsic::ctlz:
1536       // FIXME: If necessary, this should go in target-specific overrides.
1537       if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCtlz(RetTy))
1538         return TargetTransformInfo::TCC_Basic;
1539       break;
1540 
1541     case Intrinsic::memcpy:
1542       return thisT()->getMemcpyCost(ICA.getInst());
1543 
1544     case Intrinsic::masked_scatter: {
1545       const Value *Mask = Args[3];
1546       bool VarMask = !isa<Constant>(Mask);
1547       Align Alignment = cast<ConstantInt>(Args[2])->getAlignValue();
1548       return thisT()->getGatherScatterOpCost(Instruction::Store,
1549                                              ICA.getArgTypes()[0], Args[1],
1550                                              VarMask, Alignment, CostKind, I);
1551     }
1552     case Intrinsic::masked_gather: {
1553       const Value *Mask = Args[2];
1554       bool VarMask = !isa<Constant>(Mask);
1555       Align Alignment = cast<ConstantInt>(Args[1])->getAlignValue();
1556       return thisT()->getGatherScatterOpCost(Instruction::Load, RetTy, Args[0],
1557                                              VarMask, Alignment, CostKind, I);
1558     }
1559     case Intrinsic::experimental_stepvector: {
1560       if (isa<ScalableVectorType>(RetTy))
1561         return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1562       // The cost of materialising a constant integer vector.
1563       return TargetTransformInfo::TCC_Basic;
1564     }
1565     case Intrinsic::vector_extract: {
1566       // FIXME: Handle case where a scalable vector is extracted from a scalable
1567       // vector
1568       if (isa<ScalableVectorType>(RetTy))
1569         return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1570       unsigned Index = cast<ConstantInt>(Args[1])->getZExtValue();
1571       return thisT()->getShuffleCost(
1572           TTI::SK_ExtractSubvector, cast<VectorType>(Args[0]->getType()),
1573           std::nullopt, CostKind, Index, cast<VectorType>(RetTy));
1574     }
1575     case Intrinsic::vector_insert: {
1576       // FIXME: Handle case where a scalable vector is inserted into a scalable
1577       // vector
1578       if (isa<ScalableVectorType>(Args[1]->getType()))
1579         return BaseT::getIntrinsicInstrCost(ICA, CostKind);
1580       unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue();
1581       return thisT()->getShuffleCost(
1582           TTI::SK_InsertSubvector, cast<VectorType>(Args[0]->getType()),
1583           std::nullopt, CostKind, Index, cast<VectorType>(Args[1]->getType()));
1584     }
1585     case Intrinsic::experimental_vector_reverse: {
1586       return thisT()->getShuffleCost(
1587           TTI::SK_Reverse, cast<VectorType>(Args[0]->getType()), std::nullopt,
1588           CostKind, 0, cast<VectorType>(RetTy));
1589     }
1590     case Intrinsic::experimental_vector_splice: {
1591       unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue();
1592       return thisT()->getShuffleCost(
1593           TTI::SK_Splice, cast<VectorType>(Args[0]->getType()), std::nullopt,
1594           CostKind, Index, cast<VectorType>(RetTy));
1595     }
1596     case Intrinsic::vector_reduce_add:
1597     case Intrinsic::vector_reduce_mul:
1598     case Intrinsic::vector_reduce_and:
1599     case Intrinsic::vector_reduce_or:
1600     case Intrinsic::vector_reduce_xor:
1601     case Intrinsic::vector_reduce_smax:
1602     case Intrinsic::vector_reduce_smin:
1603     case Intrinsic::vector_reduce_fmax:
1604     case Intrinsic::vector_reduce_fmin:
1605     case Intrinsic::vector_reduce_fmaximum:
1606     case Intrinsic::vector_reduce_fminimum:
1607     case Intrinsic::vector_reduce_umax:
1608     case Intrinsic::vector_reduce_umin: {
1609       IntrinsicCostAttributes Attrs(IID, RetTy, Args[0]->getType(), FMF, I, 1);
1610       return getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1611     }
1612     case Intrinsic::vector_reduce_fadd:
1613     case Intrinsic::vector_reduce_fmul: {
1614       IntrinsicCostAttributes Attrs(
1615           IID, RetTy, {Args[0]->getType(), Args[1]->getType()}, FMF, I, 1);
1616       return getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1617     }
1618     case Intrinsic::fshl:
1619     case Intrinsic::fshr: {
1620       const Value *X = Args[0];
1621       const Value *Y = Args[1];
1622       const Value *Z = Args[2];
1623       const TTI::OperandValueInfo OpInfoX = TTI::getOperandInfo(X);
1624       const TTI::OperandValueInfo OpInfoY = TTI::getOperandInfo(Y);
1625       const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(Z);
1626       const TTI::OperandValueInfo OpInfoBW =
1627         {TTI::OK_UniformConstantValue,
1628          isPowerOf2_32(RetTy->getScalarSizeInBits()) ? TTI::OP_PowerOf2
1629          : TTI::OP_None};
1630 
1631       // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
1632       // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
1633       InstructionCost Cost = 0;
1634       Cost +=
1635           thisT()->getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind);
1636       Cost +=
1637           thisT()->getArithmeticInstrCost(BinaryOperator::Sub, RetTy, CostKind);
1638       Cost += thisT()->getArithmeticInstrCost(
1639           BinaryOperator::Shl, RetTy, CostKind, OpInfoX,
1640           {OpInfoZ.Kind, TTI::OP_None});
1641       Cost += thisT()->getArithmeticInstrCost(
1642           BinaryOperator::LShr, RetTy, CostKind, OpInfoY,
1643           {OpInfoZ.Kind, TTI::OP_None});
      // Non-constant shift amounts require a modulo.
1645       if (!OpInfoZ.isConstant())
1646         Cost += thisT()->getArithmeticInstrCost(BinaryOperator::URem, RetTy,
1647                                                 CostKind, OpInfoZ, OpInfoBW);
1648       // For non-rotates (X != Y) we must add shift-by-zero handling costs.
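      // (When Z % BW == 0 the complementary shift amount is BW, which is out
      // of range, so the expansion compares the shift amount with zero and
      // selects the unshifted input; rotates can typically avoid this.)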
1649       if (X != Y) {
1650         Type *CondTy = RetTy->getWithNewBitWidth(1);
1651         Cost +=
1652             thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
1653                                         CmpInst::ICMP_EQ, CostKind);
1654         Cost +=
1655             thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1656                                         CmpInst::ICMP_EQ, CostKind);
1657       }
1658       return Cost;
1659     }
1660     case Intrinsic::get_active_lane_mask: {
1661       EVT ResVT = getTLI()->getValueType(DL, RetTy, true);
1662       EVT ArgType = getTLI()->getValueType(DL, ICA.getArgTypes()[0], true);
1663 
1664       // If we're not expanding the intrinsic then we assume this is cheap
1665       // to implement.
1666       if (!getTLI()->shouldExpandGetActiveLaneMask(ResVT, ArgType)) {
1667         return getTypeLegalizationCost(RetTy).first;
1668       }
1669 
1670       // Create the expanded types that will be used to calculate the uadd_sat
1671       // operation.
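      // The expansion costed below is roughly
      //   %lanes = uadd_sat(splat(%base), stepvector)
      //   %mask  = icmp ult %lanes, splat(%n)
      // (a sketch only; the exact lowering is target dependent).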
1672       Type *ExpRetTy = VectorType::get(
1673           ICA.getArgTypes()[0], cast<VectorType>(RetTy)->getElementCount());
1674       IntrinsicCostAttributes Attrs(Intrinsic::uadd_sat, ExpRetTy, {}, FMF);
1675       InstructionCost Cost =
1676           thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1677       Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, ExpRetTy, RetTy,
1678                                           CmpInst::ICMP_ULT, CostKind);
1679       return Cost;
1680     }
1681     }
1682 
1683     // Assume that we need to scalarize this intrinsic.
1684     // Compute the scalarization overhead based on Args for a vector
1685     // intrinsic.
1686     InstructionCost ScalarizationCost = InstructionCost::getInvalid();
1687     if (RetVF.isVector() && !RetVF.isScalable()) {
1688       ScalarizationCost = 0;
1689       if (!RetTy->isVoidTy())
1690         ScalarizationCost += getScalarizationOverhead(
1691             cast<VectorType>(RetTy),
1692             /*Insert*/ true, /*Extract*/ false, CostKind);
1693       ScalarizationCost +=
1694           getOperandsScalarizationOverhead(Args, ICA.getArgTypes(), CostKind);
1695     }
1696 
1697     IntrinsicCostAttributes Attrs(IID, RetTy, ICA.getArgTypes(), FMF, I,
1698                                   ScalarizationCost);
1699     return thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1700   }
1701 
1702   /// Get intrinsic cost based on argument types.
  /// If the scalarization cost carried by ICA is invalid, the cost of
  /// scalarizing the arguments and the return value will be computed based
  /// on types.
1706   InstructionCost
1707   getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1708                                  TTI::TargetCostKind CostKind) {
1709     Intrinsic::ID IID = ICA.getID();
1710     Type *RetTy = ICA.getReturnType();
1711     const SmallVectorImpl<Type *> &Tys = ICA.getArgTypes();
1712     FastMathFlags FMF = ICA.getFlags();
1713     InstructionCost ScalarizationCostPassed = ICA.getScalarizationCost();
1714     bool SkipScalarizationCost = ICA.skipScalarizationCost();
1715 
1716     VectorType *VecOpTy = nullptr;
1717     if (!Tys.empty()) {
1718       // The vector reduction operand is operand 0 except for fadd/fmul.
1719       // Their operand 0 is a scalar start value, so the vector op is operand 1.
1720       unsigned VecTyIndex = 0;
1721       if (IID == Intrinsic::vector_reduce_fadd ||
1722           IID == Intrinsic::vector_reduce_fmul)
1723         VecTyIndex = 1;
1724       assert(Tys.size() > VecTyIndex && "Unexpected IntrinsicCostAttributes");
1725       VecOpTy = dyn_cast<VectorType>(Tys[VecTyIndex]);
1726     }
1727 
1728     // Library call cost - other than size, make it expensive.
1729     unsigned SingleCallCost = CostKind == TTI::TCK_CodeSize ? 1 : 10;
1730     unsigned ISD = 0;
1731     switch (IID) {
1732     default: {
1733       // Scalable vectors cannot be scalarized, so return Invalid.
1734       if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) {
1735             return isa<ScalableVectorType>(Ty);
1736           }))
1737         return InstructionCost::getInvalid();
1738 
1739       // Assume that we need to scalarize this intrinsic.
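      // E.g. a <4 x float> llvm.sin is costed as 4 scalar sin calls plus the
      // overhead of extracting the 4 scalar operands and inserting the 4
      // scalar results (unless a precomputed scalarization cost was passed
      // in via ICA).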
1740       InstructionCost ScalarizationCost =
1741           SkipScalarizationCost ? ScalarizationCostPassed : 0;
1742       unsigned ScalarCalls = 1;
1743       Type *ScalarRetTy = RetTy;
1744       if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
1745         if (!SkipScalarizationCost)
1746           ScalarizationCost = getScalarizationOverhead(
1747               RetVTy, /*Insert*/ true, /*Extract*/ false, CostKind);
1748         ScalarCalls = std::max(ScalarCalls,
1749                                cast<FixedVectorType>(RetVTy)->getNumElements());
1750         ScalarRetTy = RetTy->getScalarType();
1751       }
1752       SmallVector<Type *, 4> ScalarTys;
1753       for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
1754         Type *Ty = Tys[i];
1755         if (auto *VTy = dyn_cast<VectorType>(Ty)) {
1756           if (!SkipScalarizationCost)
1757             ScalarizationCost += getScalarizationOverhead(
1758                 VTy, /*Insert*/ false, /*Extract*/ true, CostKind);
1759           ScalarCalls = std::max(ScalarCalls,
1760                                  cast<FixedVectorType>(VTy)->getNumElements());
1761           Ty = Ty->getScalarType();
1762         }
1763         ScalarTys.push_back(Ty);
1764       }
1765       if (ScalarCalls == 1)
1766         return 1; // Return cost of a scalar intrinsic. Assume it to be cheap.
1767 
1768       IntrinsicCostAttributes ScalarAttrs(IID, ScalarRetTy, ScalarTys, FMF);
1769       InstructionCost ScalarCost =
1770           thisT()->getIntrinsicInstrCost(ScalarAttrs, CostKind);
1771 
1772       return ScalarCalls * ScalarCost + ScalarizationCost;
1773     }
1774     // Look for intrinsics that can be lowered directly or turned into a scalar
1775     // intrinsic call.
1776     case Intrinsic::sqrt:
1777       ISD = ISD::FSQRT;
1778       break;
1779     case Intrinsic::sin:
1780       ISD = ISD::FSIN;
1781       break;
1782     case Intrinsic::cos:
1783       ISD = ISD::FCOS;
1784       break;
1785     case Intrinsic::exp:
1786       ISD = ISD::FEXP;
1787       break;
1788     case Intrinsic::exp2:
1789       ISD = ISD::FEXP2;
1790       break;
1791     case Intrinsic::log:
1792       ISD = ISD::FLOG;
1793       break;
1794     case Intrinsic::log10:
1795       ISD = ISD::FLOG10;
1796       break;
1797     case Intrinsic::log2:
1798       ISD = ISD::FLOG2;
1799       break;
1800     case Intrinsic::fabs:
1801       ISD = ISD::FABS;
1802       break;
1803     case Intrinsic::canonicalize:
1804       ISD = ISD::FCANONICALIZE;
1805       break;
1806     case Intrinsic::minnum:
1807       ISD = ISD::FMINNUM;
1808       break;
1809     case Intrinsic::maxnum:
1810       ISD = ISD::FMAXNUM;
1811       break;
1812     case Intrinsic::minimum:
1813       ISD = ISD::FMINIMUM;
1814       break;
1815     case Intrinsic::maximum:
1816       ISD = ISD::FMAXIMUM;
1817       break;
1818     case Intrinsic::copysign:
1819       ISD = ISD::FCOPYSIGN;
1820       break;
1821     case Intrinsic::floor:
1822       ISD = ISD::FFLOOR;
1823       break;
1824     case Intrinsic::ceil:
1825       ISD = ISD::FCEIL;
1826       break;
1827     case Intrinsic::trunc:
1828       ISD = ISD::FTRUNC;
1829       break;
1830     case Intrinsic::nearbyint:
1831       ISD = ISD::FNEARBYINT;
1832       break;
1833     case Intrinsic::rint:
1834       ISD = ISD::FRINT;
1835       break;
1836     case Intrinsic::round:
1837       ISD = ISD::FROUND;
1838       break;
1839     case Intrinsic::roundeven:
1840       ISD = ISD::FROUNDEVEN;
1841       break;
1842     case Intrinsic::pow:
1843       ISD = ISD::FPOW;
1844       break;
1845     case Intrinsic::fma:
1846       ISD = ISD::FMA;
1847       break;
1848     case Intrinsic::fmuladd:
1849       ISD = ISD::FMA;
1850       break;
1851     case Intrinsic::experimental_constrained_fmuladd:
1852       ISD = ISD::STRICT_FMA;
1853       break;
1854     // FIXME: We should return 0 whenever getIntrinsicCost == TCC_Free.
1855     case Intrinsic::lifetime_start:
1856     case Intrinsic::lifetime_end:
1857     case Intrinsic::sideeffect:
1858     case Intrinsic::pseudoprobe:
1859     case Intrinsic::arithmetic_fence:
1860       return 0;
1861     case Intrinsic::masked_store: {
1862       Type *Ty = Tys[0];
1863       Align TyAlign = thisT()->DL.getABITypeAlign(Ty);
1864       return thisT()->getMaskedMemoryOpCost(Instruction::Store, Ty, TyAlign, 0,
1865                                             CostKind);
1866     }
1867     case Intrinsic::masked_load: {
1868       Type *Ty = RetTy;
1869       Align TyAlign = thisT()->DL.getABITypeAlign(Ty);
1870       return thisT()->getMaskedMemoryOpCost(Instruction::Load, Ty, TyAlign, 0,
1871                                             CostKind);
1872     }
1873     case Intrinsic::vector_reduce_add:
1874       return thisT()->getArithmeticReductionCost(Instruction::Add, VecOpTy,
1875                                                  std::nullopt, CostKind);
1876     case Intrinsic::vector_reduce_mul:
1877       return thisT()->getArithmeticReductionCost(Instruction::Mul, VecOpTy,
1878                                                  std::nullopt, CostKind);
1879     case Intrinsic::vector_reduce_and:
1880       return thisT()->getArithmeticReductionCost(Instruction::And, VecOpTy,
1881                                                  std::nullopt, CostKind);
1882     case Intrinsic::vector_reduce_or:
1883       return thisT()->getArithmeticReductionCost(Instruction::Or, VecOpTy,
1884                                                  std::nullopt, CostKind);
1885     case Intrinsic::vector_reduce_xor:
1886       return thisT()->getArithmeticReductionCost(Instruction::Xor, VecOpTy,
1887                                                  std::nullopt, CostKind);
1888     case Intrinsic::vector_reduce_fadd:
1889       return thisT()->getArithmeticReductionCost(Instruction::FAdd, VecOpTy,
1890                                                  FMF, CostKind);
1891     case Intrinsic::vector_reduce_fmul:
1892       return thisT()->getArithmeticReductionCost(Instruction::FMul, VecOpTy,
1893                                                  FMF, CostKind);
1894     case Intrinsic::vector_reduce_smax:
1895       return thisT()->getMinMaxReductionCost(Intrinsic::smax, VecOpTy,
1896                                              ICA.getFlags(), CostKind);
1897     case Intrinsic::vector_reduce_smin:
1898       return thisT()->getMinMaxReductionCost(Intrinsic::smin, VecOpTy,
1899                                              ICA.getFlags(), CostKind);
1900     case Intrinsic::vector_reduce_umax:
1901       return thisT()->getMinMaxReductionCost(Intrinsic::umax, VecOpTy,
1902                                              ICA.getFlags(), CostKind);
1903     case Intrinsic::vector_reduce_umin:
1904       return thisT()->getMinMaxReductionCost(Intrinsic::umin, VecOpTy,
1905                                              ICA.getFlags(), CostKind);
1906     case Intrinsic::vector_reduce_fmax:
1907       return thisT()->getMinMaxReductionCost(Intrinsic::maxnum, VecOpTy,
1908                                              ICA.getFlags(), CostKind);
1909     case Intrinsic::vector_reduce_fmin:
1910       return thisT()->getMinMaxReductionCost(Intrinsic::minnum, VecOpTy,
1911                                              ICA.getFlags(), CostKind);
1912     case Intrinsic::vector_reduce_fmaximum:
1913       return thisT()->getMinMaxReductionCost(Intrinsic::maximum, VecOpTy,
1914                                              ICA.getFlags(), CostKind);
1915     case Intrinsic::vector_reduce_fminimum:
1916       return thisT()->getMinMaxReductionCost(Intrinsic::minimum, VecOpTy,
1917                                              ICA.getFlags(), CostKind);
1918     case Intrinsic::abs: {
1919       // abs(X) = select(icmp(X,0),X,sub(0,X))
1920       Type *CondTy = RetTy->getWithNewBitWidth(1);
1921       CmpInst::Predicate Pred = CmpInst::ICMP_SGT;
1922       InstructionCost Cost = 0;
1923       Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
1924                                           Pred, CostKind);
1925       Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1926                                           Pred, CostKind);
1927       // TODO: Should we add an OperandValueProperties::OP_Zero property?
      Cost += thisT()->getArithmeticInstrCost(
          BinaryOperator::Sub, RetTy, CostKind,
          {TTI::OK_UniformConstantValue, TTI::OP_None});
1930       return Cost;
1931     }
1932     case Intrinsic::smax:
1933     case Intrinsic::smin:
1934     case Intrinsic::umax:
1935     case Intrinsic::umin: {
1936       // minmax(X,Y) = select(icmp(X,Y),X,Y)
1937       Type *CondTy = RetTy->getWithNewBitWidth(1);
1938       bool IsUnsigned = IID == Intrinsic::umax || IID == Intrinsic::umin;
1939       CmpInst::Predicate Pred =
1940           IsUnsigned ? CmpInst::ICMP_UGT : CmpInst::ICMP_SGT;
1941       InstructionCost Cost = 0;
1942       Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
1943                                           Pred, CostKind);
1944       Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1945                                           Pred, CostKind);
1946       return Cost;
1947     }
1948     case Intrinsic::sadd_sat:
1949     case Intrinsic::ssub_sat: {
1950       Type *CondTy = RetTy->getWithNewBitWidth(1);
1951 
1952       Type *OpTy = StructType::create({RetTy, CondTy});
1953       Intrinsic::ID OverflowOp = IID == Intrinsic::sadd_sat
1954                                      ? Intrinsic::sadd_with_overflow
1955                                      : Intrinsic::ssub_with_overflow;
1956       CmpInst::Predicate Pred = CmpInst::ICMP_SGT;
1957 
1958       // SatMax -> Overflow && SumDiff < 0
1959       // SatMin -> Overflow && SumDiff >= 0
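      // i.e. roughly:
      //   {SumDiff, Overflow} = s{add,sub}_with_overflow(X, Y)
      //   Sat = select(SumDiff < 0, SatMax, SatMin)
      //   Res = select(Overflow, Sat, SumDiff)
      // which accounts for the overflow intrinsic, one compare and two
      // selects below.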
1960       InstructionCost Cost = 0;
1961       IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
1962                                     nullptr, ScalarizationCostPassed);
1963       Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
1964       Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
1965                                           Pred, CostKind);
1966       Cost += 2 * thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy,
1967                                               CondTy, Pred, CostKind);
1968       return Cost;
1969     }
1970     case Intrinsic::uadd_sat:
1971     case Intrinsic::usub_sat: {
1972       Type *CondTy = RetTy->getWithNewBitWidth(1);
1973 
1974       Type *OpTy = StructType::create({RetTy, CondTy});
1975       Intrinsic::ID OverflowOp = IID == Intrinsic::uadd_sat
1976                                      ? Intrinsic::uadd_with_overflow
1977                                      : Intrinsic::usub_with_overflow;
1978 
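      // Roughly:
      //   {Sum, Overflow} = u{add,sub}_with_overflow(X, Y)
      //   Res = select(Overflow, SatVal, Sum)
      // where SatVal is all-ones for uadd_sat and zero for usub_sat, so we
      // cost the overflow intrinsic plus a single select.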
1979       InstructionCost Cost = 0;
1980       IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
1981                                     nullptr, ScalarizationCostPassed);
1982       Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
1983       Cost +=
1984           thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1985                                       CmpInst::BAD_ICMP_PREDICATE, CostKind);
1986       return Cost;
1987     }
1988     case Intrinsic::smul_fix:
1989     case Intrinsic::umul_fix: {
1990       unsigned ExtSize = RetTy->getScalarSizeInBits() * 2;
1991       Type *ExtTy = RetTy->getWithNewBitWidth(ExtSize);
1992 
1993       unsigned ExtOp =
1994           IID == Intrinsic::smul_fix ? Instruction::SExt : Instruction::ZExt;
1995       TTI::CastContextHint CCH = TTI::CastContextHint::None;
1996 
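      // Roughly modelled as a double-width multiply whose halves are
      // recombined after shifting by the scale:
      //   %m  = mul (ext %X), (ext %Y)
      //   %lo = trunc %m          ; %hi = trunc of the upper half
      //   %r  = (%lo lshr Scale) | (%hi shl (Width - Scale))
      // (a sketch only; the actual lowering is target dependent).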
1997       InstructionCost Cost = 0;
1998       Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, RetTy, CCH, CostKind);
1999       Cost +=
2000           thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
2001       Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, RetTy, ExtTy,
2002                                             CCH, CostKind);
2003       Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, RetTy,
2004                                               CostKind,
2005                                               {TTI::OK_AnyValue, TTI::OP_None},
2006                                               {TTI::OK_UniformConstantValue, TTI::OP_None});
2007       Cost += thisT()->getArithmeticInstrCost(Instruction::Shl, RetTy, CostKind,
2008                                               {TTI::OK_AnyValue, TTI::OP_None},
2009                                               {TTI::OK_UniformConstantValue, TTI::OP_None});
2010       Cost += thisT()->getArithmeticInstrCost(Instruction::Or, RetTy, CostKind);
2011       return Cost;
2012     }
2013     case Intrinsic::sadd_with_overflow:
2014     case Intrinsic::ssub_with_overflow: {
2015       Type *SumTy = RetTy->getContainedType(0);
2016       Type *OverflowTy = RetTy->getContainedType(1);
2017       unsigned Opcode = IID == Intrinsic::sadd_with_overflow
2018                             ? BinaryOperator::Add
2019                             : BinaryOperator::Sub;
2020 
2021       //   Add:
2022       //   Overflow -> (Result < LHS) ^ (RHS < 0)
2023       //   Sub:
2024       //   Overflow -> (Result < LHS) ^ (RHS > 0)
2025       InstructionCost Cost = 0;
2026       Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind);
2027       Cost += 2 * thisT()->getCmpSelInstrCost(
2028                       Instruction::ICmp, SumTy, OverflowTy,
2029                       CmpInst::ICMP_SGT, CostKind);
2030       Cost += thisT()->getArithmeticInstrCost(BinaryOperator::Xor, OverflowTy,
2031                                               CostKind);
2032       return Cost;
2033     }
2034     case Intrinsic::uadd_with_overflow:
2035     case Intrinsic::usub_with_overflow: {
2036       Type *SumTy = RetTy->getContainedType(0);
2037       Type *OverflowTy = RetTy->getContainedType(1);
2038       unsigned Opcode = IID == Intrinsic::uadd_with_overflow
2039                             ? BinaryOperator::Add
2040                             : BinaryOperator::Sub;
2041       CmpInst::Predicate Pred = IID == Intrinsic::uadd_with_overflow
2042                                     ? CmpInst::ICMP_ULT
2043                                     : CmpInst::ICMP_UGT;
2044 
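      //   Add:
      //   Overflow -> (Result u< LHS)
      //   Sub:
      //   Overflow -> (Result u> LHS)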
2045       InstructionCost Cost = 0;
2046       Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind);
2047       Cost +=
2048           thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SumTy, OverflowTy,
2049                                       Pred, CostKind);
2050       return Cost;
2051     }
2052     case Intrinsic::smul_with_overflow:
2053     case Intrinsic::umul_with_overflow: {
2054       Type *MulTy = RetTy->getContainedType(0);
2055       Type *OverflowTy = RetTy->getContainedType(1);
2056       unsigned ExtSize = MulTy->getScalarSizeInBits() * 2;
2057       Type *ExtTy = MulTy->getWithNewBitWidth(ExtSize);
2058       bool IsSigned = IID == Intrinsic::smul_with_overflow;
2059 
2060       unsigned ExtOp = IsSigned ? Instruction::SExt : Instruction::ZExt;
2061       TTI::CastContextHint CCH = TTI::CastContextHint::None;
2062 
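      // Roughly: compute the product in the double-width type and check that
      // the high half is just the sign/zero-extension of the low half:
      //   %m  = mul (ext %X), (ext %Y)
      //   %lo = trunc %m          ; %hi = trunc (%m lshr Width)
      //   unsigned: Overflow -> %hi != 0
      //   signed:   Overflow -> %hi != (%lo ashr (Width - 1))
      // (a sketch only; the actual lowering is target dependent).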
2063       InstructionCost Cost = 0;
2064       Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, MulTy, CCH, CostKind);
2065       Cost +=
2066           thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
2067       Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, MulTy, ExtTy,
2068                                             CCH, CostKind);
2069       Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, ExtTy,
2070                                               CostKind,
2071                                               {TTI::OK_AnyValue, TTI::OP_None},
2072                                               {TTI::OK_UniformConstantValue, TTI::OP_None});
2073 
2074       if (IsSigned)
2075         Cost += thisT()->getArithmeticInstrCost(Instruction::AShr, MulTy,
2076                                                 CostKind,
2077                                                 {TTI::OK_AnyValue, TTI::OP_None},
2078                                                 {TTI::OK_UniformConstantValue, TTI::OP_None});
2079 
2080       Cost += thisT()->getCmpSelInstrCost(
2081           BinaryOperator::ICmp, MulTy, OverflowTy, CmpInst::ICMP_NE, CostKind);
2082       return Cost;
2083     }
2084     case Intrinsic::fptosi_sat:
2085     case Intrinsic::fptoui_sat: {
2086       if (Tys.empty())
2087         break;
2088       Type *FromTy = Tys[0];
2089       bool IsSigned = IID == Intrinsic::fptosi_sat;
2090 
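      // Roughly: clamp the input into the representable range, convert, and
      // (for the signed case) force NaN to zero:
      //   %c = maxnum(minnum(%x, MaxFP), MinFP)
      //   %i = fpto{s,u}i %c
      //   signed only: %r = select (fcmp uno %x, %x), 0, %i
      // (a sketch only; the actual lowering is target dependent).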
2091       InstructionCost Cost = 0;
2092       IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FromTy,
2093                                      {FromTy, FromTy});
2094       Cost += thisT()->getIntrinsicInstrCost(Attrs1, CostKind);
2095       IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FromTy,
2096                                      {FromTy, FromTy});
2097       Cost += thisT()->getIntrinsicInstrCost(Attrs2, CostKind);
2098       Cost += thisT()->getCastInstrCost(
2099           IsSigned ? Instruction::FPToSI : Instruction::FPToUI, RetTy, FromTy,
2100           TTI::CastContextHint::None, CostKind);
2101       if (IsSigned) {
2102         Type *CondTy = RetTy->getWithNewBitWidth(1);
2103         Cost += thisT()->getCmpSelInstrCost(
2104             BinaryOperator::FCmp, FromTy, CondTy, CmpInst::FCMP_UNO, CostKind);
2105         Cost += thisT()->getCmpSelInstrCost(
2106             BinaryOperator::Select, RetTy, CondTy, CmpInst::FCMP_UNO, CostKind);
2107       }
2108       return Cost;
2109     }
2110     case Intrinsic::ctpop:
2111       ISD = ISD::CTPOP;
2112       // In case of legalization use TCC_Expensive. This is cheaper than a
2113       // library call but still not a cheap instruction.
2114       SingleCallCost = TargetTransformInfo::TCC_Expensive;
2115       break;
2116     case Intrinsic::ctlz:
2117       ISD = ISD::CTLZ;
2118       break;
2119     case Intrinsic::cttz:
2120       ISD = ISD::CTTZ;
2121       break;
2122     case Intrinsic::bswap:
2123       ISD = ISD::BSWAP;
2124       break;
2125     case Intrinsic::bitreverse:
2126       ISD = ISD::BITREVERSE;
2127       break;
2128     }
2129 
2130     const TargetLoweringBase *TLI = getTLI();
2131     std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);
2132 
2133     if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
2134       if (IID == Intrinsic::fabs && LT.second.isFloatingPoint() &&
2135           TLI->isFAbsFree(LT.second)) {
2136         return 0;
2137       }
2138 
2139       // The operation is legal. Assume it costs 1.
2140       // If the type is split to multiple registers, assume that there is some
2141       // overhead to this.
2142       // TODO: Once we have extract/insert subvector cost we need to use them.
2143       if (LT.first > 1)
2144         return (LT.first * 2);
2145       else
2146         return (LT.first * 1);
2147     } else if (!TLI->isOperationExpand(ISD, LT.second)) {
2148       // If the operation is custom lowered then assume
2149       // that the code is twice as expensive.
2150       return (LT.first * 2);
2151     }
2152 
2153     // If we can't lower fmuladd into an FMA estimate the cost as a floating
2154     // point mul followed by an add.
2155     if (IID == Intrinsic::fmuladd)
2156       return thisT()->getArithmeticInstrCost(BinaryOperator::FMul, RetTy,
2157                                              CostKind) +
2158              thisT()->getArithmeticInstrCost(BinaryOperator::FAdd, RetTy,
2159                                              CostKind);
2160     if (IID == Intrinsic::experimental_constrained_fmuladd) {
2161       IntrinsicCostAttributes FMulAttrs(
2162         Intrinsic::experimental_constrained_fmul, RetTy, Tys);
2163       IntrinsicCostAttributes FAddAttrs(
2164         Intrinsic::experimental_constrained_fadd, RetTy, Tys);
2165       return thisT()->getIntrinsicInstrCost(FMulAttrs, CostKind) +
2166              thisT()->getIntrinsicInstrCost(FAddAttrs, CostKind);
2167     }
2168 
2169     // Else, assume that we need to scalarize this intrinsic. For math builtins
2170     // this will emit a costly libcall, adding call overhead and spills. Make it
2171     // very expensive.
2172     if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
2173       // Scalable vectors cannot be scalarized, so return Invalid.
2174       if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) {
2175             return isa<ScalableVectorType>(Ty);
2176           }))
2177         return InstructionCost::getInvalid();
2178 
2179       InstructionCost ScalarizationCost =
2180           SkipScalarizationCost
2181               ? ScalarizationCostPassed
2182               : getScalarizationOverhead(RetVTy, /*Insert*/ true,
2183                                          /*Extract*/ false, CostKind);
2184 
2185       unsigned ScalarCalls = cast<FixedVectorType>(RetVTy)->getNumElements();
2186       SmallVector<Type *, 4> ScalarTys;
2187       for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
2188         Type *Ty = Tys[i];
2189         if (Ty->isVectorTy())
2190           Ty = Ty->getScalarType();
2191         ScalarTys.push_back(Ty);
2192       }
2193       IntrinsicCostAttributes Attrs(IID, RetTy->getScalarType(), ScalarTys, FMF);
2194       InstructionCost ScalarCost =
2195           thisT()->getIntrinsicInstrCost(Attrs, CostKind);
2196       for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
2197         if (auto *VTy = dyn_cast<VectorType>(Tys[i])) {
2198           if (!ICA.skipScalarizationCost())
2199             ScalarizationCost += getScalarizationOverhead(
2200                 VTy, /*Insert*/ false, /*Extract*/ true, CostKind);
2201           ScalarCalls = std::max(ScalarCalls,
2202                                  cast<FixedVectorType>(VTy)->getNumElements());
2203         }
2204       }
2205       return ScalarCalls * ScalarCost + ScalarizationCost;
2206     }
2207 
2208     // This is going to be turned into a library call, make it expensive.
2209     return SingleCallCost;
2210   }
2211 
2212   /// Compute a cost of the given call instruction.
2213   ///
2214   /// Compute the cost of calling function F with return type RetTy and
  /// argument types Tys. F might be nullptr; in this case the cost of an
  /// arbitrary call with the specified signature will be returned.
  /// This is used, for instance, when we estimate the cost of calling a
  /// vector counterpart of the given function.
2219   /// \param F Called function, might be nullptr.
  /// \param RetTy Return value type.
2221   /// \param Tys Argument types.
2222   /// \returns The cost of Call instruction.
2223   InstructionCost getCallInstrCost(Function *F, Type *RetTy,
2224                                    ArrayRef<Type *> Tys,
2225                                    TTI::TargetCostKind CostKind) {
2226     return 10;
2227   }
2228 
2229   unsigned getNumberOfParts(Type *Tp) {
2230     std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
2231     return LT.first.isValid() ? *LT.first.getValue() : 0;
2232   }
2233 
2234   InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *,
2235                                             const SCEV *) {
2236     return 0;
2237   }
2238 
2239   /// Try to calculate arithmetic and shuffle op costs for reduction intrinsics.
  /// We assume that reduction operations are performed in the following way:
2241   ///
2242   /// %val1 = shufflevector<n x t> %val, <n x t> %undef,
2243   /// <n x i32> <i32 n/2, i32 n/2 + 1, ..., i32 n, i32 undef, ..., i32 undef>
2244   ///            \----------------v-------------/  \----------v------------/
2245   ///                            n/2 elements               n/2 elements
  /// %red1 = op <n x t> %val, <n x t> %val1
2247   /// After this operation we have a vector %red1 where only the first n/2
2248   /// elements are meaningful, the second n/2 elements are undefined and can be
2249   /// dropped. All other operations are actually working with the vector of
2250   /// length n/2, not n, though the real vector length is still n.
2251   /// %val2 = shufflevector<n x t> %red1, <n x t> %undef,
2252   /// <n x i32> <i32 n/4, i32 n/4 + 1, ..., i32 n/2, i32 undef, ..., i32 undef>
2253   ///            \----------------v-------------/  \----------v------------/
2254   ///                            n/4 elements               3*n/4 elements
  /// %red2 = op <n x t> %red1, <n x t> %val2  - working with the vector of
2256   /// length n/2, the resulting vector has length n/4 etc.
2257   ///
2258   /// The cost model should take into account that the actual length of the
2259   /// vector is reduced on each iteration.
2260   InstructionCost getTreeReductionCost(unsigned Opcode, VectorType *Ty,
2261                                        TTI::TargetCostKind CostKind) {
2262     // Targets must implement a default value for the scalable case, since
2263     // we don't know how many lanes the vector has.
2264     if (isa<ScalableVectorType>(Ty))
2265       return InstructionCost::getInvalid();
2266 
2267     Type *ScalarTy = Ty->getElementType();
2268     unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
2269     if ((Opcode == Instruction::Or || Opcode == Instruction::And) &&
2270         ScalarTy == IntegerType::getInt1Ty(Ty->getContext()) &&
2271         NumVecElts >= 2) {
2272       // Or reduction for i1 is represented as:
2273       // %val = bitcast <ReduxWidth x i1> to iReduxWidth
2274       // %res = cmp ne iReduxWidth %val, 0
2275       // And reduction for i1 is represented as:
2276       // %val = bitcast <ReduxWidth x i1> to iReduxWidth
      // %res = cmp eq iReduxWidth %val, -1 (all ones)
2278       Type *ValTy = IntegerType::get(Ty->getContext(), NumVecElts);
2279       return thisT()->getCastInstrCost(Instruction::BitCast, ValTy, Ty,
2280                                        TTI::CastContextHint::None, CostKind) +
2281              thisT()->getCmpSelInstrCost(Instruction::ICmp, ValTy,
2282                                          CmpInst::makeCmpResultType(ValTy),
2283                                          CmpInst::BAD_ICMP_PREDICATE, CostKind);
2284     }
2285     unsigned NumReduxLevels = Log2_32(NumVecElts);
2286     InstructionCost ArithCost = 0;
2287     InstructionCost ShuffleCost = 0;
2288     std::pair<InstructionCost, MVT> LT = thisT()->getTypeLegalizationCost(Ty);
2289     unsigned LongVectorCount = 0;
2290     unsigned MVTLen =
2291         LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
2292     while (NumVecElts > MVTLen) {
2293       NumVecElts /= 2;
2294       VectorType *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
2295       ShuffleCost +=
2296           thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt,
2297                                   CostKind, NumVecElts, SubTy);
2298       ArithCost += thisT()->getArithmeticInstrCost(Opcode, SubTy, CostKind);
2299       Ty = SubTy;
2300       ++LongVectorCount;
2301     }
2302 
2303     NumReduxLevels -= LongVectorCount;
2304 
2305     // The minimal length of the vector is limited by the real length of vector
2306     // operations performed on the current platform. That's why several final
2307     // reduction operations are performed on the vectors with the same
2308     // architecture-dependent length.
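    //
    // E.g. for a <16 x i32> add reduction on a target whose widest legal
    // vector type has 4 lanes, the loop above pays for the 16->8->4 halving
    // steps, and the code below adds log2(4) = 2 more shuffle+op levels on
    // the legal <4 x i32>, plus a final extractelement.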
2309 
2310     // By default reductions need one shuffle per reduction level.
2311     ShuffleCost +=
2312         NumReduxLevels * thisT()->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty,
2313                                                  std::nullopt, CostKind, 0, Ty);
2314     ArithCost +=
2315         NumReduxLevels * thisT()->getArithmeticInstrCost(Opcode, Ty, CostKind);
2316     return ShuffleCost + ArithCost +
2317            thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty,
2318                                        CostKind, 0, nullptr, nullptr);
2319   }
2320 
2321   /// Try to calculate the cost of performing strict (in-order) reductions,
2322   /// which involves doing a sequence of floating point additions in lane
2323   /// order, starting with an initial value. For example, consider a scalar
2324   /// initial value 'InitVal' of type float and a vector of type <4 x float>:
2325   ///
2326   ///   Vector = <float %v0, float %v1, float %v2, float %v3>
2327   ///
2328   ///   %add1 = %InitVal + %v0
2329   ///   %add2 = %add1 + %v1
2330   ///   %add3 = %add2 + %v2
2331   ///   %add4 = %add3 + %v3
2332   ///
2333   /// As a simple estimate we can say the cost of such a reduction is 4 times
2334   /// the cost of a scalar FP addition. We can only estimate the costs for
2335   /// fixed-width vectors here because for scalable vectors we do not know the
2336   /// runtime number of operations.
2337   InstructionCost getOrderedReductionCost(unsigned Opcode, VectorType *Ty,
2338                                           TTI::TargetCostKind CostKind) {
2339     // Targets must implement a default value for the scalable case, since
2340     // we don't know how many lanes the vector has.
2341     if (isa<ScalableVectorType>(Ty))
2342       return InstructionCost::getInvalid();
2343 
2344     auto *VTy = cast<FixedVectorType>(Ty);
2345     InstructionCost ExtractCost = getScalarizationOverhead(
2346         VTy, /*Insert=*/false, /*Extract=*/true, CostKind);
2347     InstructionCost ArithCost = thisT()->getArithmeticInstrCost(
2348         Opcode, VTy->getElementType(), CostKind);
2349     ArithCost *= VTy->getNumElements();
2350 
2351     return ExtractCost + ArithCost;
2352   }
2353 
2354   InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
2355                                              std::optional<FastMathFlags> FMF,
2356                                              TTI::TargetCostKind CostKind) {
2357     assert(Ty && "Unknown reduction vector type");
2358     if (TTI::requiresOrderedReduction(FMF))
2359       return getOrderedReductionCost(Opcode, Ty, CostKind);
2360     return getTreeReductionCost(Opcode, Ty, CostKind);
2361   }
2362 
2363   /// Try to calculate op costs for min/max reduction operations.
2365   InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
2366                                          FastMathFlags FMF,
2367                                          TTI::TargetCostKind CostKind) {
2368     // Targets must implement a default value for the scalable case, since
2369     // we don't know how many lanes the vector has.
2370     if (isa<ScalableVectorType>(Ty))
2371       return InstructionCost::getInvalid();
2372 
2373     Type *ScalarTy = Ty->getElementType();
2374     unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
2375     unsigned NumReduxLevels = Log2_32(NumVecElts);
2376     InstructionCost MinMaxCost = 0;
2377     InstructionCost ShuffleCost = 0;
2378     std::pair<InstructionCost, MVT> LT = thisT()->getTypeLegalizationCost(Ty);
2379     unsigned LongVectorCount = 0;
2380     unsigned MVTLen =
2381         LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
2382     while (NumVecElts > MVTLen) {
2383       NumVecElts /= 2;
2384       auto *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
2385 
2386       ShuffleCost +=
2387           thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, std::nullopt,
2388                                   CostKind, NumVecElts, SubTy);
2389 
2390       IntrinsicCostAttributes Attrs(IID, SubTy, {SubTy, SubTy}, FMF);
2391       MinMaxCost += getIntrinsicInstrCost(Attrs, CostKind);
2392       Ty = SubTy;
2393       ++LongVectorCount;
2394     }
2395 
2396     NumReduxLevels -= LongVectorCount;
2397 
2398     // The minimal length of the vector is limited by the real length of vector
2399     // operations performed on the current platform. That's why several final
    // reduction operations are performed on the vectors with the same
2401     // architecture-dependent length.
2402     ShuffleCost +=
2403         NumReduxLevels * thisT()->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty,
2404                                                  std::nullopt, CostKind, 0, Ty);
2405     IntrinsicCostAttributes Attrs(IID, Ty, {Ty, Ty}, FMF);
2406     MinMaxCost += NumReduxLevels * getIntrinsicInstrCost(Attrs, CostKind);
2407     // The last min/max should be in vector registers and we counted it above.
2408     // So just need a single extractelement.
2409     return ShuffleCost + MinMaxCost +
2410            thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty,
2411                                        CostKind, 0, nullptr, nullptr);
2412   }
2413 
2414   InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned,
2415                                            Type *ResTy, VectorType *Ty,
2416                                            FastMathFlags FMF,
2417                                            TTI::TargetCostKind CostKind) {
2418     // Without any native support, this is equivalent to the cost of
2419     // vecreduce.opcode(ext(Ty A)).
2420     VectorType *ExtTy = VectorType::get(ResTy, Ty);
2421     InstructionCost RedCost =
2422         thisT()->getArithmeticReductionCost(Opcode, ExtTy, FMF, CostKind);
2423     InstructionCost ExtCost = thisT()->getCastInstrCost(
2424         IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty,
2425         TTI::CastContextHint::None, CostKind);
2426 
2427     return RedCost + ExtCost;
2428   }
2429 
2430   InstructionCost getMulAccReductionCost(bool IsUnsigned, Type *ResTy,
2431                                          VectorType *Ty,
2432                                          TTI::TargetCostKind CostKind) {
2433     // Without any native support, this is equivalent to the cost of
2434     // vecreduce.add(mul(ext(Ty A), ext(Ty B))) or
2435     // vecreduce.add(mul(A, B)).
2436     VectorType *ExtTy = VectorType::get(ResTy, Ty);
2437     InstructionCost RedCost = thisT()->getArithmeticReductionCost(
2438         Instruction::Add, ExtTy, std::nullopt, CostKind);
2439     InstructionCost ExtCost = thisT()->getCastInstrCost(
2440         IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty,
2441         TTI::CastContextHint::None, CostKind);
2442 
2443     InstructionCost MulCost =
2444         thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
2445 
2446     return RedCost + MulCost + 2 * ExtCost;
2447   }
2448 
2449   InstructionCost getVectorSplitCost() { return 1; }
2450 
2451   /// @}
2452 };
2453 
2454 /// Concrete BasicTTIImpl that can be used if no further customization
2455 /// is needed.
2456 class BasicTTIImpl : public BasicTTIImplBase<BasicTTIImpl> {
2457   using BaseT = BasicTTIImplBase<BasicTTIImpl>;
2458 
2459   friend class BasicTTIImplBase<BasicTTIImpl>;
2460 
2461   const TargetSubtargetInfo *ST;
2462   const TargetLoweringBase *TLI;
2463 
2464   const TargetSubtargetInfo *getST() const { return ST; }
2465   const TargetLoweringBase *getTLI() const { return TLI; }
2466 
2467 public:
2468   explicit BasicTTIImpl(const TargetMachine *TM, const Function &F);
2469 };
2470 
2471 } // end namespace llvm
2472 
2473 #endif // LLVM_CODEGEN_BASICTTIIMPL_H
2474