//===- BasicTTIImpl.h -------------------------------------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // /// \file /// This file provides a helper that implements much of the TTI interface in /// terms of the target-independent code generator and TargetLowering /// interfaces. // //===----------------------------------------------------------------------===// #ifndef LLVM_CODEGEN_BASICTTIIMPL_H #define LLVM_CODEGEN_BASICTTIIMPL_H #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/BitVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/TargetTransformInfoImpl.h" #include "llvm/CodeGen/ISDOpcodes.h" #include "llvm/CodeGen/TargetLowering.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/Operator.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MachineValueType.h" #include "llvm/Support/MathExtras.h" #include "llvm/Target/TargetMachine.h" #include #include #include #include #include namespace llvm { class Function; class GlobalValue; class LLVMContext; class ScalarEvolution; class SCEV; class TargetMachine; extern cl::opt PartialUnrollingThreshold; /// Base class which can be used to help build a TTI implementation. /// /// This class provides as much implementation of the TTI interface as is /// possible using the target independent parts of the code generator. /// /// In order to subclass it, your class must implement a getST() method to /// return the subtarget, and a getTLI() method to return the target lowering. /// We need these methods implemented in the derived class so that this class /// doesn't have to duplicate storage for them. template class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { private: using BaseT = TargetTransformInfoImplCRTPBase; using TTI = TargetTransformInfo; /// Helper function to access this as a T. T *thisT() { return static_cast(this); } /// Estimate a cost of Broadcast as an extract and sequence of insert /// operations. InstructionCost getBroadcastShuffleOverhead(FixedVectorType *VTy) { InstructionCost Cost = 0; // Broadcast cost is equal to the cost of extracting the zero'th element // plus the cost of inserting it into every element of the result vector. Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, 0); for (int i = 0, e = VTy->getNumElements(); i < e; ++i) { Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, i); } return Cost; } /// Estimate a cost of shuffle as a sequence of extract and insert /// operations. InstructionCost getPermuteShuffleOverhead(FixedVectorType *VTy) { InstructionCost Cost = 0; // Shuffle cost is equal to the cost of extracting element from its argument // plus the cost of inserting them onto the result vector. // e.g. <4 x float> has a mask of <0,5,2,7> i.e we need to extract from // index 0 of first vector, index 1 of second vector,index 2 of first // vector and finally index 3 of second vector and insert them at index // <0,1,2,3> of result vector. for (int i = 0, e = VTy->getNumElements(); i < e; ++i) { Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, i); Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, i); } return Cost; } /// Estimate a cost of subvector extraction as a sequence of extract and /// insert operations. InstructionCost getExtractSubvectorOverhead(VectorType *VTy, int Index, FixedVectorType *SubVTy) { assert(VTy && SubVTy && "Can only extract subvectors from vectors"); int NumSubElts = SubVTy->getNumElements(); assert((!isa(VTy) || (Index + NumSubElts) <= (int)cast(VTy)->getNumElements()) && "SK_ExtractSubvector index out of range"); InstructionCost Cost = 0; // Subvector extraction cost is equal to the cost of extracting element from // the source type plus the cost of inserting them into the result vector // type. for (int i = 0; i != NumSubElts; ++i) { Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, i + Index); Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, SubVTy, i); } return Cost; } /// Estimate a cost of subvector insertion as a sequence of extract and /// insert operations. InstructionCost getInsertSubvectorOverhead(VectorType *VTy, int Index, FixedVectorType *SubVTy) { assert(VTy && SubVTy && "Can only insert subvectors into vectors"); int NumSubElts = SubVTy->getNumElements(); assert((!isa(VTy) || (Index + NumSubElts) <= (int)cast(VTy)->getNumElements()) && "SK_InsertSubvector index out of range"); InstructionCost Cost = 0; // Subvector insertion cost is equal to the cost of extracting element from // the source type plus the cost of inserting them into the result vector // type. for (int i = 0; i != NumSubElts; ++i) { Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVTy, i); Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, i + Index); } return Cost; } /// Local query method delegates up to T which *must* implement this! const TargetSubtargetInfo *getST() const { return static_cast(this)->getST(); } /// Local query method delegates up to T which *must* implement this! const TargetLoweringBase *getTLI() const { return static_cast(this)->getTLI(); } static ISD::MemIndexedMode getISDIndexedMode(TTI::MemIndexedMode M) { switch (M) { case TTI::MIM_Unindexed: return ISD::UNINDEXED; case TTI::MIM_PreInc: return ISD::PRE_INC; case TTI::MIM_PreDec: return ISD::PRE_DEC; case TTI::MIM_PostInc: return ISD::POST_INC; case TTI::MIM_PostDec: return ISD::POST_DEC; } llvm_unreachable("Unexpected MemIndexedMode"); } InstructionCost getCommonMaskedMemoryOpCost(unsigned Opcode, Type *DataTy, Align Alignment, bool VariableMask, bool IsGatherScatter, TTI::TargetCostKind CostKind) { // We cannot scalarize scalable vectors, so return Invalid. if (isa(DataTy)) return InstructionCost::getInvalid(); auto *VT = cast(DataTy); // Assume the target does not have support for gather/scatter operations // and provide a rough estimate. // // First, compute the cost of the individual memory operations. InstructionCost AddrExtractCost = IsGatherScatter ? getVectorInstrCost(Instruction::ExtractElement, FixedVectorType::get( PointerType::get(VT->getElementType(), 0), VT->getNumElements()), -1) : 0; InstructionCost LoadCost = VT->getNumElements() * (AddrExtractCost + getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind)); // Next, compute the cost of packing the result in a vector. InstructionCost PackingCost = getScalarizationOverhead( VT, Opcode != Instruction::Store, Opcode == Instruction::Store); InstructionCost ConditionalCost = 0; if (VariableMask) { // Compute the cost of conditionally executing the memory operations with // variable masks. This includes extracting the individual conditions, a // branches and PHIs to combine the results. // NOTE: Estimating the cost of conditionally executing the memory // operations accurately is quite difficult and the current solution // provides a very rough estimate only. ConditionalCost = VT->getNumElements() * (getVectorInstrCost( Instruction::ExtractElement, FixedVectorType::get(Type::getInt1Ty(DataTy->getContext()), VT->getNumElements()), -1) + getCFInstrCost(Instruction::Br, CostKind) + getCFInstrCost(Instruction::PHI, CostKind)); } return LoadCost + PackingCost + ConditionalCost; } protected: explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL) : BaseT(DL) {} virtual ~BasicTTIImplBase() = default; using TargetTransformInfoImplBase::DL; public: /// \name Scalar TTI Implementations /// @{ bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth, unsigned AddressSpace, Align Alignment, bool *Fast) const { EVT E = EVT::getIntegerVT(Context, BitWidth); return getTLI()->allowsMisalignedMemoryAccesses( E, AddressSpace, Alignment, MachineMemOperand::MONone, Fast); } bool hasBranchDivergence() { return false; } bool useGPUDivergenceAnalysis() { return false; } bool isSourceOfDivergence(const Value *V) { return false; } bool isAlwaysUniform(const Value *V) { return false; } unsigned getFlatAddressSpace() { // Return an invalid address space. return -1; } bool collectFlatAddressOperands(SmallVectorImpl &OpIndexes, Intrinsic::ID IID) const { return false; } bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const { return getTLI()->getTargetMachine().isNoopAddrSpaceCast(FromAS, ToAS); } unsigned getAssumedAddrSpace(const Value *V) const { return getTLI()->getTargetMachine().getAssumedAddrSpace(V); } std::pair getPredicatedAddrSpace(const Value *V) const { return getTLI()->getTargetMachine().getPredicatedAddrSpace(V); } Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV, Value *NewV) const { return nullptr; } bool isLegalAddImmediate(int64_t imm) { return getTLI()->isLegalAddImmediate(imm); } bool isLegalICmpImmediate(int64_t imm) { return getTLI()->isLegalICmpImmediate(imm); } bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I = nullptr) { TargetLoweringBase::AddrMode AM; AM.BaseGV = BaseGV; AM.BaseOffs = BaseOffset; AM.HasBaseReg = HasBaseReg; AM.Scale = Scale; return getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace, I); } unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, Type *ScalarValTy) const { auto &&IsSupportedByTarget = [this, ScalarMemTy, ScalarValTy](unsigned VF) { auto *SrcTy = FixedVectorType::get(ScalarMemTy, VF / 2); EVT VT = getTLI()->getValueType(DL, SrcTy); if (getTLI()->isOperationLegal(ISD::STORE, VT) || getTLI()->isOperationCustom(ISD::STORE, VT)) return true; EVT ValVT = getTLI()->getValueType(DL, FixedVectorType::get(ScalarValTy, VF / 2)); EVT LegalizedVT = getTLI()->getTypeToTransformTo(ScalarMemTy->getContext(), VT); return getTLI()->isTruncStoreLegal(LegalizedVT, ValVT); }; while (VF > 2 && IsSupportedByTarget(VF)) VF /= 2; return VF; } bool isIndexedLoadLegal(TTI::MemIndexedMode M, Type *Ty, const DataLayout &DL) const { EVT VT = getTLI()->getValueType(DL, Ty); return getTLI()->isIndexedLoadLegal(getISDIndexedMode(M), VT); } bool isIndexedStoreLegal(TTI::MemIndexedMode M, Type *Ty, const DataLayout &DL) const { EVT VT = getTLI()->getValueType(DL, Ty); return getTLI()->isIndexedStoreLegal(getISDIndexedMode(M), VT); } bool isLSRCostLess(TTI::LSRCost C1, TTI::LSRCost C2) { return TargetTransformInfoImplBase::isLSRCostLess(C1, C2); } bool isNumRegsMajorCostOfLSR() { return TargetTransformInfoImplBase::isNumRegsMajorCostOfLSR(); } bool isProfitableLSRChainElement(Instruction *I) { return TargetTransformInfoImplBase::isProfitableLSRChainElement(I); } InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) { TargetLoweringBase::AddrMode AM; AM.BaseGV = BaseGV; AM.BaseOffs = BaseOffset; AM.HasBaseReg = HasBaseReg; AM.Scale = Scale; return getTLI()->getScalingFactorCost(DL, AM, Ty, AddrSpace); } bool isTruncateFree(Type *Ty1, Type *Ty2) { return getTLI()->isTruncateFree(Ty1, Ty2); } bool isProfitableToHoist(Instruction *I) { return getTLI()->isProfitableToHoist(I); } bool useAA() const { return getST()->useAA(); } bool isTypeLegal(Type *Ty) { EVT VT = getTLI()->getValueType(DL, Ty); return getTLI()->isTypeLegal(VT); } unsigned getRegUsageForType(Type *Ty) { EVT ETy = getTLI()->getValueType(DL, Ty); return getTLI()->getNumRegisters(Ty->getContext(), ETy); } InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, ArrayRef Operands, TTI::TargetCostKind CostKind) { return BaseT::getGEPCost(PointeeType, Ptr, Operands, CostKind); } unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI, unsigned &JumpTableSize, ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI) { /// Try to find the estimated number of clusters. Note that the number of /// clusters identified in this function could be different from the actual /// numbers found in lowering. This function ignore switches that are /// lowered with a mix of jump table / bit test / BTree. This function was /// initially intended to be used when estimating the cost of switch in /// inline cost heuristic, but it's a generic cost model to be used in other /// places (e.g., in loop unrolling). unsigned N = SI.getNumCases(); const TargetLoweringBase *TLI = getTLI(); const DataLayout &DL = this->getDataLayout(); JumpTableSize = 0; bool IsJTAllowed = TLI->areJTsAllowed(SI.getParent()->getParent()); // Early exit if both a jump table and bit test are not allowed. if (N < 1 || (!IsJTAllowed && DL.getIndexSizeInBits(0u) < N)) return N; APInt MaxCaseVal = SI.case_begin()->getCaseValue()->getValue(); APInt MinCaseVal = MaxCaseVal; for (auto CI : SI.cases()) { const APInt &CaseVal = CI.getCaseValue()->getValue(); if (CaseVal.sgt(MaxCaseVal)) MaxCaseVal = CaseVal; if (CaseVal.slt(MinCaseVal)) MinCaseVal = CaseVal; } // Check if suitable for a bit test if (N <= DL.getIndexSizeInBits(0u)) { SmallPtrSet Dests; for (auto I : SI.cases()) Dests.insert(I.getCaseSuccessor()); if (TLI->isSuitableForBitTests(Dests.size(), N, MinCaseVal, MaxCaseVal, DL)) return 1; } // Check if suitable for a jump table. if (IsJTAllowed) { if (N < 2 || N < TLI->getMinimumJumpTableEntries()) return N; uint64_t Range = (MaxCaseVal - MinCaseVal) .getLimitedValue(std::numeric_limits::max() - 1) + 1; // Check whether a range of clusters is dense enough for a jump table if (TLI->isSuitableForJumpTable(&SI, N, Range, PSI, BFI)) { JumpTableSize = Range; return 1; } } return N; } bool shouldBuildLookupTables() { const TargetLoweringBase *TLI = getTLI(); return TLI->isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) || TLI->isOperationLegalOrCustom(ISD::BRIND, MVT::Other); } bool shouldBuildRelLookupTables() const { const TargetMachine &TM = getTLI()->getTargetMachine(); // If non-PIC mode, do not generate a relative lookup table. if (!TM.isPositionIndependent()) return false; /// Relative lookup table entries consist of 32-bit offsets. /// Do not generate relative lookup tables for large code models /// in 64-bit achitectures where 32-bit offsets might not be enough. if (TM.getCodeModel() == CodeModel::Medium || TM.getCodeModel() == CodeModel::Large) return false; Triple TargetTriple = TM.getTargetTriple(); if (!TargetTriple.isArch64Bit()) return false; // TODO: Triggers issues on aarch64 on darwin, so temporarily disable it // there. if (TargetTriple.getArch() == Triple::aarch64 && TargetTriple.isOSDarwin()) return false; return true; } bool haveFastSqrt(Type *Ty) { const TargetLoweringBase *TLI = getTLI(); EVT VT = TLI->getValueType(DL, Ty); return TLI->isTypeLegal(VT) && TLI->isOperationLegalOrCustom(ISD::FSQRT, VT); } bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) { return true; } InstructionCost getFPOpCost(Type *Ty) { // Check whether FADD is available, as a proxy for floating-point in // general. const TargetLoweringBase *TLI = getTLI(); EVT VT = TLI->getValueType(DL, Ty); if (TLI->isOperationLegalOrCustomOrPromote(ISD::FADD, VT)) return TargetTransformInfo::TCC_Basic; return TargetTransformInfo::TCC_Expensive; } unsigned getInliningThresholdMultiplier() { return 1; } unsigned adjustInliningThreshold(const CallBase *CB) { return 0; } int getInlinerVectorBonusPercent() { return 150; } void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) { // This unrolling functionality is target independent, but to provide some // motivation for its intended use, for x86: // According to the Intel 64 and IA-32 Architectures Optimization Reference // Manual, Intel Core models and later have a loop stream detector (and // associated uop queue) that can benefit from partial unrolling. // The relevant requirements are: // - The loop must have no more than 4 (8 for Nehalem and later) branches // taken, and none of them may be calls. // - The loop can have no more than 18 (28 for Nehalem and later) uops. // According to the Software Optimization Guide for AMD Family 15h // Processors, models 30h-4fh (Steamroller and later) have a loop predictor // and loop buffer which can benefit from partial unrolling. // The relevant requirements are: // - The loop must have fewer than 16 branches // - The loop must have less than 40 uops in all executed loop branches // The number of taken branches in a loop is hard to estimate here, and // benchmarking has revealed that it is better not to be conservative when // estimating the branch count. As a result, we'll ignore the branch limits // until someone finds a case where it matters in practice. unsigned MaxOps; const TargetSubtargetInfo *ST = getST(); if (PartialUnrollingThreshold.getNumOccurrences() > 0) MaxOps = PartialUnrollingThreshold; else if (ST->getSchedModel().LoopMicroOpBufferSize > 0) MaxOps = ST->getSchedModel().LoopMicroOpBufferSize; else return; // Scan the loop: don't unroll loops with calls. for (BasicBlock *BB : L->blocks()) { for (Instruction &I : *BB) { if (isa(I) || isa(I)) { if (const Function *F = cast(I).getCalledFunction()) { if (!thisT()->isLoweredToCall(F)) continue; } if (ORE) { ORE->emit([&]() { return OptimizationRemark("TTI", "DontUnroll", L->getStartLoc(), L->getHeader()) << "advising against unrolling the loop because it " "contains a " << ore::NV("Call", &I); }); } return; } } } // Enable runtime and partial unrolling up to the specified size. // Enable using trip count upper bound to unroll loops. UP.Partial = UP.Runtime = UP.UpperBound = true; UP.PartialThreshold = MaxOps; // Avoid unrolling when optimizing for size. UP.OptSizeThreshold = 0; UP.PartialOptSizeThreshold = 0; // Set number of instructions optimized when "back edge" // becomes "fall through" to default value of 2. UP.BEInsns = 2; } void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) { PP.PeelCount = 0; PP.AllowPeeling = true; PP.AllowLoopNestsPeeling = false; PP.PeelProfiledIterations = true; } bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *LibInfo, HardwareLoopInfo &HWLoopInfo) { return BaseT::isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo); } bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *TLI, DominatorTree *DT, LoopVectorizationLegality *LVL) { return BaseT::preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LVL); } PredicationStyle emitGetActiveLaneMask() { return BaseT::emitGetActiveLaneMask(); } Optional instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) { return BaseT::instCombineIntrinsic(IC, II); } Optional simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedMask, KnownBits &Known, bool &KnownBitsComputed) { return BaseT::simplifyDemandedUseBitsIntrinsic(IC, II, DemandedMask, Known, KnownBitsComputed); } Optional simplifyDemandedVectorEltsIntrinsic( InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function SimplifyAndSetOp) { return BaseT::simplifyDemandedVectorEltsIntrinsic( IC, II, DemandedElts, UndefElts, UndefElts2, UndefElts3, SimplifyAndSetOp); } InstructionCost getInstructionLatency(const Instruction *I) { if (isa(I)) return getST()->getSchedModel().DefaultLoadLatency; return BaseT::getInstructionLatency(I); } virtual Optional getCacheSize(TargetTransformInfo::CacheLevel Level) const { return Optional( getST()->getCacheSize(static_cast(Level))); } virtual Optional getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const { Optional TargetResult = getST()->getCacheAssociativity(static_cast(Level)); if (TargetResult) return TargetResult; return BaseT::getCacheAssociativity(Level); } virtual unsigned getCacheLineSize() const { return getST()->getCacheLineSize(); } virtual unsigned getPrefetchDistance() const { return getST()->getPrefetchDistance(); } virtual unsigned getMinPrefetchStride(unsigned NumMemAccesses, unsigned NumStridedMemAccesses, unsigned NumPrefetches, bool HasCall) const { return getST()->getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses, NumPrefetches, HasCall); } virtual unsigned getMaxPrefetchIterationsAhead() const { return getST()->getMaxPrefetchIterationsAhead(); } virtual bool enableWritePrefetching() const { return getST()->enableWritePrefetching(); } /// @} /// \name Vector TTI Implementations /// @{ TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const { return TypeSize::getFixed(32); } Optional getMaxVScale() const { return None; } Optional getVScaleForTuning() const { return None; } /// Estimate the overhead of scalarizing an instruction. Insert and Extract /// are set if the demanded result elements need to be inserted and/or /// extracted from vectors. InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract) { /// FIXME: a bitfield is not a reasonable abstraction for talking about /// which elements are needed from a scalable vector if (isa(InTy)) return InstructionCost::getInvalid(); auto *Ty = cast(InTy); assert(DemandedElts.getBitWidth() == Ty->getNumElements() && "Vector size mismatch"); InstructionCost Cost = 0; for (int i = 0, e = Ty->getNumElements(); i < e; ++i) { if (!DemandedElts[i]) continue; if (Insert) Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, Ty, i); if (Extract) Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, i); } return Cost; } /// Helper wrapper for the DemandedElts variant of getScalarizationOverhead. InstructionCost getScalarizationOverhead(VectorType *InTy, bool Insert, bool Extract) { if (isa(InTy)) return InstructionCost::getInvalid(); auto *Ty = cast(InTy); APInt DemandedElts = APInt::getAllOnes(Ty->getNumElements()); return thisT()->getScalarizationOverhead(Ty, DemandedElts, Insert, Extract); } /// Estimate the overhead of scalarizing an instructions unique /// non-constant operands. The (potentially vector) types to use for each of /// argument are passes via Tys. InstructionCost getOperandsScalarizationOverhead(ArrayRef Args, ArrayRef Tys) { assert(Args.size() == Tys.size() && "Expected matching Args and Tys"); InstructionCost Cost = 0; SmallPtrSet UniqueOperands; for (int I = 0, E = Args.size(); I != E; I++) { // Disregard things like metadata arguments. const Value *A = Args[I]; Type *Ty = Tys[I]; if (!Ty->isIntOrIntVectorTy() && !Ty->isFPOrFPVectorTy() && !Ty->isPtrOrPtrVectorTy()) continue; if (!isa(A) && UniqueOperands.insert(A).second) { if (auto *VecTy = dyn_cast(Ty)) Cost += getScalarizationOverhead(VecTy, false, true); } } return Cost; } /// Estimate the overhead of scalarizing the inputs and outputs of an /// instruction, with return type RetTy and arguments Args of type Tys. If /// Args are unknown (empty), then the cost associated with one argument is /// added as a heuristic. InstructionCost getScalarizationOverhead(VectorType *RetTy, ArrayRef Args, ArrayRef Tys) { InstructionCost Cost = getScalarizationOverhead(RetTy, true, false); if (!Args.empty()) Cost += getOperandsScalarizationOverhead(Args, Tys); else // When no information on arguments is provided, we add the cost // associated with one argument as a heuristic. Cost += getScalarizationOverhead(RetTy, false, true); return Cost; } unsigned getMaxInterleaveFactor(unsigned VF) { return 1; } InstructionCost getArithmeticInstrCost( unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, ArrayRef Args = ArrayRef(), const Instruction *CxtI = nullptr) { // Check if any of the operands are vector operands. const TargetLoweringBase *TLI = getTLI(); int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); // TODO: Handle more cost kinds. if (CostKind != TTI::TCK_RecipThroughput) return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info, Opd2Info, Opd1PropInfo, Opd2PropInfo, Args, CxtI); std::pair LT = TLI->getTypeLegalizationCost(DL, Ty); bool IsFloat = Ty->isFPOrFPVectorTy(); // Assume that floating point arithmetic operations cost twice as much as // integer operations. InstructionCost OpCost = (IsFloat ? 2 : 1); if (TLI->isOperationLegalOrPromote(ISD, LT.second)) { // The operation is legal. Assume it costs 1. // TODO: Once we have extract/insert subvector cost we need to use them. return LT.first * OpCost; } if (!TLI->isOperationExpand(ISD, LT.second)) { // If the operation is custom lowered, then assume that the code is twice // as expensive. return LT.first * 2 * OpCost; } // An 'Expand' of URem and SRem is special because it may default // to expanding the operation into a sequence of sub-operations // i.e. X % Y -> X-(X/Y)*Y. if (ISD == ISD::UREM || ISD == ISD::SREM) { bool IsSigned = ISD == ISD::SREM; if (TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, LT.second) || TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIV : ISD::UDIV, LT.second)) { unsigned DivOpc = IsSigned ? Instruction::SDiv : Instruction::UDiv; InstructionCost DivCost = thisT()->getArithmeticInstrCost( DivOpc, Ty, CostKind, Opd1Info, Opd2Info, Opd1PropInfo, Opd2PropInfo); InstructionCost MulCost = thisT()->getArithmeticInstrCost(Instruction::Mul, Ty, CostKind); InstructionCost SubCost = thisT()->getArithmeticInstrCost(Instruction::Sub, Ty, CostKind); return DivCost + MulCost + SubCost; } } // We cannot scalarize scalable vectors, so return Invalid. if (isa(Ty)) return InstructionCost::getInvalid(); // Else, assume that we need to scalarize this op. // TODO: If one of the types get legalized by splitting, handle this // similarly to what getCastInstrCost() does. if (auto *VTy = dyn_cast(Ty)) { InstructionCost Cost = thisT()->getArithmeticInstrCost( Opcode, VTy->getScalarType(), CostKind, Opd1Info, Opd2Info, Opd1PropInfo, Opd2PropInfo, Args, CxtI); // Return the cost of multiple scalar invocation plus the cost of // inserting and extracting the values. SmallVector Tys(Args.size(), Ty); return getScalarizationOverhead(VTy, Args, Tys) + VTy->getNumElements() * Cost; } // We don't know anything about this scalar instruction. return OpCost; } TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef Mask) const { int Limit = Mask.size() * 2; if (Mask.empty() || // Extra check required by isSingleSourceMaskImpl function (called by // ShuffleVectorInst::isSingleSourceMask). any_of(Mask, [Limit](int I) { return I >= Limit; })) return Kind; switch (Kind) { case TTI::SK_PermuteSingleSrc: if (ShuffleVectorInst::isReverseMask(Mask)) return TTI::SK_Reverse; if (ShuffleVectorInst::isZeroEltSplatMask(Mask)) return TTI::SK_Broadcast; break; case TTI::SK_PermuteTwoSrc: if (ShuffleVectorInst::isSelectMask(Mask)) return TTI::SK_Select; if (ShuffleVectorInst::isTransposeMask(Mask)) return TTI::SK_Transpose; break; case TTI::SK_Select: case TTI::SK_Reverse: case TTI::SK_Broadcast: case TTI::SK_Transpose: case TTI::SK_InsertSubvector: case TTI::SK_ExtractSubvector: case TTI::SK_Splice: break; } return Kind; } InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef Mask, int Index, VectorType *SubTp, ArrayRef Args = None) { switch (improveShuffleKindFromMask(Kind, Mask)) { case TTI::SK_Broadcast: if (auto *FVT = dyn_cast(Tp)) return getBroadcastShuffleOverhead(FVT); return InstructionCost::getInvalid(); case TTI::SK_Select: case TTI::SK_Splice: case TTI::SK_Reverse: case TTI::SK_Transpose: case TTI::SK_PermuteSingleSrc: case TTI::SK_PermuteTwoSrc: if (auto *FVT = dyn_cast(Tp)) return getPermuteShuffleOverhead(FVT); return InstructionCost::getInvalid(); case TTI::SK_ExtractSubvector: return getExtractSubvectorOverhead(Tp, Index, cast(SubTp)); case TTI::SK_InsertSubvector: return getInsertSubvectorOverhead(Tp, Index, cast(SubTp)); } llvm_unreachable("Unknown TTI::ShuffleKind"); } InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I = nullptr) { if (BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I) == 0) return 0; const TargetLoweringBase *TLI = getTLI(); int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); std::pair SrcLT = TLI->getTypeLegalizationCost(DL, Src); std::pair DstLT = TLI->getTypeLegalizationCost(DL, Dst); TypeSize SrcSize = SrcLT.second.getSizeInBits(); TypeSize DstSize = DstLT.second.getSizeInBits(); bool IntOrPtrSrc = Src->isIntegerTy() || Src->isPointerTy(); bool IntOrPtrDst = Dst->isIntegerTy() || Dst->isPointerTy(); switch (Opcode) { default: break; case Instruction::Trunc: // Check for NOOP conversions. if (TLI->isTruncateFree(SrcLT.second, DstLT.second)) return 0; LLVM_FALLTHROUGH; case Instruction::BitCast: // Bitcast between types that are legalized to the same type are free and // assume int to/from ptr of the same size is also free. if (SrcLT.first == DstLT.first && IntOrPtrSrc == IntOrPtrDst && SrcSize == DstSize) return 0; break; case Instruction::FPExt: if (I && getTLI()->isExtFree(I)) return 0; break; case Instruction::ZExt: if (TLI->isZExtFree(SrcLT.second, DstLT.second)) return 0; LLVM_FALLTHROUGH; case Instruction::SExt: if (I && getTLI()->isExtFree(I)) return 0; // If this is a zext/sext of a load, return 0 if the corresponding // extending load exists on target and the result type is legal. if (CCH == TTI::CastContextHint::Normal) { EVT ExtVT = EVT::getEVT(Dst); EVT LoadVT = EVT::getEVT(Src); unsigned LType = ((Opcode == Instruction::ZExt) ? ISD::ZEXTLOAD : ISD::SEXTLOAD); if (DstLT.first == SrcLT.first && TLI->isLoadExtLegal(LType, ExtVT, LoadVT)) return 0; } break; case Instruction::AddrSpaceCast: if (TLI->isFreeAddrSpaceCast(Src->getPointerAddressSpace(), Dst->getPointerAddressSpace())) return 0; break; } auto *SrcVTy = dyn_cast(Src); auto *DstVTy = dyn_cast(Dst); // If the cast is marked as legal (or promote) then assume low cost. if (SrcLT.first == DstLT.first && TLI->isOperationLegalOrPromote(ISD, DstLT.second)) return SrcLT.first; // Handle scalar conversions. if (!SrcVTy && !DstVTy) { // Just check the op cost. If the operation is legal then assume it costs // 1. if (!TLI->isOperationExpand(ISD, DstLT.second)) return 1; // Assume that illegal scalar instruction are expensive. return 4; } // Check vector-to-vector casts. if (DstVTy && SrcVTy) { // If the cast is between same-sized registers, then the check is simple. if (SrcLT.first == DstLT.first && SrcSize == DstSize) { // Assume that Zext is done using AND. if (Opcode == Instruction::ZExt) return SrcLT.first; // Assume that sext is done using SHL and SRA. if (Opcode == Instruction::SExt) return SrcLT.first * 2; // Just check the op cost. If the operation is legal then assume it // costs // 1 and multiply by the type-legalization overhead. if (!TLI->isOperationExpand(ISD, DstLT.second)) return SrcLT.first * 1; } // If we are legalizing by splitting, query the concrete TTI for the cost // of casting the original vector twice. We also need to factor in the // cost of the split itself. Count that as 1, to be consistent with // TLI->getTypeLegalizationCost(). bool SplitSrc = TLI->getTypeAction(Src->getContext(), TLI->getValueType(DL, Src)) == TargetLowering::TypeSplitVector; bool SplitDst = TLI->getTypeAction(Dst->getContext(), TLI->getValueType(DL, Dst)) == TargetLowering::TypeSplitVector; if ((SplitSrc || SplitDst) && SrcVTy->getElementCount().isVector() && DstVTy->getElementCount().isVector()) { Type *SplitDstTy = VectorType::getHalfElementsVectorType(DstVTy); Type *SplitSrcTy = VectorType::getHalfElementsVectorType(SrcVTy); T *TTI = static_cast(this); // If both types need to be split then the split is free. InstructionCost SplitCost = (!SplitSrc || !SplitDst) ? TTI->getVectorSplitCost() : 0; return SplitCost + (2 * TTI->getCastInstrCost(Opcode, SplitDstTy, SplitSrcTy, CCH, CostKind, I)); } // Scalarization cost is Invalid, can't assume any num elements. if (isa(DstVTy)) return InstructionCost::getInvalid(); // In other cases where the source or destination are illegal, assume // the operation will get scalarized. unsigned Num = cast(DstVTy)->getNumElements(); InstructionCost Cost = thisT()->getCastInstrCost( Opcode, Dst->getScalarType(), Src->getScalarType(), CCH, CostKind, I); // Return the cost of multiple scalar invocation plus the cost of // inserting and extracting the values. return getScalarizationOverhead(DstVTy, true, true) + Num * Cost; } // We already handled vector-to-vector and scalar-to-scalar conversions. // This // is where we handle bitcast between vectors and scalars. We need to assume // that the conversion is scalarized in one way or another. if (Opcode == Instruction::BitCast) { // Illegal bitcasts are done by storing and loading from a stack slot. return (SrcVTy ? getScalarizationOverhead(SrcVTy, false, true) : 0) + (DstVTy ? getScalarizationOverhead(DstVTy, true, false) : 0); } llvm_unreachable("Unhandled cast"); } InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index) { return thisT()->getVectorInstrCost(Instruction::ExtractElement, VecTy, Index) + thisT()->getCastInstrCost(Opcode, Dst, VecTy->getElementType(), TTI::CastContextHint::None, TTI::TCK_RecipThroughput); } InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I = nullptr) { return BaseT::getCFInstrCost(Opcode, CostKind, I); } InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I = nullptr) { const TargetLoweringBase *TLI = getTLI(); int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); // TODO: Handle other cost kinds. if (CostKind != TTI::TCK_RecipThroughput) return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I); // Selects on vectors are actually vector selects. if (ISD == ISD::SELECT) { assert(CondTy && "CondTy must exist"); if (CondTy->isVectorTy()) ISD = ISD::VSELECT; } std::pair LT = TLI->getTypeLegalizationCost(DL, ValTy); if (!(ValTy->isVectorTy() && !LT.second.isVector()) && !TLI->isOperationExpand(ISD, LT.second)) { // The operation is legal. Assume it costs 1. Multiply // by the type-legalization overhead. return LT.first * 1; } // Otherwise, assume that the cast is scalarized. // TODO: If one of the types get legalized by splitting, handle this // similarly to what getCastInstrCost() does. if (auto *ValVTy = dyn_cast(ValTy)) { if (isa(ValTy)) return InstructionCost::getInvalid(); unsigned Num = cast(ValVTy)->getNumElements(); if (CondTy) CondTy = CondTy->getScalarType(); InstructionCost Cost = thisT()->getCmpSelInstrCost( Opcode, ValVTy->getScalarType(), CondTy, VecPred, CostKind, I); // Return the cost of multiple scalar invocation plus the cost of // inserting and extracting the values. return getScalarizationOverhead(ValVTy, true, false) + Num * Cost; } // Unknown scalar opcode. return 1; } InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) { std::pair LT = getTLI()->getTypeLegalizationCost(DL, Val->getScalarType()); return LT.first; } InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, int VF, const APInt &DemandedDstElts, TTI::TargetCostKind CostKind) { assert(DemandedDstElts.getBitWidth() == (unsigned)VF * ReplicationFactor && "Unexpected size of DemandedDstElts."); InstructionCost Cost; auto *SrcVT = FixedVectorType::get(EltTy, VF); auto *ReplicatedVT = FixedVectorType::get(EltTy, VF * ReplicationFactor); // The Mask shuffling cost is extract all the elements of the Mask // and insert each of them Factor times into the wide vector: // // E.g. an interleaved group with factor 3: // %mask = icmp ult <8 x i32> %vec1, %vec2 // %interleaved.mask = shufflevector <8 x i1> %mask, <8 x i1> undef, // <24 x i32> <0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7> // The cost is estimated as extract all mask elements from the <8xi1> mask // vector and insert them factor times into the <24xi1> shuffled mask // vector. APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedDstElts, VF); Cost += thisT()->getScalarizationOverhead(SrcVT, DemandedSrcElts, /*Insert*/ false, /*Extract*/ true); Cost += thisT()->getScalarizationOverhead(ReplicatedVT, DemandedDstElts, /*Insert*/ true, /*Extract*/ false); return Cost; } InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, const Instruction *I = nullptr) { assert(!Src->isVoidTy() && "Invalid type"); // Assume types, such as structs, are expensive. if (getTLI()->getValueType(DL, Src, true) == MVT::Other) return 4; std::pair LT = getTLI()->getTypeLegalizationCost(DL, Src); // Assuming that all loads of legal types cost 1. InstructionCost Cost = LT.first; if (CostKind != TTI::TCK_RecipThroughput) return Cost; const DataLayout &DL = this->getDataLayout(); if (Src->isVectorTy() && // In practice it's not currently possible to have a change in lane // length for extending loads or truncating stores so both types should // have the same scalable property. TypeSize::isKnownLT(DL.getTypeStoreSizeInBits(Src), LT.second.getSizeInBits())) { // This is a vector load that legalizes to a larger type than the vector // itself. Unless the corresponding extending load or truncating store is // legal, then this will scalarize. TargetLowering::LegalizeAction LA = TargetLowering::Expand; EVT MemVT = getTLI()->getValueType(DL, Src); if (Opcode == Instruction::Store) LA = getTLI()->getTruncStoreAction(LT.second, MemVT); else LA = getTLI()->getLoadExtAction(ISD::EXTLOAD, LT.second, MemVT); if (LA != TargetLowering::Legal && LA != TargetLowering::Custom) { // This is a vector load/store for some illegal type that is scalarized. // We must account for the cost of building or decomposing the vector. Cost += getScalarizationOverhead(cast(Src), Opcode != Instruction::Store, Opcode == Instruction::Store); } } return Cost; } InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind) { return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, true, false, CostKind); } InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I = nullptr) { return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, VariableMask, true, CostKind); } InstructionCost getInterleavedMemoryOpCost( unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond = false, bool UseMaskForGaps = false) { // We cannot scalarize scalable vectors, so return Invalid. if (isa(VecTy)) return InstructionCost::getInvalid(); auto *VT = cast(VecTy); unsigned NumElts = VT->getNumElements(); assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor"); unsigned NumSubElts = NumElts / Factor; auto *SubVT = FixedVectorType::get(VT->getElementType(), NumSubElts); // Firstly, the cost of load/store operation. InstructionCost Cost; if (UseMaskForCond || UseMaskForGaps) Cost = thisT()->getMaskedMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind); else Cost = thisT()->getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind); // Legalize the vector type, and get the legalized and unlegalized type // sizes. MVT VecTyLT = getTLI()->getTypeLegalizationCost(DL, VecTy).second; unsigned VecTySize = thisT()->getDataLayout().getTypeStoreSize(VecTy); unsigned VecTyLTSize = VecTyLT.getStoreSize(); // Scale the cost of the memory operation by the fraction of legalized // instructions that will actually be used. We shouldn't account for the // cost of dead instructions since they will be removed. // // E.g., An interleaved load of factor 8: // %vec = load <16 x i64>, <16 x i64>* %ptr // %v0 = shufflevector %vec, undef, <0, 8> // // If <16 x i64> is legalized to 8 v2i64 loads, only 2 of the loads will be // used (those corresponding to elements [0:1] and [8:9] of the unlegalized // type). The other loads are unused. // // TODO: Note that legalization can turn masked loads/stores into unmasked // (legalized) loads/stores. This can be reflected in the cost. if (Cost.isValid() && VecTySize > VecTyLTSize) { // The number of loads of a legal type it will take to represent a load // of the unlegalized vector type. unsigned NumLegalInsts = divideCeil(VecTySize, VecTyLTSize); // The number of elements of the unlegalized type that correspond to a // single legal instruction. unsigned NumEltsPerLegalInst = divideCeil(NumElts, NumLegalInsts); // Determine which legal instructions will be used. BitVector UsedInsts(NumLegalInsts, false); for (unsigned Index : Indices) for (unsigned Elt = 0; Elt < NumSubElts; ++Elt) UsedInsts.set((Index + Elt * Factor) / NumEltsPerLegalInst); // Scale the cost of the load by the fraction of legal instructions that // will be used. Cost = divideCeil(UsedInsts.count() * *Cost.getValue(), NumLegalInsts); } // Then plus the cost of interleave operation. assert(Indices.size() <= Factor && "Interleaved memory op has too many members"); const APInt DemandedAllSubElts = APInt::getAllOnes(NumSubElts); const APInt DemandedAllResultElts = APInt::getAllOnes(NumElts); APInt DemandedLoadStoreElts = APInt::getZero(NumElts); for (unsigned Index : Indices) { assert(Index < Factor && "Invalid index for interleaved memory op"); for (unsigned Elm = 0; Elm < NumSubElts; Elm++) DemandedLoadStoreElts.setBit(Index + Elm * Factor); } if (Opcode == Instruction::Load) { // The interleave cost is similar to extract sub vectors' elements // from the wide vector, and insert them into sub vectors. // // E.g. An interleaved load of factor 2 (with one member of index 0): // %vec = load <8 x i32>, <8 x i32>* %ptr // %v0 = shuffle %vec, undef, <0, 2, 4, 6> ; Index 0 // The cost is estimated as extract elements at 0, 2, 4, 6 from the // <8 x i32> vector and insert them into a <4 x i32> vector. InstructionCost InsSubCost = thisT()->getScalarizationOverhead(SubVT, DemandedAllSubElts, /*Insert*/ true, /*Extract*/ false); Cost += Indices.size() * InsSubCost; Cost += thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts, /*Insert*/ false, /*Extract*/ true); } else { // The interleave cost is extract elements from sub vectors, and // insert them into the wide vector. // // E.g. An interleaved store of factor 3 with 2 members at indices 0,1: // (using VF=4): // %v0_v1 = shuffle %v0, %v1, <0,4,undef,1,5,undef,2,6,undef,3,7,undef> // %gaps.mask = // call llvm.masked.store <12 x i32> %v0_v1, <12 x i32>* %ptr, // i32 Align, <12 x i1> %gaps.mask // The cost is estimated as extract all elements (of actual members, // excluding gaps) from both <4 x i32> vectors and insert into the <12 x // i32> vector. InstructionCost ExtSubCost = thisT()->getScalarizationOverhead(SubVT, DemandedAllSubElts, /*Insert*/ false, /*Extract*/ true); Cost += ExtSubCost * Indices.size(); Cost += thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts, /*Insert*/ true, /*Extract*/ false); } if (!UseMaskForCond) return Cost; Type *I8Type = Type::getInt8Ty(VT->getContext()); Cost += thisT()->getReplicationShuffleCost( I8Type, Factor, NumSubElts, UseMaskForGaps ? DemandedLoadStoreElts : DemandedAllResultElts, CostKind); // The Gaps mask is invariant and created outside the loop, therefore the // cost of creating it is not accounted for here. However if we have both // a MaskForGaps and some other mask that guards the execution of the // memory access, we need to account for the cost of And-ing the two masks // inside the loop. if (UseMaskForGaps) { auto *MaskVT = FixedVectorType::get(I8Type, NumElts); Cost += thisT()->getArithmeticInstrCost(BinaryOperator::And, MaskVT, CostKind); } return Cost; } /// Get intrinsic cost based on arguments. InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) { // Check for generically free intrinsics. if (BaseT::getIntrinsicInstrCost(ICA, CostKind) == 0) return 0; // Assume that target intrinsics are cheap. Intrinsic::ID IID = ICA.getID(); if (Function::isTargetIntrinsic(IID)) return TargetTransformInfo::TCC_Basic; if (ICA.isTypeBasedOnly()) return getTypeBasedIntrinsicInstrCost(ICA, CostKind); Type *RetTy = ICA.getReturnType(); ElementCount RetVF = (RetTy->isVectorTy() ? cast(RetTy)->getElementCount() : ElementCount::getFixed(1)); const IntrinsicInst *I = ICA.getInst(); const SmallVectorImpl &Args = ICA.getArgs(); FastMathFlags FMF = ICA.getFlags(); switch (IID) { default: break; case Intrinsic::powi: if (auto *RHSC = dyn_cast(Args[1])) { bool ShouldOptForSize = I->getParent()->getParent()->hasOptSize(); if (getTLI()->isBeneficialToExpandPowI(RHSC->getSExtValue(), ShouldOptForSize)) { // The cost is modeled on the expansion performed by ExpandPowI in // SelectionDAGBuilder. APInt Exponent = RHSC->getValue().abs(); unsigned ActiveBits = Exponent.getActiveBits(); unsigned PopCount = Exponent.countPopulation(); InstructionCost Cost = (ActiveBits + PopCount - 2) * thisT()->getArithmeticInstrCost( Instruction::FMul, RetTy, CostKind); if (RHSC->getSExtValue() < 0) Cost += thisT()->getArithmeticInstrCost(Instruction::FDiv, RetTy, CostKind); return Cost; } } break; case Intrinsic::cttz: // FIXME: If necessary, this should go in target-specific overrides. if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCttz()) return TargetTransformInfo::TCC_Basic; break; case Intrinsic::ctlz: // FIXME: If necessary, this should go in target-specific overrides. if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCtlz()) return TargetTransformInfo::TCC_Basic; break; case Intrinsic::memcpy: return thisT()->getMemcpyCost(ICA.getInst()); case Intrinsic::masked_scatter: { const Value *Mask = Args[3]; bool VarMask = !isa(Mask); Align Alignment = cast(Args[2])->getAlignValue(); return thisT()->getGatherScatterOpCost(Instruction::Store, ICA.getArgTypes()[0], Args[1], VarMask, Alignment, CostKind, I); } case Intrinsic::masked_gather: { const Value *Mask = Args[2]; bool VarMask = !isa(Mask); Align Alignment = cast(Args[1])->getAlignValue(); return thisT()->getGatherScatterOpCost(Instruction::Load, RetTy, Args[0], VarMask, Alignment, CostKind, I); } case Intrinsic::experimental_stepvector: { if (isa(RetTy)) return BaseT::getIntrinsicInstrCost(ICA, CostKind); // The cost of materialising a constant integer vector. return TargetTransformInfo::TCC_Basic; } case Intrinsic::vector_extract: { // FIXME: Handle case where a scalable vector is extracted from a scalable // vector if (isa(RetTy)) return BaseT::getIntrinsicInstrCost(ICA, CostKind); unsigned Index = cast(Args[1])->getZExtValue(); return thisT()->getShuffleCost(TTI::SK_ExtractSubvector, cast(Args[0]->getType()), None, Index, cast(RetTy)); } case Intrinsic::vector_insert: { // FIXME: Handle case where a scalable vector is inserted into a scalable // vector if (isa(Args[1]->getType())) return BaseT::getIntrinsicInstrCost(ICA, CostKind); unsigned Index = cast(Args[2])->getZExtValue(); return thisT()->getShuffleCost( TTI::SK_InsertSubvector, cast(Args[0]->getType()), None, Index, cast(Args[1]->getType())); } case Intrinsic::experimental_vector_reverse: { return thisT()->getShuffleCost(TTI::SK_Reverse, cast(Args[0]->getType()), None, 0, cast(RetTy)); } case Intrinsic::experimental_vector_splice: { unsigned Index = cast(Args[2])->getZExtValue(); return thisT()->getShuffleCost(TTI::SK_Splice, cast(Args[0]->getType()), None, Index, cast(RetTy)); } case Intrinsic::vector_reduce_add: case Intrinsic::vector_reduce_mul: case Intrinsic::vector_reduce_and: case Intrinsic::vector_reduce_or: case Intrinsic::vector_reduce_xor: case Intrinsic::vector_reduce_smax: case Intrinsic::vector_reduce_smin: case Intrinsic::vector_reduce_fmax: case Intrinsic::vector_reduce_fmin: case Intrinsic::vector_reduce_umax: case Intrinsic::vector_reduce_umin: { IntrinsicCostAttributes Attrs(IID, RetTy, Args[0]->getType(), FMF, I, 1); return getTypeBasedIntrinsicInstrCost(Attrs, CostKind); } case Intrinsic::vector_reduce_fadd: case Intrinsic::vector_reduce_fmul: { IntrinsicCostAttributes Attrs( IID, RetTy, {Args[0]->getType(), Args[1]->getType()}, FMF, I, 1); return getTypeBasedIntrinsicInstrCost(Attrs, CostKind); } case Intrinsic::fshl: case Intrinsic::fshr: { const Value *X = Args[0]; const Value *Y = Args[1]; const Value *Z = Args[2]; TTI::OperandValueProperties OpPropsX, OpPropsY, OpPropsZ, OpPropsBW; TTI::OperandValueKind OpKindX = TTI::getOperandInfo(X, OpPropsX); TTI::OperandValueKind OpKindY = TTI::getOperandInfo(Y, OpPropsY); TTI::OperandValueKind OpKindZ = TTI::getOperandInfo(Z, OpPropsZ); TTI::OperandValueKind OpKindBW = TTI::OK_UniformConstantValue; OpPropsBW = isPowerOf2_32(RetTy->getScalarSizeInBits()) ? TTI::OP_PowerOf2 : TTI::OP_None; // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW))) // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW)) InstructionCost Cost = 0; Cost += thisT()->getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind); Cost += thisT()->getArithmeticInstrCost(BinaryOperator::Sub, RetTy, CostKind); Cost += thisT()->getArithmeticInstrCost( BinaryOperator::Shl, RetTy, CostKind, OpKindX, OpKindZ, OpPropsX); Cost += thisT()->getArithmeticInstrCost( BinaryOperator::LShr, RetTy, CostKind, OpKindY, OpKindZ, OpPropsY); // Non-constant shift amounts requires a modulo. if (OpKindZ != TTI::OK_UniformConstantValue && OpKindZ != TTI::OK_NonUniformConstantValue) Cost += thisT()->getArithmeticInstrCost(BinaryOperator::URem, RetTy, CostKind, OpKindZ, OpKindBW, OpPropsZ, OpPropsBW); // For non-rotates (X != Y) we must add shift-by-zero handling costs. if (X != Y) { Type *CondTy = RetTy->getWithNewBitWidth(1); Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, CmpInst::ICMP_EQ, CostKind); Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, CmpInst::ICMP_EQ, CostKind); } return Cost; } case Intrinsic::get_active_lane_mask: { EVT ResVT = getTLI()->getValueType(DL, RetTy, true); EVT ArgType = getTLI()->getValueType(DL, ICA.getArgTypes()[0], true); // If we're not expanding the intrinsic then we assume this is cheap // to implement. if (!getTLI()->shouldExpandGetActiveLaneMask(ResVT, ArgType)) { std::pair LT = getTLI()->getTypeLegalizationCost(DL, RetTy); return LT.first; } // Create the expanded types that will be used to calculate the uadd_sat // operation. Type *ExpRetTy = VectorType::get( ICA.getArgTypes()[0], cast(RetTy)->getElementCount()); IntrinsicCostAttributes Attrs(Intrinsic::uadd_sat, ExpRetTy, {}, FMF); InstructionCost Cost = thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind); Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, ExpRetTy, RetTy, CmpInst::ICMP_ULT, CostKind); return Cost; } } // Assume that we need to scalarize this intrinsic. // Compute the scalarization overhead based on Args for a vector // intrinsic. InstructionCost ScalarizationCost = InstructionCost::getInvalid(); if (RetVF.isVector() && !RetVF.isScalable()) { ScalarizationCost = 0; if (!RetTy->isVoidTy()) ScalarizationCost += getScalarizationOverhead(cast(RetTy), true, false); ScalarizationCost += getOperandsScalarizationOverhead(Args, ICA.getArgTypes()); } IntrinsicCostAttributes Attrs(IID, RetTy, ICA.getArgTypes(), FMF, I, ScalarizationCost); return thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind); } /// Get intrinsic cost based on argument types. /// If ScalarizationCostPassed is std::numeric_limits::max(), the /// cost of scalarizing the arguments and the return value will be computed /// based on types. InstructionCost getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) { Intrinsic::ID IID = ICA.getID(); Type *RetTy = ICA.getReturnType(); const SmallVectorImpl &Tys = ICA.getArgTypes(); FastMathFlags FMF = ICA.getFlags(); InstructionCost ScalarizationCostPassed = ICA.getScalarizationCost(); bool SkipScalarizationCost = ICA.skipScalarizationCost(); VectorType *VecOpTy = nullptr; if (!Tys.empty()) { // The vector reduction operand is operand 0 except for fadd/fmul. // Their operand 0 is a scalar start value, so the vector op is operand 1. unsigned VecTyIndex = 0; if (IID == Intrinsic::vector_reduce_fadd || IID == Intrinsic::vector_reduce_fmul) VecTyIndex = 1; assert(Tys.size() > VecTyIndex && "Unexpected IntrinsicCostAttributes"); VecOpTy = dyn_cast(Tys[VecTyIndex]); } // Library call cost - other than size, make it expensive. unsigned SingleCallCost = CostKind == TTI::TCK_CodeSize ? 1 : 10; unsigned ISD = 0; switch (IID) { default: { // Scalable vectors cannot be scalarized, so return Invalid. if (isa(RetTy) || any_of(Tys, [](const Type *Ty) { return isa(Ty); })) return InstructionCost::getInvalid(); // Assume that we need to scalarize this intrinsic. InstructionCost ScalarizationCost = SkipScalarizationCost ? ScalarizationCostPassed : 0; unsigned ScalarCalls = 1; Type *ScalarRetTy = RetTy; if (auto *RetVTy = dyn_cast(RetTy)) { if (!SkipScalarizationCost) ScalarizationCost = getScalarizationOverhead(RetVTy, true, false); ScalarCalls = std::max(ScalarCalls, cast(RetVTy)->getNumElements()); ScalarRetTy = RetTy->getScalarType(); } SmallVector ScalarTys; for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) { Type *Ty = Tys[i]; if (auto *VTy = dyn_cast(Ty)) { if (!SkipScalarizationCost) ScalarizationCost += getScalarizationOverhead(VTy, false, true); ScalarCalls = std::max(ScalarCalls, cast(VTy)->getNumElements()); Ty = Ty->getScalarType(); } ScalarTys.push_back(Ty); } if (ScalarCalls == 1) return 1; // Return cost of a scalar intrinsic. Assume it to be cheap. IntrinsicCostAttributes ScalarAttrs(IID, ScalarRetTy, ScalarTys, FMF); InstructionCost ScalarCost = thisT()->getIntrinsicInstrCost(ScalarAttrs, CostKind); return ScalarCalls * ScalarCost + ScalarizationCost; } // Look for intrinsics that can be lowered directly or turned into a scalar // intrinsic call. case Intrinsic::sqrt: ISD = ISD::FSQRT; break; case Intrinsic::sin: ISD = ISD::FSIN; break; case Intrinsic::cos: ISD = ISD::FCOS; break; case Intrinsic::exp: ISD = ISD::FEXP; break; case Intrinsic::exp2: ISD = ISD::FEXP2; break; case Intrinsic::log: ISD = ISD::FLOG; break; case Intrinsic::log10: ISD = ISD::FLOG10; break; case Intrinsic::log2: ISD = ISD::FLOG2; break; case Intrinsic::fabs: ISD = ISD::FABS; break; case Intrinsic::canonicalize: ISD = ISD::FCANONICALIZE; break; case Intrinsic::minnum: ISD = ISD::FMINNUM; break; case Intrinsic::maxnum: ISD = ISD::FMAXNUM; break; case Intrinsic::minimum: ISD = ISD::FMINIMUM; break; case Intrinsic::maximum: ISD = ISD::FMAXIMUM; break; case Intrinsic::copysign: ISD = ISD::FCOPYSIGN; break; case Intrinsic::floor: ISD = ISD::FFLOOR; break; case Intrinsic::ceil: ISD = ISD::FCEIL; break; case Intrinsic::trunc: ISD = ISD::FTRUNC; break; case Intrinsic::nearbyint: ISD = ISD::FNEARBYINT; break; case Intrinsic::rint: ISD = ISD::FRINT; break; case Intrinsic::round: ISD = ISD::FROUND; break; case Intrinsic::roundeven: ISD = ISD::FROUNDEVEN; break; case Intrinsic::pow: ISD = ISD::FPOW; break; case Intrinsic::fma: ISD = ISD::FMA; break; case Intrinsic::fmuladd: ISD = ISD::FMA; break; case Intrinsic::experimental_constrained_fmuladd: ISD = ISD::STRICT_FMA; break; // FIXME: We should return 0 whenever getIntrinsicCost == TCC_Free. case Intrinsic::lifetime_start: case Intrinsic::lifetime_end: case Intrinsic::sideeffect: case Intrinsic::pseudoprobe: case Intrinsic::arithmetic_fence: return 0; case Intrinsic::masked_store: { Type *Ty = Tys[0]; Align TyAlign = thisT()->DL.getABITypeAlign(Ty); return thisT()->getMaskedMemoryOpCost(Instruction::Store, Ty, TyAlign, 0, CostKind); } case Intrinsic::masked_load: { Type *Ty = RetTy; Align TyAlign = thisT()->DL.getABITypeAlign(Ty); return thisT()->getMaskedMemoryOpCost(Instruction::Load, Ty, TyAlign, 0, CostKind); } case Intrinsic::vector_reduce_add: return thisT()->getArithmeticReductionCost(Instruction::Add, VecOpTy, None, CostKind); case Intrinsic::vector_reduce_mul: return thisT()->getArithmeticReductionCost(Instruction::Mul, VecOpTy, None, CostKind); case Intrinsic::vector_reduce_and: return thisT()->getArithmeticReductionCost(Instruction::And, VecOpTy, None, CostKind); case Intrinsic::vector_reduce_or: return thisT()->getArithmeticReductionCost(Instruction::Or, VecOpTy, None, CostKind); case Intrinsic::vector_reduce_xor: return thisT()->getArithmeticReductionCost(Instruction::Xor, VecOpTy, None, CostKind); case Intrinsic::vector_reduce_fadd: return thisT()->getArithmeticReductionCost(Instruction::FAdd, VecOpTy, FMF, CostKind); case Intrinsic::vector_reduce_fmul: return thisT()->getArithmeticReductionCost(Instruction::FMul, VecOpTy, FMF, CostKind); case Intrinsic::vector_reduce_smax: case Intrinsic::vector_reduce_smin: case Intrinsic::vector_reduce_fmax: case Intrinsic::vector_reduce_fmin: return thisT()->getMinMaxReductionCost( VecOpTy, cast(CmpInst::makeCmpResultType(VecOpTy)), /*IsUnsigned=*/false, CostKind); case Intrinsic::vector_reduce_umax: case Intrinsic::vector_reduce_umin: return thisT()->getMinMaxReductionCost( VecOpTy, cast(CmpInst::makeCmpResultType(VecOpTy)), /*IsUnsigned=*/true, CostKind); case Intrinsic::abs: { // abs(X) = select(icmp(X,0),X,sub(0,X)) Type *CondTy = RetTy->getWithNewBitWidth(1); CmpInst::Predicate Pred = CmpInst::ICMP_SGT; InstructionCost Cost = 0; Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, Pred, CostKind); Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, Pred, CostKind); // TODO: Should we add an OperandValueProperties::OP_Zero property? Cost += thisT()->getArithmeticInstrCost( BinaryOperator::Sub, RetTy, CostKind, TTI::OK_UniformConstantValue); return Cost; } case Intrinsic::smax: case Intrinsic::smin: case Intrinsic::umax: case Intrinsic::umin: { // minmax(X,Y) = select(icmp(X,Y),X,Y) Type *CondTy = RetTy->getWithNewBitWidth(1); bool IsUnsigned = IID == Intrinsic::umax || IID == Intrinsic::umin; CmpInst::Predicate Pred = IsUnsigned ? CmpInst::ICMP_UGT : CmpInst::ICMP_SGT; InstructionCost Cost = 0; Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, Pred, CostKind); Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, Pred, CostKind); return Cost; } case Intrinsic::sadd_sat: case Intrinsic::ssub_sat: { Type *CondTy = RetTy->getWithNewBitWidth(1); Type *OpTy = StructType::create({RetTy, CondTy}); Intrinsic::ID OverflowOp = IID == Intrinsic::sadd_sat ? Intrinsic::sadd_with_overflow : Intrinsic::ssub_with_overflow; CmpInst::Predicate Pred = CmpInst::ICMP_SGT; // SatMax -> Overflow && SumDiff < 0 // SatMin -> Overflow && SumDiff >= 0 InstructionCost Cost = 0; IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF, nullptr, ScalarizationCostPassed); Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind); Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, Pred, CostKind); Cost += 2 * thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, Pred, CostKind); return Cost; } case Intrinsic::uadd_sat: case Intrinsic::usub_sat: { Type *CondTy = RetTy->getWithNewBitWidth(1); Type *OpTy = StructType::create({RetTy, CondTy}); Intrinsic::ID OverflowOp = IID == Intrinsic::uadd_sat ? Intrinsic::uadd_with_overflow : Intrinsic::usub_with_overflow; InstructionCost Cost = 0; IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF, nullptr, ScalarizationCostPassed); Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind); Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, CmpInst::BAD_ICMP_PREDICATE, CostKind); return Cost; } case Intrinsic::smul_fix: case Intrinsic::umul_fix: { unsigned ExtSize = RetTy->getScalarSizeInBits() * 2; Type *ExtTy = RetTy->getWithNewBitWidth(ExtSize); unsigned ExtOp = IID == Intrinsic::smul_fix ? Instruction::SExt : Instruction::ZExt; TTI::CastContextHint CCH = TTI::CastContextHint::None; InstructionCost Cost = 0; Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, RetTy, CCH, CostKind); Cost += thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind); Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, RetTy, ExtTy, CCH, CostKind); Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, RetTy, CostKind, TTI::OK_AnyValue, TTI::OK_UniformConstantValue); Cost += thisT()->getArithmeticInstrCost(Instruction::Shl, RetTy, CostKind, TTI::OK_AnyValue, TTI::OK_UniformConstantValue); Cost += thisT()->getArithmeticInstrCost(Instruction::Or, RetTy, CostKind); return Cost; } case Intrinsic::sadd_with_overflow: case Intrinsic::ssub_with_overflow: { Type *SumTy = RetTy->getContainedType(0); Type *OverflowTy = RetTy->getContainedType(1); unsigned Opcode = IID == Intrinsic::sadd_with_overflow ? BinaryOperator::Add : BinaryOperator::Sub; // Add: // Overflow -> (Result < LHS) ^ (RHS < 0) // Sub: // Overflow -> (Result < LHS) ^ (RHS > 0) InstructionCost Cost = 0; Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind); Cost += 2 * thisT()->getCmpSelInstrCost( Instruction::ICmp, SumTy, OverflowTy, CmpInst::ICMP_SGT, CostKind); Cost += thisT()->getArithmeticInstrCost(BinaryOperator::Xor, OverflowTy, CostKind); return Cost; } case Intrinsic::uadd_with_overflow: case Intrinsic::usub_with_overflow: { Type *SumTy = RetTy->getContainedType(0); Type *OverflowTy = RetTy->getContainedType(1); unsigned Opcode = IID == Intrinsic::uadd_with_overflow ? BinaryOperator::Add : BinaryOperator::Sub; CmpInst::Predicate Pred = IID == Intrinsic::uadd_with_overflow ? CmpInst::ICMP_ULT : CmpInst::ICMP_UGT; InstructionCost Cost = 0; Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind); Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SumTy, OverflowTy, Pred, CostKind); return Cost; } case Intrinsic::smul_with_overflow: case Intrinsic::umul_with_overflow: { Type *MulTy = RetTy->getContainedType(0); Type *OverflowTy = RetTy->getContainedType(1); unsigned ExtSize = MulTy->getScalarSizeInBits() * 2; Type *ExtTy = MulTy->getWithNewBitWidth(ExtSize); bool IsSigned = IID == Intrinsic::smul_with_overflow; unsigned ExtOp = IsSigned ? Instruction::SExt : Instruction::ZExt; TTI::CastContextHint CCH = TTI::CastContextHint::None; InstructionCost Cost = 0; Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, MulTy, CCH, CostKind); Cost += thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind); Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, MulTy, ExtTy, CCH, CostKind); Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, ExtTy, CostKind, TTI::OK_AnyValue, TTI::OK_UniformConstantValue); if (IsSigned) Cost += thisT()->getArithmeticInstrCost(Instruction::AShr, MulTy, CostKind, TTI::OK_AnyValue, TTI::OK_UniformConstantValue); Cost += thisT()->getCmpSelInstrCost( BinaryOperator::ICmp, MulTy, OverflowTy, CmpInst::ICMP_NE, CostKind); return Cost; } case Intrinsic::fptosi_sat: case Intrinsic::fptoui_sat: { if (Tys.empty()) break; Type *FromTy = Tys[0]; bool IsSigned = IID == Intrinsic::fptosi_sat; InstructionCost Cost = 0; IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FromTy, {FromTy, FromTy}); Cost += thisT()->getIntrinsicInstrCost(Attrs1, CostKind); IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FromTy, {FromTy, FromTy}); Cost += thisT()->getIntrinsicInstrCost(Attrs2, CostKind); Cost += thisT()->getCastInstrCost( IsSigned ? Instruction::FPToSI : Instruction::FPToUI, RetTy, FromTy, TTI::CastContextHint::None, CostKind); if (IsSigned) { Type *CondTy = RetTy->getWithNewBitWidth(1); Cost += thisT()->getCmpSelInstrCost( BinaryOperator::FCmp, FromTy, CondTy, CmpInst::FCMP_UNO, CostKind); Cost += thisT()->getCmpSelInstrCost( BinaryOperator::Select, RetTy, CondTy, CmpInst::FCMP_UNO, CostKind); } return Cost; } case Intrinsic::ctpop: ISD = ISD::CTPOP; // In case of legalization use TCC_Expensive. This is cheaper than a // library call but still not a cheap instruction. SingleCallCost = TargetTransformInfo::TCC_Expensive; break; case Intrinsic::ctlz: ISD = ISD::CTLZ; break; case Intrinsic::cttz: ISD = ISD::CTTZ; break; case Intrinsic::bswap: ISD = ISD::BSWAP; break; case Intrinsic::bitreverse: ISD = ISD::BITREVERSE; break; } const TargetLoweringBase *TLI = getTLI(); std::pair LT = TLI->getTypeLegalizationCost(DL, RetTy); if (TLI->isOperationLegalOrPromote(ISD, LT.second)) { if (IID == Intrinsic::fabs && LT.second.isFloatingPoint() && TLI->isFAbsFree(LT.second)) { return 0; } // The operation is legal. Assume it costs 1. // If the type is split to multiple registers, assume that there is some // overhead to this. // TODO: Once we have extract/insert subvector cost we need to use them. if (LT.first > 1) return (LT.first * 2); else return (LT.first * 1); } else if (!TLI->isOperationExpand(ISD, LT.second)) { // If the operation is custom lowered then assume // that the code is twice as expensive. return (LT.first * 2); } // If we can't lower fmuladd into an FMA estimate the cost as a floating // point mul followed by an add. if (IID == Intrinsic::fmuladd) return thisT()->getArithmeticInstrCost(BinaryOperator::FMul, RetTy, CostKind) + thisT()->getArithmeticInstrCost(BinaryOperator::FAdd, RetTy, CostKind); if (IID == Intrinsic::experimental_constrained_fmuladd) { IntrinsicCostAttributes FMulAttrs( Intrinsic::experimental_constrained_fmul, RetTy, Tys); IntrinsicCostAttributes FAddAttrs( Intrinsic::experimental_constrained_fadd, RetTy, Tys); return thisT()->getIntrinsicInstrCost(FMulAttrs, CostKind) + thisT()->getIntrinsicInstrCost(FAddAttrs, CostKind); } // Else, assume that we need to scalarize this intrinsic. For math builtins // this will emit a costly libcall, adding call overhead and spills. Make it // very expensive. if (auto *RetVTy = dyn_cast(RetTy)) { // Scalable vectors cannot be scalarized, so return Invalid. if (isa(RetTy) || any_of(Tys, [](const Type *Ty) { return isa(Ty); })) return InstructionCost::getInvalid(); InstructionCost ScalarizationCost = SkipScalarizationCost ? ScalarizationCostPassed : getScalarizationOverhead(RetVTy, true, false); unsigned ScalarCalls = cast(RetVTy)->getNumElements(); SmallVector ScalarTys; for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) { Type *Ty = Tys[i]; if (Ty->isVectorTy()) Ty = Ty->getScalarType(); ScalarTys.push_back(Ty); } IntrinsicCostAttributes Attrs(IID, RetTy->getScalarType(), ScalarTys, FMF); InstructionCost ScalarCost = thisT()->getIntrinsicInstrCost(Attrs, CostKind); for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) { if (auto *VTy = dyn_cast(Tys[i])) { if (!ICA.skipScalarizationCost()) ScalarizationCost += getScalarizationOverhead(VTy, false, true); ScalarCalls = std::max(ScalarCalls, cast(VTy)->getNumElements()); } } return ScalarCalls * ScalarCost + ScalarizationCost; } // This is going to be turned into a library call, make it expensive. return SingleCallCost; } /// Compute a cost of the given call instruction. /// /// Compute the cost of calling function F with return type RetTy and /// argument types Tys. F might be nullptr, in this case the cost of an /// arbitrary call with the specified signature will be returned. /// This is used, for instance, when we estimate call of a vector /// counterpart of the given function. /// \param F Called function, might be nullptr. /// \param RetTy Return value types. /// \param Tys Argument types. /// \returns The cost of Call instruction. InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef Tys, TTI::TargetCostKind CostKind) { return 10; } unsigned getNumberOfParts(Type *Tp) { std::pair LT = getTLI()->getTypeLegalizationCost(DL, Tp); return LT.first.isValid() ? *LT.first.getValue() : 0; } InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *, const SCEV *) { return 0; } /// Try to calculate arithmetic and shuffle op costs for reduction intrinsics. /// We're assuming that reduction operation are performing the following way: /// /// %val1 = shufflevector %val, %undef, /// /// \----------------v-------------/ \----------v------------/ /// n/2 elements n/2 elements /// %red1 = op %val, val1 /// After this operation we have a vector %red1 where only the first n/2 /// elements are meaningful, the second n/2 elements are undefined and can be /// dropped. All other operations are actually working with the vector of /// length n/2, not n, though the real vector length is still n. /// %val2 = shufflevector %red1, %undef, /// /// \----------------v-------------/ \----------v------------/ /// n/4 elements 3*n/4 elements /// %red2 = op %red1, val2 - working with the vector of /// length n/2, the resulting vector has length n/4 etc. /// /// The cost model should take into account that the actual length of the /// vector is reduced on each iteration. InstructionCost getTreeReductionCost(unsigned Opcode, VectorType *Ty, TTI::TargetCostKind CostKind) { // Targets must implement a default value for the scalable case, since // we don't know how many lanes the vector has. if (isa(Ty)) return InstructionCost::getInvalid(); Type *ScalarTy = Ty->getElementType(); unsigned NumVecElts = cast(Ty)->getNumElements(); if ((Opcode == Instruction::Or || Opcode == Instruction::And) && ScalarTy == IntegerType::getInt1Ty(Ty->getContext()) && NumVecElts >= 2) { // Or reduction for i1 is represented as: // %val = bitcast to iReduxWidth // %res = cmp ne iReduxWidth %val, 0 // And reduction for i1 is represented as: // %val = bitcast to iReduxWidth // %res = cmp eq iReduxWidth %val, 11111 Type *ValTy = IntegerType::get(Ty->getContext(), NumVecElts); return thisT()->getCastInstrCost(Instruction::BitCast, ValTy, Ty, TTI::CastContextHint::None, CostKind) + thisT()->getCmpSelInstrCost(Instruction::ICmp, ValTy, CmpInst::makeCmpResultType(ValTy), CmpInst::BAD_ICMP_PREDICATE, CostKind); } unsigned NumReduxLevels = Log2_32(NumVecElts); InstructionCost ArithCost = 0; InstructionCost ShuffleCost = 0; std::pair LT = thisT()->getTLI()->getTypeLegalizationCost(DL, Ty); unsigned LongVectorCount = 0; unsigned MVTLen = LT.second.isVector() ? LT.second.getVectorNumElements() : 1; while (NumVecElts > MVTLen) { NumVecElts /= 2; VectorType *SubTy = FixedVectorType::get(ScalarTy, NumVecElts); ShuffleCost += thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, None, NumVecElts, SubTy); ArithCost += thisT()->getArithmeticInstrCost(Opcode, SubTy, CostKind); Ty = SubTy; ++LongVectorCount; } NumReduxLevels -= LongVectorCount; // The minimal length of the vector is limited by the real length of vector // operations performed on the current platform. That's why several final // reduction operations are performed on the vectors with the same // architecture-dependent length. // By default reductions need one shuffle per reduction level. ShuffleCost += NumReduxLevels * thisT()->getShuffleCost( TTI::SK_PermuteSingleSrc, Ty, None, 0, Ty); ArithCost += NumReduxLevels * thisT()->getArithmeticInstrCost(Opcode, Ty, CostKind); return ShuffleCost + ArithCost + thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 0); } /// Try to calculate the cost of performing strict (in-order) reductions, /// which involves doing a sequence of floating point additions in lane /// order, starting with an initial value. For example, consider a scalar /// initial value 'InitVal' of type float and a vector of type <4 x float>: /// /// Vector = /// /// %add1 = %InitVal + %v0 /// %add2 = %add1 + %v1 /// %add3 = %add2 + %v2 /// %add4 = %add3 + %v3 /// /// As a simple estimate we can say the cost of such a reduction is 4 times /// the cost of a scalar FP addition. We can only estimate the costs for /// fixed-width vectors here because for scalable vectors we do not know the /// runtime number of operations. InstructionCost getOrderedReductionCost(unsigned Opcode, VectorType *Ty, TTI::TargetCostKind CostKind) { // Targets must implement a default value for the scalable case, since // we don't know how many lanes the vector has. if (isa(Ty)) return InstructionCost::getInvalid(); auto *VTy = cast(Ty); InstructionCost ExtractCost = getScalarizationOverhead(VTy, /*Insert=*/false, /*Extract=*/true); InstructionCost ArithCost = thisT()->getArithmeticInstrCost( Opcode, VTy->getElementType(), CostKind); ArithCost *= VTy->getNumElements(); return ExtractCost + ArithCost; } InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, Optional FMF, TTI::TargetCostKind CostKind) { if (TTI::requiresOrderedReduction(FMF)) return getOrderedReductionCost(Opcode, Ty, CostKind); return getTreeReductionCost(Opcode, Ty, CostKind); } /// Try to calculate op costs for min/max reduction operations. /// \param CondTy Conditional type for the Select instruction. InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, bool IsUnsigned, TTI::TargetCostKind CostKind) { // Targets must implement a default value for the scalable case, since // we don't know how many lanes the vector has. if (isa(Ty)) return InstructionCost::getInvalid(); Type *ScalarTy = Ty->getElementType(); Type *ScalarCondTy = CondTy->getElementType(); unsigned NumVecElts = cast(Ty)->getNumElements(); unsigned NumReduxLevels = Log2_32(NumVecElts); unsigned CmpOpcode; if (Ty->isFPOrFPVectorTy()) { CmpOpcode = Instruction::FCmp; } else { assert(Ty->isIntOrIntVectorTy() && "expecting floating point or integer type for min/max reduction"); CmpOpcode = Instruction::ICmp; } InstructionCost MinMaxCost = 0; InstructionCost ShuffleCost = 0; std::pair LT = thisT()->getTLI()->getTypeLegalizationCost(DL, Ty); unsigned LongVectorCount = 0; unsigned MVTLen = LT.second.isVector() ? LT.second.getVectorNumElements() : 1; while (NumVecElts > MVTLen) { NumVecElts /= 2; auto *SubTy = FixedVectorType::get(ScalarTy, NumVecElts); CondTy = FixedVectorType::get(ScalarCondTy, NumVecElts); ShuffleCost += thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, None, NumVecElts, SubTy); MinMaxCost += thisT()->getCmpSelInstrCost(CmpOpcode, SubTy, CondTy, CmpInst::BAD_ICMP_PREDICATE, CostKind) + thisT()->getCmpSelInstrCost(Instruction::Select, SubTy, CondTy, CmpInst::BAD_ICMP_PREDICATE, CostKind); Ty = SubTy; ++LongVectorCount; } NumReduxLevels -= LongVectorCount; // The minimal length of the vector is limited by the real length of vector // operations performed on the current platform. That's why several final // reduction opertions are perfomed on the vectors with the same // architecture-dependent length. ShuffleCost += NumReduxLevels * thisT()->getShuffleCost( TTI::SK_PermuteSingleSrc, Ty, None, 0, Ty); MinMaxCost += NumReduxLevels * (thisT()->getCmpSelInstrCost(CmpOpcode, Ty, CondTy, CmpInst::BAD_ICMP_PREDICATE, CostKind) + thisT()->getCmpSelInstrCost(Instruction::Select, Ty, CondTy, CmpInst::BAD_ICMP_PREDICATE, CostKind)); // The last min/max should be in vector registers and we counted it above. // So just need a single extractelement. return ShuffleCost + MinMaxCost + thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 0); } InstructionCost getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind) { // Without any native support, this is equivalent to the cost of // vecreduce.add(ext) or if IsMLA vecreduce.add(mul(ext, ext)) VectorType *ExtTy = VectorType::get(ResTy, Ty); InstructionCost RedCost = thisT()->getArithmeticReductionCost( Instruction::Add, ExtTy, None, CostKind); InstructionCost MulCost = 0; InstructionCost ExtCost = thisT()->getCastInstrCost( IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty, TTI::CastContextHint::None, CostKind); if (IsMLA) { MulCost = thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind); ExtCost *= 2; } return RedCost + MulCost + ExtCost; } InstructionCost getVectorSplitCost() { return 1; } /// @} }; /// Concrete BasicTTIImpl that can be used if no further customization /// is needed. class BasicTTIImpl : public BasicTTIImplBase { using BaseT = BasicTTIImplBase; friend class BasicTTIImplBase; const TargetSubtargetInfo *ST; const TargetLoweringBase *TLI; const TargetSubtargetInfo *getST() const { return ST; } const TargetLoweringBase *getTLI() const { return TLI; } public: explicit BasicTTIImpl(const TargetMachine *TM, const Function &F); }; } // end namespace llvm #endif // LLVM_CODEGEN_BASICTTIIMPL_H