//===- BasicTTIImpl.h -------------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file provides a helper that implements much of the TTI interface in
/// terms of the target-independent code generator and TargetLowering
/// interfaces.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_CODEGEN_BASICTTIIMPL_H
#define LLVM_CODEGEN_BASICTTIIMPL_H

#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/TargetTransformInfoImpl.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <limits>
#include <utility>

namespace llvm {

class Function;
class GlobalValue;
class LLVMContext;
class ScalarEvolution;
class SCEV;
class TargetMachine;

extern cl::opt<unsigned> PartialUnrollingThreshold;

/// Base class which can be used to help build a TTI implementation.
///
/// This class provides as much implementation of the TTI interface as is
/// possible using the target independent parts of the code generator.
///
/// In order to subclass it, your class must implement a getST() method to
/// return the subtarget, and a getTLI() method to return the target lowering.
/// We need these methods implemented in the derived class so that this class
/// doesn't have to duplicate storage for them.
template <typename T>
class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
private:
  using BaseT = TargetTransformInfoImplCRTPBase<T>;
  using TTI = TargetTransformInfo;

  /// Helper function to access this as a T.
  T *thisT() { return static_cast<T *>(this); }

  /// Estimate a cost of Broadcast as an extract and sequence of insert
  /// operations.
  unsigned getBroadcastShuffleOverhead(FixedVectorType *VTy) {
    unsigned Cost = 0;
    // Broadcast cost is equal to the cost of extracting the zero'th element
    // plus the cost of inserting it into every element of the result vector.
    Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, 0);

    for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
      Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, i);
    }
    return Cost;
  }

  /// Estimate a cost of shuffle as a sequence of extract and insert
  /// operations.
  unsigned getPermuteShuffleOverhead(FixedVectorType *VTy) {
    unsigned Cost = 0;
    // Shuffle cost is equal to the cost of extracting elements from its
    // arguments plus the cost of inserting them into the result vector.

    // e.g. <4 x float> has a mask of <0,5,2,7>, i.e. we need to extract from
    // index 0 of the first vector, index 1 of the second vector, index 2 of
    // the first vector and finally index 3 of the second vector, and insert
    // them at indices <0,1,2,3> of the result vector.
    for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
      Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, i);
      Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, i);
    }
    return Cost;
  }

  /// Estimate a cost of subvector extraction as a sequence of extract and
  /// insert operations.
  unsigned getExtractSubvectorOverhead(VectorType *VTy, int Index,
                                       FixedVectorType *SubVTy) {
    assert(VTy && SubVTy &&
           "Can only extract subvectors from vectors");
    int NumSubElts = SubVTy->getNumElements();
    assert((!isa<FixedVectorType>(VTy) ||
            (Index + NumSubElts) <=
                (int)cast<FixedVectorType>(VTy)->getNumElements()) &&
           "SK_ExtractSubvector index out of range");

    unsigned Cost = 0;
    // Subvector extraction cost is equal to the cost of extracting elements
    // from the source type plus the cost of inserting them into the result
    // vector type.
    for (int i = 0; i != NumSubElts; ++i) {
      Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
                                          i + Index);
      Cost +=
          thisT()->getVectorInstrCost(Instruction::InsertElement, SubVTy, i);
    }
    return Cost;
  }

  /// Estimate a cost of subvector insertion as a sequence of extract and
  /// insert operations.
  unsigned getInsertSubvectorOverhead(VectorType *VTy, int Index,
                                      FixedVectorType *SubVTy) {
    assert(VTy && SubVTy &&
           "Can only insert subvectors into vectors");
    int NumSubElts = SubVTy->getNumElements();
    assert((!isa<FixedVectorType>(VTy) ||
            (Index + NumSubElts) <=
                (int)cast<FixedVectorType>(VTy)->getNumElements()) &&
           "SK_InsertSubvector index out of range");

    unsigned Cost = 0;
    // Subvector insertion cost is equal to the cost of extracting elements
    // from the subvector type plus the cost of inserting them into the result
    // vector type.
    for (int i = 0; i != NumSubElts; ++i) {
      Cost +=
          thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVTy, i);
      Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy,
                                          i + Index);
    }
    return Cost;
  }

  /// Local query method delegates up to T which *must* implement this!
  const TargetSubtargetInfo *getST() const {
    return static_cast<const T *>(this)->getST();
  }

  /// Local query method delegates up to T which *must* implement this!
  const TargetLoweringBase *getTLI() const {
    return static_cast<const T *>(this)->getTLI();
  }

  static ISD::MemIndexedMode getISDIndexedMode(TTI::MemIndexedMode M) {
    switch (M) {
    case TTI::MIM_Unindexed:
      return ISD::UNINDEXED;
    case TTI::MIM_PreInc:
      return ISD::PRE_INC;
    case TTI::MIM_PreDec:
      return ISD::PRE_DEC;
    case TTI::MIM_PostInc:
      return ISD::POST_INC;
    case TTI::MIM_PostDec:
      return ISD::POST_DEC;
    }
    llvm_unreachable("Unexpected MemIndexedMode");
  }

protected:
  explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL)
      : BaseT(DL) {}
  virtual ~BasicTTIImplBase() = default;

  using TargetTransformInfoImplBase::DL;

public:
  /// \name Scalar TTI Implementations
  /// @{
  bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth,
                                      unsigned AddressSpace, unsigned Alignment,
                                      bool *Fast) const {
    EVT E = EVT::getIntegerVT(Context, BitWidth);
    return getTLI()->allowsMisalignedMemoryAccesses(
        E, AddressSpace, Alignment, MachineMemOperand::MONone, Fast);
  }

  bool hasBranchDivergence() { return false; }

  bool useGPUDivergenceAnalysis() { return false; }

  bool isSourceOfDivergence(const Value *V) { return false; }

  bool isAlwaysUniform(const Value *V) { return false; }

  unsigned getFlatAddressSpace() {
    // Return an invalid address space.
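    // (Note: targets that really do have a flat address space, e.g. GPU
    // backends, are expected to override this hook.)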
    return -1;
  }

  bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
                                  Intrinsic::ID IID) const {
    return false;
  }

  bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const {
    return getTLI()->getTargetMachine().isNoopAddrSpaceCast(FromAS, ToAS);
  }

  unsigned getAssumedAddrSpace(const Value *V) const {
    return getTLI()->getTargetMachine().getAssumedAddrSpace(V);
  }

  Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
                                          Value *NewV) const {
    return nullptr;
  }

  bool isLegalAddImmediate(int64_t imm) {
    return getTLI()->isLegalAddImmediate(imm);
  }

  bool isLegalICmpImmediate(int64_t imm) {
    return getTLI()->isLegalICmpImmediate(imm);
  }

  bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
                             bool HasBaseReg, int64_t Scale,
                             unsigned AddrSpace, Instruction *I = nullptr) {
    TargetLoweringBase::AddrMode AM;
    AM.BaseGV = BaseGV;
    AM.BaseOffs = BaseOffset;
    AM.HasBaseReg = HasBaseReg;
    AM.Scale = Scale;
    return getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace, I);
  }

  bool isIndexedLoadLegal(TTI::MemIndexedMode M, Type *Ty,
                          const DataLayout &DL) const {
    EVT VT = getTLI()->getValueType(DL, Ty);
    return getTLI()->isIndexedLoadLegal(getISDIndexedMode(M), VT);
  }

  bool isIndexedStoreLegal(TTI::MemIndexedMode M, Type *Ty,
                           const DataLayout &DL) const {
    EVT VT = getTLI()->getValueType(DL, Ty);
    return getTLI()->isIndexedStoreLegal(getISDIndexedMode(M), VT);
  }

  bool isLSRCostLess(TTI::LSRCost C1, TTI::LSRCost C2) {
    return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
  }

  bool isNumRegsMajorCostOfLSR() {
    return TargetTransformInfoImplBase::isNumRegsMajorCostOfLSR();
  }

  bool isProfitableLSRChainElement(Instruction *I) {
    return TargetTransformInfoImplBase::isProfitableLSRChainElement(I);
  }

  int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
                           bool HasBaseReg, int64_t Scale, unsigned AddrSpace) {
    TargetLoweringBase::AddrMode AM;
    AM.BaseGV = BaseGV;
    AM.BaseOffs = BaseOffset;
    AM.HasBaseReg = HasBaseReg;
    AM.Scale = Scale;
    return getTLI()->getScalingFactorCost(DL, AM, Ty, AddrSpace);
  }

  bool isTruncateFree(Type *Ty1, Type *Ty2) {
    return getTLI()->isTruncateFree(Ty1, Ty2);
  }

  bool isProfitableToHoist(Instruction *I) {
    return getTLI()->isProfitableToHoist(I);
  }

  bool useAA() const { return getST()->useAA(); }

  bool isTypeLegal(Type *Ty) {
    EVT VT = getTLI()->getValueType(DL, Ty);
    return getTLI()->isTypeLegal(VT);
  }

  unsigned getRegUsageForType(Type *Ty) {
    return getTLI()->getTypeLegalizationCost(DL, Ty).first;
  }

  int getGEPCost(Type *PointeeType, const Value *Ptr,
                 ArrayRef<const Value *> Operands) {
    return BaseT::getGEPCost(PointeeType, Ptr, Operands);
  }

  unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
                                            unsigned &JumpTableSize,
                                            ProfileSummaryInfo *PSI,
                                            BlockFrequencyInfo *BFI) {
    /// Try to find the estimated number of clusters. Note that the number of
    /// clusters identified in this function could be different from the actual
    /// numbers found in lowering. This function ignores switches that are
    /// lowered with a mix of jump table / bit test / BTree. This function was
    /// initially intended to be used when estimating the cost of a switch in
    /// the inline cost heuristic, but it's a generic cost model to be used in
    /// other places (e.g., in loop unrolling).
    unsigned N = SI.getNumCases();
    const TargetLoweringBase *TLI = getTLI();
    const DataLayout &DL = this->getDataLayout();

    JumpTableSize = 0;
    bool IsJTAllowed = TLI->areJTsAllowed(SI.getParent()->getParent());

    // Early exit if both a jump table and bit test are not allowed.
    if (N < 1 || (!IsJTAllowed && DL.getIndexSizeInBits(0u) < N))
      return N;

    APInt MaxCaseVal = SI.case_begin()->getCaseValue()->getValue();
    APInt MinCaseVal = MaxCaseVal;
    for (auto CI : SI.cases()) {
      const APInt &CaseVal = CI.getCaseValue()->getValue();
      if (CaseVal.sgt(MaxCaseVal))
        MaxCaseVal = CaseVal;
      if (CaseVal.slt(MinCaseVal))
        MinCaseVal = CaseVal;
    }

    // Check if suitable for a bit test
    if (N <= DL.getIndexSizeInBits(0u)) {
      SmallPtrSet<const BasicBlock *, 4> Dests;
      for (auto I : SI.cases())
        Dests.insert(I.getCaseSuccessor());

      if (TLI->isSuitableForBitTests(Dests.size(), N, MinCaseVal, MaxCaseVal,
                                     DL))
        return 1;
    }

    // Check if suitable for a jump table.
    if (IsJTAllowed) {
      if (N < 2 || N < TLI->getMinimumJumpTableEntries())
        return N;
      uint64_t Range =
          (MaxCaseVal - MinCaseVal)
              .getLimitedValue(std::numeric_limits<uint64_t>::max() - 1) + 1;
      // Check whether a range of clusters is dense enough for a jump table
      if (TLI->isSuitableForJumpTable(&SI, N, Range, PSI, BFI)) {
        JumpTableSize = Range;
        return 1;
      }
    }
    return N;
  }

  bool shouldBuildLookupTables() {
    const TargetLoweringBase *TLI = getTLI();
    return TLI->isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) ||
           TLI->isOperationLegalOrCustom(ISD::BRIND, MVT::Other);
  }

  bool haveFastSqrt(Type *Ty) {
    const TargetLoweringBase *TLI = getTLI();
    EVT VT = TLI->getValueType(DL, Ty);
    return TLI->isTypeLegal(VT) &&
           TLI->isOperationLegalOrCustom(ISD::FSQRT, VT);
  }

  bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
    return true;
  }

  unsigned getFPOpCost(Type *Ty) {
    // Check whether FADD is available, as a proxy for floating-point in
    // general.
    const TargetLoweringBase *TLI = getTLI();
    EVT VT = TLI->getValueType(DL, Ty);
    if (TLI->isOperationLegalOrCustomOrPromote(ISD::FADD, VT))
      return TargetTransformInfo::TCC_Basic;
    return TargetTransformInfo::TCC_Expensive;
  }

  unsigned getInliningThresholdMultiplier() { return 1; }
  unsigned adjustInliningThreshold(const CallBase *CB) { return 0; }

  int getInlinerVectorBonusPercent() { return 150; }

  void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                               TTI::UnrollingPreferences &UP) {
    // This unrolling functionality is target independent, but to provide some
    // motivation for its intended use, for x86:

    // According to the Intel 64 and IA-32 Architectures Optimization Reference
    // Manual, Intel Core models and later have a loop stream detector (and
    // associated uop queue) that can benefit from partial unrolling.
    // The relevant requirements are:
    //  - The loop must have no more than 4 (8 for Nehalem and later) branches
    //    taken, and none of them may be calls.
    //  - The loop can have no more than 18 (28 for Nehalem and later) uops.

    // According to the Software Optimization Guide for AMD Family 15h
    // Processors, models 30h-4fh (Steamroller and later) have a loop predictor
    // and loop buffer which can benefit from partial unrolling.
    // The relevant requirements are:
    //  - The loop must have fewer than 16 branches
    //  - The loop must have less than 40 uops in all executed loop branches

    // The number of taken branches in a loop is hard to estimate here, and
    // benchmarking has revealed that it is better not to be conservative when
    // estimating the branch count. As a result, we'll ignore the branch limits
    // until someone finds a case where it matters in practice.

    unsigned MaxOps;
    const TargetSubtargetInfo *ST = getST();
    if (PartialUnrollingThreshold.getNumOccurrences() > 0)
      MaxOps = PartialUnrollingThreshold;
    else if (ST->getSchedModel().LoopMicroOpBufferSize > 0)
      MaxOps = ST->getSchedModel().LoopMicroOpBufferSize;
    else
      return;

    // Scan the loop: don't unroll loops with calls.
    for (BasicBlock *BB : L->blocks()) {
      for (Instruction &I : *BB) {
        if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
          if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
            if (!thisT()->isLoweredToCall(F))
              continue;
          }

          return;
        }
      }
    }

    // Enable runtime and partial unrolling up to the specified size.
    // Enable using trip count upper bound to unroll loops.
    UP.Partial = UP.Runtime = UP.UpperBound = true;
    UP.PartialThreshold = MaxOps;

    // Avoid unrolling when optimizing for size.
    UP.OptSizeThreshold = 0;
    UP.PartialOptSizeThreshold = 0;

    // Set number of instructions optimized when "back edge"
    // becomes "fall through" to default value of 2.
    UP.BEInsns = 2;
  }

  void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                             TTI::PeelingPreferences &PP) {
    PP.PeelCount = 0;
    PP.AllowPeeling = true;
    PP.AllowLoopNestsPeeling = false;
    PP.PeelProfiledIterations = true;
  }

  bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
                                AssumptionCache &AC,
                                TargetLibraryInfo *LibInfo,
                                HardwareLoopInfo &HWLoopInfo) {
    return BaseT::isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
  }

  bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
                                   AssumptionCache &AC, TargetLibraryInfo *TLI,
                                   DominatorTree *DT,
                                   const LoopAccessInfo *LAI) {
    return BaseT::preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LAI);
  }

  bool emitGetActiveLaneMask() {
    return BaseT::emitGetActiveLaneMask();
  }

  Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
                                               IntrinsicInst &II) {
    return BaseT::instCombineIntrinsic(IC, II);
  }

  Optional<Value *> simplifyDemandedUseBitsIntrinsic(InstCombiner &IC,
                                                     IntrinsicInst &II,
                                                     APInt DemandedMask,
                                                     KnownBits &Known,
                                                     bool &KnownBitsComputed) {
    return BaseT::simplifyDemandedUseBitsIntrinsic(IC, II, DemandedMask, Known,
                                                   KnownBitsComputed);
  }

  Optional<Value *> simplifyDemandedVectorEltsIntrinsic(
      InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
      APInt &UndefElts2, APInt &UndefElts3,
      std::function<void(Instruction *, unsigned, APInt, APInt &)>
          SimplifyAndSetOp) {
    return BaseT::simplifyDemandedVectorEltsIntrinsic(
        IC, II, DemandedElts, UndefElts, UndefElts2, UndefElts3,
        SimplifyAndSetOp);
  }

  int getInstructionLatency(const Instruction *I) {
    if (isa<LoadInst>(I))
      return getST()->getSchedModel().DefaultLoadLatency;

    return BaseT::getInstructionLatency(I);
  }

  virtual Optional<unsigned>
  getCacheSize(TargetTransformInfo::CacheLevel Level) const {
    return Optional<unsigned>(
        getST()->getCacheSize(static_cast<unsigned>(Level)));
  }

  virtual Optional<unsigned>
  getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const {
    Optional<unsigned> TargetResult =
        getST()->getCacheAssociativity(static_cast<unsigned>(Level));

    if (TargetResult)
      return TargetResult;

    return BaseT::getCacheAssociativity(Level);
  }

  virtual unsigned getCacheLineSize() const {
    return getST()->getCacheLineSize();
  }

  virtual unsigned getPrefetchDistance() const {
    return getST()->getPrefetchDistance();
  }

  virtual unsigned getMinPrefetchStride(unsigned NumMemAccesses,
                                        unsigned NumStridedMemAccesses,
                                        unsigned NumPrefetches,
                                        bool HasCall) const {
    return getST()->getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses,
                                         NumPrefetches, HasCall);
  }

  virtual unsigned getMaxPrefetchIterationsAhead() const {
    return getST()->getMaxPrefetchIterationsAhead();
  }

  virtual bool enableWritePrefetching() const {
    return getST()->enableWritePrefetching();
  }

  /// @}

  /// \name Vector TTI Implementations
  /// @{

  unsigned getRegisterBitWidth(bool Vector) const { return 32; }

  Optional<unsigned> getMaxVScale() const { return None; }

  /// Estimate the overhead of scalarizing an instruction. Insert and Extract
  /// are set if the demanded result elements need to be inserted and/or
  /// extracted from vectors.
  unsigned getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts,
                                    bool Insert, bool Extract) {
    /// FIXME: a bitfield is not a reasonable abstraction for talking about
    /// which elements are needed from a scalable vector
    auto *Ty = cast<FixedVectorType>(InTy);

    assert(DemandedElts.getBitWidth() == Ty->getNumElements() &&
           "Vector size mismatch");

    unsigned Cost = 0;

    for (int i = 0, e = Ty->getNumElements(); i < e; ++i) {
      if (!DemandedElts[i])
        continue;
      if (Insert)
        Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, Ty, i);
      if (Extract)
        Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, i);
    }

    return Cost;
  }

  /// Helper wrapper for the DemandedElts variant of getScalarizationOverhead.
  unsigned getScalarizationOverhead(VectorType *InTy, bool Insert,
                                    bool Extract) {
    auto *Ty = cast<FixedVectorType>(InTy);

    APInt DemandedElts = APInt::getAllOnesValue(Ty->getNumElements());
    return thisT()->getScalarizationOverhead(Ty, DemandedElts, Insert, Extract);
  }

  /// Estimate the overhead of scalarizing an instruction's unique
  /// non-constant operands. The types of the arguments are ordinarily
  /// scalar, in which case the costs are multiplied with VF.
  unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
                                            unsigned VF) {
    unsigned Cost = 0;
    SmallPtrSet<const Value *, 4> UniqueOperands;
    for (const Value *A : Args) {
      // Disregard things like metadata arguments.
      Type *Ty = A->getType();
      if (!Ty->isIntOrIntVectorTy() && !Ty->isFPOrFPVectorTy() &&
          !Ty->isPtrOrPtrVectorTy())
        continue;

      if (!isa<Constant>(A) && UniqueOperands.insert(A).second) {
        auto *VecTy = dyn_cast<VectorType>(Ty);
        if (VecTy) {
          // If A is a vector operand, VF should be 1 or correspond to A.
          assert((VF == 1 ||
                  VF == cast<FixedVectorType>(VecTy)->getNumElements()) &&
                 "Vector argument does not match VF");
        }
        else
          VecTy = FixedVectorType::get(Ty, VF);

        Cost += getScalarizationOverhead(VecTy, false, true);
      }
    }

    return Cost;
  }

  unsigned getScalarizationOverhead(VectorType *InTy,
                                    ArrayRef<const Value *> Args) {
    auto *Ty = cast<FixedVectorType>(InTy);

    unsigned Cost = 0;

    Cost += getScalarizationOverhead(Ty, true, false);
    if (!Args.empty())
      Cost += getOperandsScalarizationOverhead(Args, Ty->getNumElements());
    else
      // When no information on arguments is provided, we add the cost
      // associated with one argument as a heuristic.
      Cost += getScalarizationOverhead(Ty, false, true);

    return Cost;
  }

  unsigned getMaxInterleaveFactor(unsigned VF) { return 1; }

  unsigned getArithmeticInstrCost(
      unsigned Opcode, Type *Ty,
      TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
      TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
      TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
      TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
      TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
      ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
      const Instruction *CxtI = nullptr) {
    // Check if any of the operands are vector operands.
    const TargetLoweringBase *TLI = getTLI();
    int ISD = TLI->InstructionOpcodeToISD(Opcode);
    assert(ISD && "Invalid opcode");

    // TODO: Handle more cost kinds.
    if (CostKind != TTI::TCK_RecipThroughput)
      return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind,
                                           Opd1Info, Opd2Info,
                                           Opd1PropInfo, Opd2PropInfo,
                                           Args, CxtI);

    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);

    bool IsFloat = Ty->isFPOrFPVectorTy();
    // Assume that floating point arithmetic operations cost twice as much as
    // integer operations.
    unsigned OpCost = (IsFloat ? 2 : 1);

    if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
      // The operation is legal. Assume it costs 1.
      // TODO: Once we have extract/insert subvector cost we need to use them.
      return LT.first * OpCost;
    }

    if (!TLI->isOperationExpand(ISD, LT.second)) {
      // If the operation is custom lowered, then assume that the code is twice
      // as expensive.
      return LT.first * 2 * OpCost;
    }

    // Else, assume that we need to scalarize this op.
    // TODO: If one of the types get legalized by splitting, handle this
    // similarly to what getCastInstrCost() does.
    if (auto *VTy = dyn_cast<VectorType>(Ty)) {
      unsigned Num = cast<FixedVectorType>(VTy)->getNumElements();
      unsigned Cost = thisT()->getArithmeticInstrCost(
          Opcode, VTy->getScalarType(), CostKind, Opd1Info, Opd2Info,
          Opd1PropInfo, Opd2PropInfo, Args, CxtI);
      // Return the cost of multiple scalar invocations plus the cost of
      // inserting and extracting the values.
      return getScalarizationOverhead(VTy, Args) + Num * Cost;
    }

    // We don't know anything about this scalar instruction.
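    // Fall back to the raw per-operation cost computed above (1 for integer
    // operations, 2 for floating point).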
    return OpCost;
  }

  unsigned getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, int Index,
                          VectorType *SubTp) {

    switch (Kind) {
    case TTI::SK_Broadcast:
      return getBroadcastShuffleOverhead(cast<FixedVectorType>(Tp));
    case TTI::SK_Select:
    case TTI::SK_Reverse:
    case TTI::SK_Transpose:
    case TTI::SK_PermuteSingleSrc:
    case TTI::SK_PermuteTwoSrc:
      return getPermuteShuffleOverhead(cast<FixedVectorType>(Tp));
    case TTI::SK_ExtractSubvector:
      return getExtractSubvectorOverhead(Tp, Index,
                                         cast<FixedVectorType>(SubTp));
    case TTI::SK_InsertSubvector:
      return getInsertSubvectorOverhead(Tp, Index,
                                        cast<FixedVectorType>(SubTp));
    }
    llvm_unreachable("Unknown TTI::ShuffleKind");
  }

  unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
                            TTI::CastContextHint CCH,
                            TTI::TargetCostKind CostKind,
                            const Instruction *I = nullptr) {
    if (BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I) == 0)
      return 0;

    const TargetLoweringBase *TLI = getTLI();
    int ISD = TLI->InstructionOpcodeToISD(Opcode);
    assert(ISD && "Invalid opcode");
    std::pair<unsigned, MVT> SrcLT = TLI->getTypeLegalizationCost(DL, Src);
    std::pair<unsigned, MVT> DstLT = TLI->getTypeLegalizationCost(DL, Dst);

    TypeSize SrcSize = SrcLT.second.getSizeInBits();
    TypeSize DstSize = DstLT.second.getSizeInBits();
    bool IntOrPtrSrc = Src->isIntegerTy() || Src->isPointerTy();
    bool IntOrPtrDst = Dst->isIntegerTy() || Dst->isPointerTy();

    switch (Opcode) {
    default:
      break;
    case Instruction::Trunc:
      // Check for NOOP conversions.
      if (TLI->isTruncateFree(SrcLT.second, DstLT.second))
        return 0;
      LLVM_FALLTHROUGH;
    case Instruction::BitCast:
      // Bitcasts between types that are legalized to the same type are free,
      // and assume int to/from ptr of the same size is also free.
      if (SrcLT.first == DstLT.first && IntOrPtrSrc == IntOrPtrDst &&
          SrcSize == DstSize)
        return 0;
      break;
    case Instruction::FPExt:
      if (I && getTLI()->isExtFree(I))
        return 0;
      break;
    case Instruction::ZExt:
      if (TLI->isZExtFree(SrcLT.second, DstLT.second))
        return 0;
      LLVM_FALLTHROUGH;
    case Instruction::SExt:
      if (I && getTLI()->isExtFree(I))
        return 0;

      // If this is a zext/sext of a load, return 0 if the corresponding
      // extending load exists on target.
      if (CCH == TTI::CastContextHint::Normal) {
        EVT ExtVT = EVT::getEVT(Dst);
        EVT LoadVT = EVT::getEVT(Src);
        unsigned LType =
            ((Opcode == Instruction::ZExt) ? ISD::ZEXTLOAD : ISD::SEXTLOAD);
        if (TLI->isLoadExtLegal(LType, ExtVT, LoadVT))
          return 0;
      }
      break;
    case Instruction::AddrSpaceCast:
      if (TLI->isFreeAddrSpaceCast(Src->getPointerAddressSpace(),
                                   Dst->getPointerAddressSpace()))
        return 0;
      break;
    }

    auto *SrcVTy = dyn_cast<VectorType>(Src);
    auto *DstVTy = dyn_cast<VectorType>(Dst);

    // If the cast is marked as legal (or promote) then assume low cost.
    if (SrcLT.first == DstLT.first &&
        TLI->isOperationLegalOrPromote(ISD, DstLT.second))
      return SrcLT.first;

    // Handle scalar conversions.
    if (!SrcVTy && !DstVTy) {
      // Just check the op cost. If the operation is legal then assume it costs
      // 1.
      if (!TLI->isOperationExpand(ISD, DstLT.second))
        return 1;

      // Assume that illegal scalar instructions are expensive.
      return 4;
    }

    // Check vector-to-vector casts.
    if (DstVTy && SrcVTy) {
      // If the cast is between same-sized registers, then the check is simple.
      if (SrcLT.first == DstLT.first && SrcSize == DstSize) {

        // Assume that Zext is done using AND.
        if (Opcode == Instruction::ZExt)
          return SrcLT.first;

        // Assume that sext is done using SHL and SRA.
        if (Opcode == Instruction::SExt)
          return SrcLT.first * 2;

        // Just check the op cost. If the operation is legal then assume it
        // costs 1 and multiply by the type-legalization overhead.
        if (!TLI->isOperationExpand(ISD, DstLT.second))
          return SrcLT.first * 1;
      }

      // If we are legalizing by splitting, query the concrete TTI for the cost
      // of casting the original vector twice. We also need to factor in the
      // cost of the split itself. Count that as 1, to be consistent with
      // TLI->getTypeLegalizationCost().
      bool SplitSrc =
          TLI->getTypeAction(Src->getContext(), TLI->getValueType(DL, Src)) ==
          TargetLowering::TypeSplitVector;
      bool SplitDst =
          TLI->getTypeAction(Dst->getContext(), TLI->getValueType(DL, Dst)) ==
          TargetLowering::TypeSplitVector;
      if ((SplitSrc || SplitDst) &&
          cast<FixedVectorType>(SrcVTy)->getNumElements() > 1 &&
          cast<FixedVectorType>(DstVTy)->getNumElements() > 1) {
        Type *SplitDstTy = VectorType::getHalfElementsVectorType(DstVTy);
        Type *SplitSrcTy = VectorType::getHalfElementsVectorType(SrcVTy);
        T *TTI = static_cast<T *>(this);
        // If both types need to be split then the split is free.
        unsigned SplitCost =
            (!SplitSrc || !SplitDst) ? TTI->getVectorSplitCost() : 0;
        return SplitCost +
               (2 * TTI->getCastInstrCost(Opcode, SplitDstTy, SplitSrcTy, CCH,
                                          CostKind, I));
      }

      // In other cases where the source or destination are illegal, assume
      // the operation will get scalarized.
      unsigned Num = cast<FixedVectorType>(DstVTy)->getNumElements();
      unsigned Cost = thisT()->getCastInstrCost(
          Opcode, Dst->getScalarType(), Src->getScalarType(), CCH, CostKind, I);

      // Return the cost of multiple scalar invocations plus the cost of
      // inserting and extracting the values.
      return getScalarizationOverhead(DstVTy, true, true) + Num * Cost;
    }

    // We already handled vector-to-vector and scalar-to-scalar conversions.
    // This is where we handle bitcast between vectors and scalars. We need to
    // assume that the conversion is scalarized in one way or another.
    if (Opcode == Instruction::BitCast) {
      // Illegal bitcasts are done by storing and loading from a stack slot.
      return (SrcVTy ? getScalarizationOverhead(SrcVTy, false, true) : 0) +
             (DstVTy ? getScalarizationOverhead(DstVTy, true, false) : 0);
    }

    llvm_unreachable("Unhandled cast");
  }

  unsigned getExtractWithExtendCost(unsigned Opcode, Type *Dst,
                                    VectorType *VecTy, unsigned Index) {
    return thisT()->getVectorInstrCost(Instruction::ExtractElement, VecTy,
                                       Index) +
           thisT()->getCastInstrCost(Opcode, Dst, VecTy->getElementType(),
                                     TTI::CastContextHint::None,
                                     TTI::TCK_RecipThroughput);
  }

  unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind) {
    return BaseT::getCFInstrCost(Opcode, CostKind);
  }

  unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
                              CmpInst::Predicate VecPred,
                              TTI::TargetCostKind CostKind,
                              const Instruction *I = nullptr) {
    const TargetLoweringBase *TLI = getTLI();
    int ISD = TLI->InstructionOpcodeToISD(Opcode);
    assert(ISD && "Invalid opcode");

    // TODO: Handle other cost kinds.
    if (CostKind != TTI::TCK_RecipThroughput)
      return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                       I);

    // Selects on vectors are actually vector selects.
    if (ISD == ISD::SELECT) {
      assert(CondTy && "CondTy must exist");
      if (CondTy->isVectorTy())
        ISD = ISD::VSELECT;
    }
    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);

    if (!(ValTy->isVectorTy() && !LT.second.isVector()) &&
        !TLI->isOperationExpand(ISD, LT.second)) {
      // The operation is legal. Assume it costs 1. Multiply
      // by the type-legalization overhead.
      return LT.first * 1;
    }

    // Otherwise, assume that the operation is scalarized.
    // TODO: If one of the types get legalized by splitting, handle this
    // similarly to what getCastInstrCost() does.
    if (auto *ValVTy = dyn_cast<VectorType>(ValTy)) {
      unsigned Num = cast<FixedVectorType>(ValVTy)->getNumElements();
      if (CondTy)
        CondTy = CondTy->getScalarType();
      unsigned Cost = thisT()->getCmpSelInstrCost(
          Opcode, ValVTy->getScalarType(), CondTy, VecPred, CostKind, I);

      // Return the cost of multiple scalar invocations plus the cost of
      // inserting and extracting the values.
      return getScalarizationOverhead(ValVTy, true, false) + Num * Cost;
    }

    // Unknown scalar opcode.
    return 1;
  }

  unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
    std::pair<unsigned, MVT> LT =
        getTLI()->getTypeLegalizationCost(DL, Val->getScalarType());

    return LT.first;
  }

  unsigned getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,
                           unsigned AddressSpace,
                           TTI::TargetCostKind CostKind,
                           const Instruction *I = nullptr) {
    assert(!Src->isVoidTy() && "Invalid type");
    // Assume types, such as structs, are expensive.
    if (getTLI()->getValueType(DL, Src, true) == MVT::Other)
      return 4;
    std::pair<unsigned, MVT> LT = getTLI()->getTypeLegalizationCost(DL, Src);

    // Assuming that all loads of legal types cost 1.
    unsigned Cost = LT.first;
    if (CostKind != TTI::TCK_RecipThroughput)
      return Cost;

    if (Src->isVectorTy() &&
        // In practice it's not currently possible to have a change in lane
        // length for extending loads or truncating stores so both types should
        // have the same scalable property.
        TypeSize::isKnownLT(Src->getPrimitiveSizeInBits(),
                            LT.second.getSizeInBits())) {
      // This is a vector load that legalizes to a larger type than the vector
      // itself. Unless the corresponding extending load or truncating store is
      // legal, this will scalarize.
      TargetLowering::LegalizeAction LA = TargetLowering::Expand;
      EVT MemVT = getTLI()->getValueType(DL, Src);
      if (Opcode == Instruction::Store)
        LA = getTLI()->getTruncStoreAction(LT.second, MemVT);
      else
        LA = getTLI()->getLoadExtAction(ISD::EXTLOAD, LT.second, MemVT);

      if (LA != TargetLowering::Legal && LA != TargetLowering::Custom) {
        // This is a vector load/store for some illegal type that is scalarized.
        // We must account for the cost of building or decomposing the vector.
        Cost += getScalarizationOverhead(cast<VectorType>(Src),
                                         Opcode != Instruction::Store,
                                         Opcode == Instruction::Store);
      }
    }

    return Cost;
  }

  unsigned getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
                                  const Value *Ptr, bool VariableMask,
                                  Align Alignment, TTI::TargetCostKind CostKind,
                                  const Instruction *I = nullptr) {
    auto *VT = cast<FixedVectorType>(DataTy);
    // Assume the target does not have support for gather/scatter operations
    // and provide a rough estimate.
    //
    // First, compute the cost of extracting the individual addresses and the
    // individual memory operations.
    int LoadCost =
        VT->getNumElements() *
        (getVectorInstrCost(
             Instruction::ExtractElement,
             FixedVectorType::get(PointerType::get(VT->getElementType(), 0),
                                  VT->getNumElements()),
             -1) +
         getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind));

    // Next, compute the cost of packing the result in a vector.
    int PackingCost = getScalarizationOverhead(VT, Opcode != Instruction::Store,
                                               Opcode == Instruction::Store);

    int ConditionalCost = 0;
    if (VariableMask) {
      // Compute the cost of conditionally executing the memory operations with
      // variable masks. This includes extracting the individual conditions,
      // the branches and PHIs to combine the results.
      // NOTE: Estimating the cost of conditionally executing the memory
      // operations accurately is quite difficult and the current solution
      // provides a very rough estimate only.
      ConditionalCost =
          VT->getNumElements() *
          (getVectorInstrCost(
               Instruction::ExtractElement,
               FixedVectorType::get(Type::getInt1Ty(DataTy->getContext()),
                                    VT->getNumElements()),
               -1) +
           getCFInstrCost(Instruction::Br, CostKind) +
           getCFInstrCost(Instruction::PHI, CostKind));
    }

    return LoadCost + PackingCost + ConditionalCost;
  }

  unsigned getInterleavedMemoryOpCost(
      unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
      Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
      bool UseMaskForCond = false, bool UseMaskForGaps = false) {
    auto *VT = cast<FixedVectorType>(VecTy);

    unsigned NumElts = VT->getNumElements();
    assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");

    unsigned NumSubElts = NumElts / Factor;
    auto *SubVT = FixedVectorType::get(VT->getElementType(), NumSubElts);

    // First, the cost of the load/store operation.
    unsigned Cost;
    if (UseMaskForCond || UseMaskForGaps)
      Cost = thisT()->getMaskedMemoryOpCost(Opcode, VecTy, Alignment,
                                            AddressSpace, CostKind);
    else
      Cost = thisT()->getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace,
                                      CostKind);

    // Legalize the vector type, and get the legalized and unlegalized type
    // sizes.
    MVT VecTyLT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
    unsigned VecTySize = thisT()->getDataLayout().getTypeStoreSize(VecTy);
    unsigned VecTyLTSize = VecTyLT.getStoreSize();

    // Return the ceiling of dividing A by B.
    auto ceil = [](unsigned A, unsigned B) { return (A + B - 1) / B; };

    // Scale the cost of the memory operation by the fraction of legalized
    // instructions that will actually be used. We shouldn't account for the
    // cost of dead instructions since they will be removed.
    //
    // E.g., An interleaved load of factor 8:
    //       %vec = load <16 x i64>, <16 x i64>* %ptr
    //       %v0 = shufflevector %vec, undef, <0, 8>
    //
    // If <16 x i64> is legalized to 8 v2i64 loads, only 2 of the loads will be
    // used (those corresponding to elements [0:1] and [8:9] of the unlegalized
    // type). The other loads are unused.
    //
    // We only scale the cost of loads since interleaved store groups aren't
    // allowed to have gaps.
    if (Opcode == Instruction::Load && VecTySize > VecTyLTSize) {
      // The number of loads of a legal type it will take to represent a load
      // of the unlegalized vector type.
      unsigned NumLegalInsts = ceil(VecTySize, VecTyLTSize);

      // The number of elements of the unlegalized type that correspond to a
      // single legal instruction.
      unsigned NumEltsPerLegalInst = ceil(NumElts, NumLegalInsts);

      // Determine which legal instructions will be used.
      BitVector UsedInsts(NumLegalInsts, false);
      for (unsigned Index : Indices)
        for (unsigned Elt = 0; Elt < NumSubElts; ++Elt)
          UsedInsts.set((Index + Elt * Factor) / NumEltsPerLegalInst);

      // Scale the cost of the load by the fraction of legal instructions that
      // will be used.
      Cost *= UsedInsts.count() / NumLegalInsts;
    }

    // Then add the cost of the interleave operation.
    if (Opcode == Instruction::Load) {
      // The interleave cost is similar to extracting the sub vectors' elements
      // from the wide vector and inserting them into the sub vectors.
      //
      // E.g. An interleaved load of factor 2 (with one member of index 0):
      //      %vec = load <8 x i32>, <8 x i32>* %ptr
      //      %v0 = shuffle %vec, undef, <0, 2, 4, 6>  ; Index 0
      // The cost is estimated as extract elements at 0, 2, 4, 6 from the
      // <8 x i32> vector and insert them into a <4 x i32> vector.

      assert(Indices.size() <= Factor &&
             "Interleaved memory op has too many members");

      for (unsigned Index : Indices) {
        assert(Index < Factor && "Invalid index for interleaved memory op");

        // Extract elements from loaded vector for each sub vector.
        for (unsigned i = 0; i < NumSubElts; i++)
          Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VT,
                                              Index + i * Factor);
      }

      unsigned InsSubCost = 0;
      for (unsigned i = 0; i < NumSubElts; i++)
        InsSubCost +=
            thisT()->getVectorInstrCost(Instruction::InsertElement, SubVT, i);

      Cost += Indices.size() * InsSubCost;
    } else {
      // The interleave cost is extracting all elements from the sub vectors
      // and inserting them into the wide vector.
      //
      // E.g. An interleaved store of factor 2:
      //      %v0_v1 = shuffle %v0, %v1, <0, 4, 1, 5, 2, 6, 3, 7>
      //      store <8 x i32> %interleaved.vec, <8 x i32>* %ptr
      // The cost is estimated as extracting all elements from both <4 x i32>
      // vectors and inserting them into the <8 x i32> vector.

      unsigned ExtSubCost = 0;
      for (unsigned i = 0; i < NumSubElts; i++)
        ExtSubCost +=
            thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVT, i);
      Cost += ExtSubCost * Factor;

      for (unsigned i = 0; i < NumElts; i++)
        Cost += static_cast<T *>(this)
                    ->getVectorInstrCost(Instruction::InsertElement, VT, i);
    }

    if (!UseMaskForCond)
      return Cost;

    Type *I8Type = Type::getInt8Ty(VT->getContext());
    auto *MaskVT = FixedVectorType::get(I8Type, NumElts);
    SubVT = FixedVectorType::get(I8Type, NumSubElts);

    // The Mask shuffling cost is extracting all the elements of the Mask
    // and inserting each of them Factor times into the wide vector:
    //
    // E.g. an interleaved group with factor 3:
    //    %mask = icmp ult <8 x i32> %vec1, %vec2
    //    %interleaved.mask = shufflevector <8 x i1> %mask, <8 x i1> undef,
    //        <24 x i32> <0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7>
    // The cost is estimated as extracting all mask elements from the <8xi1>
    // mask vector and inserting them factor times into the <24xi1> shuffled
    // mask vector.
    for (unsigned i = 0; i < NumSubElts; i++)
      Cost +=
          thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVT, i);

    for (unsigned i = 0; i < NumElts; i++)
      Cost +=
          thisT()->getVectorInstrCost(Instruction::InsertElement, MaskVT, i);

    // The Gaps mask is invariant and created outside the loop, therefore the
    // cost of creating it is not accounted for here. However, if we have both
    // a MaskForGaps and some other mask that guards the execution of the
    // memory access, we need to account for the cost of And-ing the two masks
    // inside the loop.
    if (UseMaskForGaps)
      Cost += thisT()->getArithmeticInstrCost(BinaryOperator::And, MaskVT,
                                              CostKind);

    return Cost;
  }

  /// Get intrinsic cost based on arguments.
  unsigned getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                 TTI::TargetCostKind CostKind) {
    // Check for generically free intrinsics.
    if (BaseT::getIntrinsicInstrCost(ICA, CostKind) == 0)
      return 0;

    // Assume that target intrinsics are cheap.
    Intrinsic::ID IID = ICA.getID();
    if (Function::isTargetIntrinsic(IID))
      return TargetTransformInfo::TCC_Basic;

    if (ICA.isTypeBasedOnly())
      return getTypeBasedIntrinsicInstrCost(ICA, CostKind);

    Type *RetTy = ICA.getReturnType();

    ElementCount VF = ICA.getVectorFactor();
    ElementCount RetVF =
        (RetTy->isVectorTy() ? cast<VectorType>(RetTy)->getElementCount()
                             : ElementCount::getFixed(1));
    assert((RetVF.isScalar() || VF.isScalar()) &&
           "VF > 1 and RetVF is a vector type");
    const IntrinsicInst *I = ICA.getInst();
    const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
    FastMathFlags FMF = ICA.getFlags();
    switch (IID) {
    default:
      break;

    case Intrinsic::cttz:
      // FIXME: If necessary, this should go in target-specific overrides.
      if (VF.isScalar() && RetVF.isScalar() &&
          getTLI()->isCheapToSpeculateCttz())
        return TargetTransformInfo::TCC_Basic;
      break;

    case Intrinsic::ctlz:
      // FIXME: If necessary, this should go in target-specific overrides.
      if (VF.isScalar() && RetVF.isScalar() &&
          getTLI()->isCheapToSpeculateCtlz())
        return TargetTransformInfo::TCC_Basic;
      break;

    case Intrinsic::memcpy:
      return thisT()->getMemcpyCost(ICA.getInst());

    case Intrinsic::masked_scatter: {
      assert(VF.isScalar() && "Can't vectorize types here.");
      const Value *Mask = Args[3];
      bool VarMask = !isa<Constant>(Mask);
      Align Alignment = cast<ConstantInt>(Args[2])->getAlignValue();
      return thisT()->getGatherScatterOpCost(Instruction::Store,
                                             Args[0]->getType(), Args[1],
                                             VarMask, Alignment, CostKind, I);
    }
    case Intrinsic::masked_gather: {
      assert(VF.isScalar() && "Can't vectorize types here.");
      const Value *Mask = Args[2];
      bool VarMask = !isa<Constant>(Mask);
      Align Alignment = cast<ConstantInt>(Args[1])->getAlignValue();
      return thisT()->getGatherScatterOpCost(Instruction::Load, RetTy, Args[0],
                                             VarMask, Alignment, CostKind, I);
    }
    case Intrinsic::experimental_vector_extract: {
      // FIXME: Handle case where a scalable vector is extracted from a scalable
      // vector
      if (isa<ScalableVectorType>(RetTy))
        return BaseT::getIntrinsicInstrCost(ICA, CostKind);
      unsigned Index = cast<ConstantInt>(Args[1])->getZExtValue();
      return thisT()->getShuffleCost(TTI::SK_ExtractSubvector,
                                     cast<VectorType>(Args[0]->getType()),
                                     Index, cast<VectorType>(RetTy));
    }
    case Intrinsic::experimental_vector_insert: {
      // FIXME: Handle case where a scalable vector is inserted into a scalable
      // vector
      if (isa<ScalableVectorType>(Args[1]->getType()))
        return BaseT::getIntrinsicInstrCost(ICA, CostKind);
      unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue();
      return thisT()->getShuffleCost(
          TTI::SK_InsertSubvector, cast<VectorType>(Args[0]->getType()), Index,
          cast<VectorType>(Args[1]->getType()));
    }
    case Intrinsic::vector_reduce_add:
    case Intrinsic::vector_reduce_mul:
    case Intrinsic::vector_reduce_and:
    case Intrinsic::vector_reduce_or:
    case Intrinsic::vector_reduce_xor:
    case Intrinsic::vector_reduce_smax:
    case Intrinsic::vector_reduce_smin:
    case Intrinsic::vector_reduce_fmax:
    case Intrinsic::vector_reduce_fmin:
    case Intrinsic::vector_reduce_umax:
    case Intrinsic::vector_reduce_umin: {
      IntrinsicCostAttributes Attrs(IID, RetTy, Args[0]->getType(), FMF, 1, I);
      return getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
    }
    case Intrinsic::vector_reduce_fadd:
    case Intrinsic::vector_reduce_fmul: {
      IntrinsicCostAttributes Attrs(
          IID, RetTy, {Args[0]->getType(), Args[1]->getType()}, FMF, 1, I);
      return getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
    }
    case Intrinsic::fshl:
    case Intrinsic::fshr: {
      if (isa<ScalableVectorType>(RetTy))
        return BaseT::getIntrinsicInstrCost(ICA, CostKind);
      const Value *X = Args[0];
      const Value *Y = Args[1];
      const Value *Z = Args[2];
      TTI::OperandValueProperties OpPropsX, OpPropsY, OpPropsZ, OpPropsBW;
      TTI::OperandValueKind OpKindX = TTI::getOperandInfo(X, OpPropsX);
      TTI::OperandValueKind OpKindY = TTI::getOperandInfo(Y, OpPropsY);
      TTI::OperandValueKind OpKindZ = TTI::getOperandInfo(Z, OpPropsZ);
      TTI::OperandValueKind OpKindBW = TTI::OK_UniformConstantValue;
      OpPropsBW = isPowerOf2_32(RetTy->getScalarSizeInBits()) ? TTI::OP_PowerOf2
                                                              : TTI::OP_None;
      // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
      // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
      unsigned Cost = 0;
      Cost +=
          thisT()->getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind);
      Cost +=
          thisT()->getArithmeticInstrCost(BinaryOperator::Sub, RetTy, CostKind);
      Cost += thisT()->getArithmeticInstrCost(
          BinaryOperator::Shl, RetTy, CostKind, OpKindX, OpKindZ, OpPropsX);
      Cost += thisT()->getArithmeticInstrCost(
          BinaryOperator::LShr, RetTy, CostKind, OpKindY, OpKindZ, OpPropsY);
      // Non-constant shift amounts require a modulo.
      if (OpKindZ != TTI::OK_UniformConstantValue &&
          OpKindZ != TTI::OK_NonUniformConstantValue)
        Cost += thisT()->getArithmeticInstrCost(BinaryOperator::URem, RetTy,
                                                CostKind, OpKindZ, OpKindBW,
                                                OpPropsZ, OpPropsBW);
      // For non-rotates (X != Y) we must add shift-by-zero handling costs.
      if (X != Y) {
        Type *CondTy = RetTy->getWithNewBitWidth(1);
        Cost +=
            thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
                                        CmpInst::BAD_ICMP_PREDICATE, CostKind);
        Cost +=
            thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
                                        CmpInst::BAD_ICMP_PREDICATE, CostKind);
      }
      return Cost;
    }
    }
    // TODO: Handle the remaining intrinsics with scalable vector types.
    if (isa<ScalableVectorType>(RetTy))
      return BaseT::getIntrinsicInstrCost(ICA, CostKind);

    // Assume that we need to scalarize this intrinsic.
    SmallVector<Type *, 4> Types;
    for (const Value *Op : Args) {
      Type *OpTy = Op->getType();
      assert(VF.isScalar() || !OpTy->isVectorTy());
      Types.push_back(VF.isScalar()
                          ? OpTy
                          : FixedVectorType::get(OpTy, VF.getKnownMinValue()));
    }

    if (VF.isVector() && !RetTy->isVoidTy())
      RetTy = FixedVectorType::get(RetTy, VF.getKnownMinValue());

    // Compute the scalarization overhead based on Args for a vector
    // intrinsic. A vectorizer will pass a scalar RetTy and VF > 1, while
    // CostModel will pass a vector RetTy and VF is 1.
    unsigned ScalarizationCost = std::numeric_limits<unsigned>::max();
    if (RetVF.isVector() || VF.isVector()) {
      ScalarizationCost = 0;
      if (!RetTy->isVoidTy())
        ScalarizationCost +=
            getScalarizationOverhead(cast<VectorType>(RetTy), true, false);
      ScalarizationCost +=
          getOperandsScalarizationOverhead(Args, VF.getKnownMinValue());
    }

    IntrinsicCostAttributes Attrs(IID, RetTy, Types, FMF, ScalarizationCost, I);
    return thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
  }

  /// Get intrinsic cost based on argument types.
  /// If ScalarizationCostPassed is std::numeric_limits<unsigned>::max(), the
  /// cost of scalarizing the arguments and the return value will be computed
  /// based on types.
  unsigned getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                          TTI::TargetCostKind CostKind) {
    Intrinsic::ID IID = ICA.getID();
    Type *RetTy = ICA.getReturnType();
    const SmallVectorImpl<Type *> &Tys = ICA.getArgTypes();
    FastMathFlags FMF = ICA.getFlags();
    unsigned ScalarizationCostPassed = ICA.getScalarizationCost();
    bool SkipScalarizationCost = ICA.skipScalarizationCost();

    VectorType *VecOpTy = nullptr;
    if (!Tys.empty()) {
      // The vector reduction operand is operand 0 except for fadd/fmul.
      // Their operand 0 is a scalar start value, so the vector op is operand 1.
      unsigned VecTyIndex = 0;
      if (IID == Intrinsic::vector_reduce_fadd ||
          IID == Intrinsic::vector_reduce_fmul)
        VecTyIndex = 1;
      assert(Tys.size() > VecTyIndex && "Unexpected IntrinsicCostAttributes");
      VecOpTy = dyn_cast<VectorType>(Tys[VecTyIndex]);
    }

    // Library call cost - other than size, make it expensive.
    unsigned SingleCallCost = CostKind == TTI::TCK_CodeSize ? 1 : 10;
    SmallVector<unsigned, 2> ISDs;
    switch (IID) {
    default: {
      // Assume that we need to scalarize this intrinsic.
      unsigned ScalarizationCost = ScalarizationCostPassed;
      unsigned ScalarCalls = 1;
      Type *ScalarRetTy = RetTy;
      if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
        if (!SkipScalarizationCost)
          ScalarizationCost = getScalarizationOverhead(RetVTy, true, false);
        ScalarCalls = std::max(ScalarCalls,
                               cast<FixedVectorType>(RetVTy)->getNumElements());
        ScalarRetTy = RetTy->getScalarType();
      }
      SmallVector<Type *, 4> ScalarTys;
      for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
        Type *Ty = Tys[i];
        if (auto *VTy = dyn_cast<VectorType>(Ty)) {
          if (!SkipScalarizationCost)
            ScalarizationCost += getScalarizationOverhead(VTy, false, true);
          ScalarCalls = std::max(ScalarCalls,
                                 cast<FixedVectorType>(VTy)->getNumElements());
          Ty = Ty->getScalarType();
        }
        ScalarTys.push_back(Ty);
      }
      if (ScalarCalls == 1)
        return 1; // Return cost of a scalar intrinsic. Assume it to be cheap.

      IntrinsicCostAttributes ScalarAttrs(IID, ScalarRetTy, ScalarTys, FMF);
      unsigned ScalarCost =
          thisT()->getIntrinsicInstrCost(ScalarAttrs, CostKind);

      return ScalarCalls * ScalarCost + ScalarizationCost;
    }
    // Look for intrinsics that can be lowered directly or turned into a scalar
    // intrinsic call.
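    // (Note: the cases below either return a cost directly or record candidate
    // ISD opcodes in ISDs; the shared handling after this switch then prices
    // the call based on whether those opcodes are legal for the legalized
    // type.)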
1443 case Intrinsic::sqrt: 1444 ISDs.push_back(ISD::FSQRT); 1445 break; 1446 case Intrinsic::sin: 1447 ISDs.push_back(ISD::FSIN); 1448 break; 1449 case Intrinsic::cos: 1450 ISDs.push_back(ISD::FCOS); 1451 break; 1452 case Intrinsic::exp: 1453 ISDs.push_back(ISD::FEXP); 1454 break; 1455 case Intrinsic::exp2: 1456 ISDs.push_back(ISD::FEXP2); 1457 break; 1458 case Intrinsic::log: 1459 ISDs.push_back(ISD::FLOG); 1460 break; 1461 case Intrinsic::log10: 1462 ISDs.push_back(ISD::FLOG10); 1463 break; 1464 case Intrinsic::log2: 1465 ISDs.push_back(ISD::FLOG2); 1466 break; 1467 case Intrinsic::fabs: 1468 ISDs.push_back(ISD::FABS); 1469 break; 1470 case Intrinsic::canonicalize: 1471 ISDs.push_back(ISD::FCANONICALIZE); 1472 break; 1473 case Intrinsic::minnum: 1474 ISDs.push_back(ISD::FMINNUM); 1475 break; 1476 case Intrinsic::maxnum: 1477 ISDs.push_back(ISD::FMAXNUM); 1478 break; 1479 case Intrinsic::minimum: 1480 ISDs.push_back(ISD::FMINIMUM); 1481 break; 1482 case Intrinsic::maximum: 1483 ISDs.push_back(ISD::FMAXIMUM); 1484 break; 1485 case Intrinsic::copysign: 1486 ISDs.push_back(ISD::FCOPYSIGN); 1487 break; 1488 case Intrinsic::floor: 1489 ISDs.push_back(ISD::FFLOOR); 1490 break; 1491 case Intrinsic::ceil: 1492 ISDs.push_back(ISD::FCEIL); 1493 break; 1494 case Intrinsic::trunc: 1495 ISDs.push_back(ISD::FTRUNC); 1496 break; 1497 case Intrinsic::nearbyint: 1498 ISDs.push_back(ISD::FNEARBYINT); 1499 break; 1500 case Intrinsic::rint: 1501 ISDs.push_back(ISD::FRINT); 1502 break; 1503 case Intrinsic::round: 1504 ISDs.push_back(ISD::FROUND); 1505 break; 1506 case Intrinsic::roundeven: 1507 ISDs.push_back(ISD::FROUNDEVEN); 1508 break; 1509 case Intrinsic::pow: 1510 ISDs.push_back(ISD::FPOW); 1511 break; 1512 case Intrinsic::fma: 1513 ISDs.push_back(ISD::FMA); 1514 break; 1515 case Intrinsic::fmuladd: 1516 ISDs.push_back(ISD::FMA); 1517 break; 1518 case Intrinsic::experimental_constrained_fmuladd: 1519 ISDs.push_back(ISD::STRICT_FMA); 1520 break; 1521 // FIXME: We should return 0 whenever getIntrinsicCost == TCC_Free. 1522 case Intrinsic::lifetime_start: 1523 case Intrinsic::lifetime_end: 1524 case Intrinsic::sideeffect: 1525 case Intrinsic::pseudoprobe: 1526 return 0; 1527 case Intrinsic::masked_store: { 1528 Type *Ty = Tys[0]; 1529 Align TyAlign = thisT()->DL.getABITypeAlign(Ty); 1530 return thisT()->getMaskedMemoryOpCost(Instruction::Store, Ty, TyAlign, 0, 1531 CostKind); 1532 } 1533 case Intrinsic::masked_load: { 1534 Type *Ty = RetTy; 1535 Align TyAlign = thisT()->DL.getABITypeAlign(Ty); 1536 return thisT()->getMaskedMemoryOpCost(Instruction::Load, Ty, TyAlign, 0, 1537 CostKind); 1538 } 1539 case Intrinsic::vector_reduce_add: 1540 return thisT()->getArithmeticReductionCost(Instruction::Add, VecOpTy, 1541 /*IsPairwiseForm=*/false, 1542 CostKind); 1543 case Intrinsic::vector_reduce_mul: 1544 return thisT()->getArithmeticReductionCost(Instruction::Mul, VecOpTy, 1545 /*IsPairwiseForm=*/false, 1546 CostKind); 1547 case Intrinsic::vector_reduce_and: 1548 return thisT()->getArithmeticReductionCost(Instruction::And, VecOpTy, 1549 /*IsPairwiseForm=*/false, 1550 CostKind); 1551 case Intrinsic::vector_reduce_or: 1552 return thisT()->getArithmeticReductionCost(Instruction::Or, VecOpTy, 1553 /*IsPairwiseForm=*/false, 1554 CostKind); 1555 case Intrinsic::vector_reduce_xor: 1556 return thisT()->getArithmeticReductionCost(Instruction::Xor, VecOpTy, 1557 /*IsPairwiseForm=*/false, 1558 CostKind); 1559 case Intrinsic::vector_reduce_fadd: 1560 // FIXME: Add new flag for cost of strict reductions. 
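// Note that this prices llvm.vector.reduce.fadd (and llvm.vector.reduce.fmul
// below) as if the reduction were reassociable, i.e. as the shuffle-based
// tree reduction modelled by getArithmeticReductionCost; a strict, in-order
// reduction is likely to be more expensive than this estimate.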
1561 return thisT()->getArithmeticReductionCost(Instruction::FAdd, VecOpTy, 1562 /*IsPairwiseForm=*/false, 1563 CostKind); 1564 case Intrinsic::vector_reduce_fmul: 1565 // FIXME: Add new flag for cost of strict reductions. 1566 return thisT()->getArithmeticReductionCost(Instruction::FMul, VecOpTy, 1567 /*IsPairwiseForm=*/false, 1568 CostKind); 1569 case Intrinsic::vector_reduce_smax: 1570 case Intrinsic::vector_reduce_smin: 1571 case Intrinsic::vector_reduce_fmax: 1572 case Intrinsic::vector_reduce_fmin: 1573 return thisT()->getMinMaxReductionCost( 1574 VecOpTy, cast<VectorType>(CmpInst::makeCmpResultType(VecOpTy)), 1575 /*IsPairwiseForm=*/false, 1576 /*IsUnsigned=*/false, CostKind); 1577 case Intrinsic::vector_reduce_umax: 1578 case Intrinsic::vector_reduce_umin: 1579 return thisT()->getMinMaxReductionCost( 1580 VecOpTy, cast<VectorType>(CmpInst::makeCmpResultType(VecOpTy)), 1581 /*IsPairwiseForm=*/false, 1582 /*IsUnsigned=*/true, CostKind); 1583 case Intrinsic::abs: 1584 case Intrinsic::smax: 1585 case Intrinsic::smin: 1586 case Intrinsic::umax: 1587 case Intrinsic::umin: { 1588 // abs(X) = select(icmp(X,0),X,sub(0,X)) 1589 // minmax(X,Y) = select(icmp(X,Y),X,Y) 1590 Type *CondTy = RetTy->getWithNewBitWidth(1); 1591 unsigned Cost = 0; 1592 // TODO: Ideally getCmpSelInstrCost would accept an icmp condition code. 1593 Cost += 1594 thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, 1595 CmpInst::BAD_ICMP_PREDICATE, CostKind); 1596 Cost += 1597 thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, 1598 CmpInst::BAD_ICMP_PREDICATE, CostKind); 1599 // TODO: Should we add an OperandValueProperties::OP_Zero property? 1600 if (IID == Intrinsic::abs) 1601 Cost += thisT()->getArithmeticInstrCost( 1602 BinaryOperator::Sub, RetTy, CostKind, TTI::OK_UniformConstantValue); 1603 return Cost; 1604 } 1605 case Intrinsic::sadd_sat: 1606 case Intrinsic::ssub_sat: { 1607 Type *CondTy = RetTy->getWithNewBitWidth(1); 1608 1609 Type *OpTy = StructType::create({RetTy, CondTy}); 1610 Intrinsic::ID OverflowOp = IID == Intrinsic::sadd_sat 1611 ? Intrinsic::sadd_with_overflow 1612 : Intrinsic::ssub_with_overflow; 1613 1614 // SatMax -> Overflow && SumDiff < 0 1615 // SatMin -> Overflow && SumDiff >= 0 1616 unsigned Cost = 0; 1617 IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF, 1618 ScalarizationCostPassed); 1619 Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind); 1620 Cost += 1621 thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, 1622 CmpInst::BAD_ICMP_PREDICATE, CostKind); 1623 Cost += 2 * thisT()->getCmpSelInstrCost( 1624 BinaryOperator::Select, RetTy, CondTy, 1625 CmpInst::BAD_ICMP_PREDICATE, CostKind); 1626 return Cost; 1627 } 1628 case Intrinsic::uadd_sat: 1629 case Intrinsic::usub_sat: { 1630 Type *CondTy = RetTy->getWithNewBitWidth(1); 1631 1632 Type *OpTy = StructType::create({RetTy, CondTy}); 1633 Intrinsic::ID OverflowOp = IID == Intrinsic::uadd_sat 1634 ? 
Intrinsic::uadd_with_overflow 1635 : Intrinsic::usub_with_overflow; 1636 1637 unsigned Cost = 0; 1638 IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF, 1639 ScalarizationCostPassed); 1640 Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind); 1641 Cost += 1642 thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, 1643 CmpInst::BAD_ICMP_PREDICATE, CostKind); 1644 return Cost; 1645 } 1646 case Intrinsic::smul_fix: 1647 case Intrinsic::umul_fix: { 1648 unsigned ExtSize = RetTy->getScalarSizeInBits() * 2; 1649 Type *ExtTy = RetTy->getWithNewBitWidth(ExtSize); 1650 1651 unsigned ExtOp = 1652 IID == Intrinsic::smul_fix ? Instruction::SExt : Instruction::ZExt; 1653 TTI::CastContextHint CCH = TTI::CastContextHint::None; 1654 1655 unsigned Cost = 0; 1656 Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, RetTy, CCH, CostKind); 1657 Cost += 1658 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind); 1659 Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, RetTy, ExtTy, 1660 CCH, CostKind); 1661 Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, RetTy, 1662 CostKind, TTI::OK_AnyValue, 1663 TTI::OK_UniformConstantValue); 1664 Cost += thisT()->getArithmeticInstrCost(Instruction::Shl, RetTy, CostKind, 1665 TTI::OK_AnyValue, 1666 TTI::OK_UniformConstantValue); 1667 Cost += thisT()->getArithmeticInstrCost(Instruction::Or, RetTy, CostKind); 1668 return Cost; 1669 } 1670 case Intrinsic::sadd_with_overflow: 1671 case Intrinsic::ssub_with_overflow: { 1672 Type *SumTy = RetTy->getContainedType(0); 1673 Type *OverflowTy = RetTy->getContainedType(1); 1674 unsigned Opcode = IID == Intrinsic::sadd_with_overflow 1675 ? BinaryOperator::Add 1676 : BinaryOperator::Sub; 1677 1678 // LHSSign -> LHS >= 0 1679 // RHSSign -> RHS >= 0 1680 // SumSign -> Sum >= 0 1681 // 1682 // Add: 1683 // Overflow -> (LHSSign == RHSSign) && (LHSSign != SumSign) 1684 // Sub: 1685 // Overflow -> (LHSSign != RHSSign) && (LHSSign != SumSign) 1686 unsigned Cost = 0; 1687 Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind); 1688 Cost += 3 * thisT()->getCmpSelInstrCost( 1689 Instruction::ICmp, SumTy, OverflowTy, 1690 CmpInst::BAD_ICMP_PREDICATE, CostKind); 1691 Cost += 2 * thisT()->getCmpSelInstrCost( 1692 Instruction::Select, OverflowTy, OverflowTy, 1693 CmpInst::BAD_ICMP_PREDICATE, CostKind); 1694 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::And, OverflowTy, 1695 CostKind); 1696 return Cost; 1697 } 1698 case Intrinsic::uadd_with_overflow: 1699 case Intrinsic::usub_with_overflow: { 1700 Type *SumTy = RetTy->getContainedType(0); 1701 Type *OverflowTy = RetTy->getContainedType(1); 1702 unsigned Opcode = IID == Intrinsic::uadd_with_overflow 1703 ? BinaryOperator::Add 1704 : BinaryOperator::Sub; 1705 1706 unsigned Cost = 0; 1707 Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind); 1708 Cost += 1709 thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SumTy, OverflowTy, 1710 CmpInst::BAD_ICMP_PREDICATE, CostKind); 1711 return Cost; 1712 } 1713 case Intrinsic::smul_with_overflow: 1714 case Intrinsic::umul_with_overflow: { 1715 Type *MulTy = RetTy->getContainedType(0); 1716 Type *OverflowTy = RetTy->getContainedType(1); 1717 unsigned ExtSize = MulTy->getScalarSizeInBits() * 2; 1718 Type *ExtTy = MulTy->getWithNewBitWidth(ExtSize); 1719 1720 unsigned ExtOp = 1721 IID == Intrinsic::smul_with_overflow ?
Instruction::SExt : Instruction::ZExt; 1722 TTI::CastContextHint CCH = TTI::CastContextHint::None; 1723 1724 unsigned Cost = 0; 1725 Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, MulTy, CCH, CostKind); 1726 Cost += 1727 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind); 1728 Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, MulTy, ExtTy, 1729 CCH, CostKind); 1730 Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, MulTy, 1731 CostKind, TTI::OK_AnyValue, 1732 TTI::OK_UniformConstantValue); 1733 1734 if (IID == Intrinsic::smul_with_overflow) 1735 Cost += thisT()->getArithmeticInstrCost(Instruction::AShr, MulTy, 1736 CostKind, TTI::OK_AnyValue, 1737 TTI::OK_UniformConstantValue); 1738 1739 Cost += 1740 thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, MulTy, OverflowTy, 1741 CmpInst::BAD_ICMP_PREDICATE, CostKind); 1742 return Cost; 1743 } 1744 case Intrinsic::ctpop: 1745 ISDs.push_back(ISD::CTPOP); 1746 // In case of legalization use TCC_Expensive. This is cheaper than a 1747 // library call but still not a cheap instruction. 1748 SingleCallCost = TargetTransformInfo::TCC_Expensive; 1749 break; 1750 case Intrinsic::ctlz: 1751 ISDs.push_back(ISD::CTLZ); 1752 break; 1753 case Intrinsic::cttz: 1754 ISDs.push_back(ISD::CTTZ); 1755 break; 1756 case Intrinsic::bswap: 1757 ISDs.push_back(ISD::BSWAP); 1758 break; 1759 case Intrinsic::bitreverse: 1760 ISDs.push_back(ISD::BITREVERSE); 1761 break; 1762 } 1763 1764 const TargetLoweringBase *TLI = getTLI(); 1765 std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy); 1766 1767 SmallVector<unsigned, 2> LegalCost; 1768 SmallVector<unsigned, 2> CustomCost; 1769 for (unsigned ISD : ISDs) { 1770 if (TLI->isOperationLegalOrPromote(ISD, LT.second)) { 1771 if (IID == Intrinsic::fabs && LT.second.isFloatingPoint() && 1772 TLI->isFAbsFree(LT.second)) { 1773 return 0; 1774 } 1775 1776 // The operation is legal. Assume it costs 1. 1777 // If the type is split to multiple registers, assume that there is some 1778 // overhead to this. 1779 // TODO: Once we have extract/insert subvector cost we need to use them. 1780 if (LT.first > 1) 1781 LegalCost.push_back(LT.first * 2); 1782 else 1783 LegalCost.push_back(LT.first * 1); 1784 } else if (!TLI->isOperationExpand(ISD, LT.second)) { 1785 // If the operation is custom lowered then assume 1786 // that the code is twice as expensive. 1787 CustomCost.push_back(LT.first * 2); 1788 } 1789 } 1790 1791 auto *MinLegalCostI = std::min_element(LegalCost.begin(), LegalCost.end()); 1792 if (MinLegalCostI != LegalCost.end()) 1793 return *MinLegalCostI; 1794 1795 auto MinCustomCostI = 1796 std::min_element(CustomCost.begin(), CustomCost.end()); 1797 if (MinCustomCostI != CustomCost.end()) 1798 return *MinCustomCostI; 1799 1800 // If we can't lower fmuladd into an FMA estimate the cost as a floating 1801 // point mul followed by an add. 
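// That is, the fallback below treats
//   %r = call float @llvm.fmuladd.f32(float %a, float %b, float %c)
// as if it were expanded to
//   %m = fmul float %a, %b
//   %r = fadd float %m, %c
// (scalar form shown for illustration; a vector fmuladd is costed the same
// way on its full vector type).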
1802 if (IID == Intrinsic::fmuladd) 1803 return thisT()->getArithmeticInstrCost(BinaryOperator::FMul, RetTy, 1804 CostKind) + 1805 thisT()->getArithmeticInstrCost(BinaryOperator::FAdd, RetTy, 1806 CostKind); 1807 if (IID == Intrinsic::experimental_constrained_fmuladd) { 1808 IntrinsicCostAttributes FMulAttrs( 1809 Intrinsic::experimental_constrained_fmul, RetTy, Tys); 1810 IntrinsicCostAttributes FAddAttrs( 1811 Intrinsic::experimental_constrained_fadd, RetTy, Tys); 1812 return thisT()->getIntrinsicInstrCost(FMulAttrs, CostKind) + 1813 thisT()->getIntrinsicInstrCost(FAddAttrs, CostKind); 1814 } 1815 1816 // Else, assume that we need to scalarize this intrinsic. For math builtins 1817 // this will emit a costly libcall, adding call overhead and spills. Make it 1818 // very expensive. 1819 if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) { 1820 unsigned ScalarizationCost = SkipScalarizationCost ? 1821 ScalarizationCostPassed : getScalarizationOverhead(RetVTy, true, false); 1822 1823 unsigned ScalarCalls = cast<FixedVectorType>(RetVTy)->getNumElements(); 1824 SmallVector<Type *, 4> ScalarTys; 1825 for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) { 1826 Type *Ty = Tys[i]; 1827 if (Ty->isVectorTy()) 1828 Ty = Ty->getScalarType(); 1829 ScalarTys.push_back(Ty); 1830 } 1831 IntrinsicCostAttributes Attrs(IID, RetTy->getScalarType(), ScalarTys, FMF); 1832 unsigned ScalarCost = thisT()->getIntrinsicInstrCost(Attrs, CostKind); 1833 for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) { 1834 if (auto *VTy = dyn_cast<VectorType>(Tys[i])) { 1835 if (!ICA.skipScalarizationCost()) 1836 ScalarizationCost += getScalarizationOverhead(VTy, false, true); 1837 ScalarCalls = std::max(ScalarCalls, 1838 cast<FixedVectorType>(VTy)->getNumElements()); 1839 } 1840 } 1841 return ScalarCalls * ScalarCost + ScalarizationCost; 1842 } 1843 1844 // This is going to be turned into a library call, so make it expensive. 1845 return SingleCallCost; 1846 } 1847 1848 /// Compute a cost of the given call instruction. 1849 /// 1850 /// Compute the cost of calling function F with return type RetTy and 1851 /// argument types Tys. F might be nullptr; in that case the cost of an 1852 /// arbitrary call with the specified signature will be returned. 1853 /// This is used, for instance, when we estimate the call of a vector 1854 /// counterpart of the given function. 1855 /// \param F Called function, might be nullptr. 1856 /// \param RetTy Return value type. 1857 /// \param Tys Argument types. 1858 /// \returns The cost of the Call instruction. 1859 unsigned getCallInstrCost(Function *F, Type *RetTy, ArrayRef<Type *> Tys, 1860 TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency) { 1861 return 10; 1862 } 1863 getNumberOfParts(Type * Tp)1864 unsigned getNumberOfParts(Type *Tp) { 1865 std::pair<unsigned, MVT> LT = getTLI()->getTypeLegalizationCost(DL, Tp); 1866 return LT.first; 1867 } 1868 getAddressComputationCost(Type * Ty,ScalarEvolution *,const SCEV *)1869 unsigned getAddressComputationCost(Type *Ty, ScalarEvolution *, 1870 const SCEV *) { 1871 return 0; 1872 } 1873 1874 /// Try to calculate arithmetic and shuffle op costs for reduction operations. 1875 /// We're assuming that reduction operations are performed in the following way: 1876 /// 1.
Non-pairwise reduction 1877 /// %val1 = shufflevector<n x t> %val, <n x t> %undef, 1878 /// <n x i32> <i32 n/2, i32 n/2 + 1, ..., i32 n, i32 undef, ..., i32 undef> 1879 /// \----------------v-------------/ \----------v------------/ 1880 /// n/2 elements n/2 elements 1881 /// %red1 = op <n x t> %val, <n x t> val1 1882 /// After this operation we have a vector %red1 where only the first n/2 1883 /// elements are meaningful, the second n/2 elements are undefined and can be 1884 /// dropped. All other operations are actually working with the vector of 1885 /// length n/2, not n, though the real vector length is still n. 1886 /// %val2 = shufflevector<n x t> %red1, <n x t> %undef, 1887 /// <n x i32> <i32 n/4, i32 n/4 + 1, ..., i32 n/2, i32 undef, ..., i32 undef> 1888 /// \----------------v-------------/ \----------v------------/ 1889 /// n/4 elements 3*n/4 elements 1890 /// %red2 = op <n x t> %red1, <n x t> val2 - working with the vector of 1891 /// length n/2, the resulting vector has length n/4 etc. 1892 /// 2. Pairwise reduction: 1893 /// Everything is the same except for an additional shuffle operation which 1894 /// is used to produce operands for pairwise kind of reductions. 1895 /// %val1 = shufflevector<n x t> %val, <n x t> %undef, 1896 /// <n x i32> <i32 0, i32 2, ..., i32 n-2, i32 undef, ..., i32 undef> 1897 /// \-------------v----------/ \----------v------------/ 1898 /// n/2 elements n/2 elements 1899 /// %val2 = shufflevector<n x t> %val, <n x t> %undef, 1900 /// <n x i32> <i32 1, i32 3, ..., i32 n-1, i32 undef, ..., i32 undef> 1901 /// \-------------v----------/ \----------v------------/ 1902 /// n/2 elements n/2 elements 1903 /// %red1 = op <n x t> %val1, <n x t> val2 1904 /// Again, the operation is performed on <n x t> vector, but the resulting 1905 /// vector %red1 is <n/2 x t> vector. 1906 /// 1907 /// The cost model should take into account that the actual length of the 1908 /// vector is reduced on each iteration. getArithmeticReductionCost(unsigned Opcode,VectorType * Ty,bool IsPairwise,TTI::TargetCostKind CostKind)1909 unsigned getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, 1910 bool IsPairwise, 1911 TTI::TargetCostKind CostKind) { 1912 Type *ScalarTy = Ty->getElementType(); 1913 unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements(); 1914 unsigned NumReduxLevels = Log2_32(NumVecElts); 1915 unsigned ArithCost = 0; 1916 unsigned ShuffleCost = 0; 1917 std::pair<unsigned, MVT> LT = 1918 thisT()->getTLI()->getTypeLegalizationCost(DL, Ty); 1919 unsigned LongVectorCount = 0; 1920 unsigned MVTLen = 1921 LT.second.isVector() ? LT.second.getVectorNumElements() : 1; 1922 while (NumVecElts > MVTLen) { 1923 NumVecElts /= 2; 1924 VectorType *SubTy = FixedVectorType::get(ScalarTy, NumVecElts); 1925 // Assume the pairwise shuffles add a cost. 1926 ShuffleCost += 1927 (IsPairwise + 1) * thisT()->getShuffleCost(TTI::SK_ExtractSubvector, 1928 Ty, NumVecElts, SubTy); 1929 ArithCost += thisT()->getArithmeticInstrCost(Opcode, SubTy, CostKind); 1930 Ty = SubTy; 1931 ++LongVectorCount; 1932 } 1933 1934 NumReduxLevels -= LongVectorCount; 1935 1936 // The minimal length of the vector is limited by the real length of vector 1937 // operations performed on the current platform. That's why several final 1938 // reduction operations are performed on the vectors with the same 1939 // architecture-dependent length. 1940 1941 // Non pairwise reductions need one shuffle per reduction level. Pairwise 1942 // reductions need two shuffles on every level, but the last one. 
On that 1943 // level one of the shuffles is <0, u, u, ...> which is identity. 1944 unsigned NumShuffles = NumReduxLevels; 1945 if (IsPairwise && NumReduxLevels >= 1) 1946 NumShuffles += NumReduxLevels - 1; 1947 ShuffleCost += NumShuffles * 1948 thisT()->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, 0, Ty); 1949 ArithCost += NumReduxLevels * thisT()->getArithmeticInstrCost(Opcode, Ty); 1950 return ShuffleCost + ArithCost + 1951 thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 0); 1952 } 1953 1954 /// Try to calculate op costs for min/max reduction operations. 1955 /// \param CondTy Conditional type for the Select instruction. getMinMaxReductionCost(VectorType * Ty,VectorType * CondTy,bool IsPairwise,bool IsUnsigned,TTI::TargetCostKind CostKind)1956 unsigned getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, 1957 bool IsPairwise, bool IsUnsigned, 1958 TTI::TargetCostKind CostKind) { 1959 Type *ScalarTy = Ty->getElementType(); 1960 Type *ScalarCondTy = CondTy->getElementType(); 1961 unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements(); 1962 unsigned NumReduxLevels = Log2_32(NumVecElts); 1963 unsigned CmpOpcode; 1964 if (Ty->isFPOrFPVectorTy()) { 1965 CmpOpcode = Instruction::FCmp; 1966 } else { 1967 assert(Ty->isIntOrIntVectorTy() && 1968 "expecting floating point or integer type for min/max reduction"); 1969 CmpOpcode = Instruction::ICmp; 1970 } 1971 unsigned MinMaxCost = 0; 1972 unsigned ShuffleCost = 0; 1973 std::pair<unsigned, MVT> LT = 1974 thisT()->getTLI()->getTypeLegalizationCost(DL, Ty); 1975 unsigned LongVectorCount = 0; 1976 unsigned MVTLen = 1977 LT.second.isVector() ? LT.second.getVectorNumElements() : 1; 1978 while (NumVecElts > MVTLen) { 1979 NumVecElts /= 2; 1980 auto *SubTy = FixedVectorType::get(ScalarTy, NumVecElts); 1981 CondTy = FixedVectorType::get(ScalarCondTy, NumVecElts); 1982 1983 // Assume the pairwise shuffles add a cost. 1984 ShuffleCost += 1985 (IsPairwise + 1) * thisT()->getShuffleCost(TTI::SK_ExtractSubvector, 1986 Ty, NumVecElts, SubTy); 1987 MinMaxCost += 1988 thisT()->getCmpSelInstrCost(CmpOpcode, SubTy, CondTy, 1989 CmpInst::BAD_ICMP_PREDICATE, CostKind) + 1990 thisT()->getCmpSelInstrCost(Instruction::Select, SubTy, CondTy, 1991 CmpInst::BAD_ICMP_PREDICATE, CostKind); 1992 Ty = SubTy; 1993 ++LongVectorCount; 1994 } 1995 1996 NumReduxLevels -= LongVectorCount; 1997 1998 // The minimal length of the vector is limited by the real length of vector 1999 // operations performed on the current platform. That's why several final 2000 // reduction operations are performed on the vectors with the same 2001 // architecture-dependent length. 2002 2003 // Non pairwise reductions need one shuffle per reduction level. Pairwise 2004 // reductions need two shuffles on every level, but the last one. On that 2005 // level one of the shuffles is <0, u, u, ...> which is identity. 2006 unsigned NumShuffles = NumReduxLevels; 2007 if (IsPairwise && NumReduxLevels >= 1) 2008 NumShuffles += NumReduxLevels - 1; 2009 ShuffleCost += NumShuffles * 2010 thisT()->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, 0, Ty); 2011 MinMaxCost += 2012 NumReduxLevels * 2013 (thisT()->getCmpSelInstrCost(CmpOpcode, Ty, CondTy, 2014 CmpInst::BAD_ICMP_PREDICATE, CostKind) + 2015 thisT()->getCmpSelInstrCost(Instruction::Select, Ty, CondTy, 2016 CmpInst::BAD_ICMP_PREDICATE, CostKind)); 2017 // The last min/max should be in vector registers and we counted it above. 2018 // So just need a single extractelement.
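// Illustrative example (non-pairwise, with each underlying shuffle, compare,
// select and extract assumed to cost 1): reducing <8 x i32> with smin on a
// target whose widest legal vector type is v4i32 takes one splitting step
// (1 shuffle + 1 icmp + 1 select), then Log2_32(4) = 2 in-register levels
// (2 shuffles + 2 icmps + 2 selects), plus the final extractelement, for a
// total cost of 10.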
2019 return ShuffleCost + MinMaxCost + 2020 thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 0); 2021 } 2022 getExtendedAddReductionCost(bool IsMLA,bool IsUnsigned,Type * ResTy,VectorType * Ty,TTI::TargetCostKind CostKind)2023 InstructionCost getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned, 2024 Type *ResTy, VectorType *Ty, 2025 TTI::TargetCostKind CostKind) { 2026 // Without any native support, this is equivalent to the cost of 2027 // vecreduce.add(ext) or if IsMLA vecreduce.add(mul(ext, ext)) 2028 VectorType *ExtTy = VectorType::get(ResTy, Ty); 2029 unsigned RedCost = thisT()->getArithmeticReductionCost( 2030 Instruction::Add, ExtTy, false, CostKind); 2031 unsigned MulCost = 0; 2032 unsigned ExtCost = thisT()->getCastInstrCost( 2033 IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty, 2034 TTI::CastContextHint::None, CostKind); 2035 if (IsMLA) { 2036 MulCost = 2037 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind); 2038 ExtCost *= 2; 2039 } 2040 2041 return RedCost + MulCost + ExtCost; 2042 } 2043 getVectorSplitCost()2044 unsigned getVectorSplitCost() { return 1; } 2045 2046 /// @} 2047 }; 2048 2049 /// Concrete BasicTTIImpl that can be used if no further customization 2050 /// is needed. 2051 class BasicTTIImpl : public BasicTTIImplBase<BasicTTIImpl> { 2052 using BaseT = BasicTTIImplBase<BasicTTIImpl>; 2053 2054 friend class BasicTTIImplBase<BasicTTIImpl>; 2055 2056 const TargetSubtargetInfo *ST; 2057 const TargetLoweringBase *TLI; 2058 getST()2059 const TargetSubtargetInfo *getST() const { return ST; } getTLI()2060 const TargetLoweringBase *getTLI() const { return TLI; } 2061 2062 public: 2063 explicit BasicTTIImpl(const TargetMachine *TM, const Function &F); 2064 }; 2065 2066 } // end namespace llvm 2067 2068 #endif // LLVM_CODEGEN_BASICTTIIMPL_H 2069