1 //===- BasicTTIImpl.h -------------------------------------------*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 /// \file 10 /// This file provides a helper that implements much of the TTI interface in 11 /// terms of the target-independent code generator and TargetLowering 12 /// interfaces. 13 // 14 //===----------------------------------------------------------------------===// 15 16 #ifndef LLVM_CODEGEN_BASICTTIIMPL_H 17 #define LLVM_CODEGEN_BASICTTIIMPL_H 18 19 #include "llvm/ADT/APInt.h" 20 #include "llvm/ADT/ArrayRef.h" 21 #include "llvm/ADT/BitVector.h" 22 #include "llvm/ADT/SmallPtrSet.h" 23 #include "llvm/ADT/SmallVector.h" 24 #include "llvm/Analysis/LoopInfo.h" 25 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 26 #include "llvm/Analysis/TargetTransformInfo.h" 27 #include "llvm/Analysis/TargetTransformInfoImpl.h" 28 #include "llvm/CodeGen/ISDOpcodes.h" 29 #include "llvm/CodeGen/TargetLowering.h" 30 #include "llvm/CodeGen/TargetSubtargetInfo.h" 31 #include "llvm/CodeGen/ValueTypes.h" 32 #include "llvm/IR/BasicBlock.h" 33 #include "llvm/IR/Constant.h" 34 #include "llvm/IR/Constants.h" 35 #include "llvm/IR/DataLayout.h" 36 #include "llvm/IR/DerivedTypes.h" 37 #include "llvm/IR/InstrTypes.h" 38 #include "llvm/IR/Instruction.h" 39 #include "llvm/IR/Instructions.h" 40 #include "llvm/IR/Intrinsics.h" 41 #include "llvm/IR/Operator.h" 42 #include "llvm/IR/Type.h" 43 #include "llvm/IR/Value.h" 44 #include "llvm/Support/Casting.h" 45 #include "llvm/Support/CommandLine.h" 46 #include "llvm/Support/ErrorHandling.h" 47 #include "llvm/Support/MachineValueType.h" 48 #include "llvm/Support/MathExtras.h" 49 #include "llvm/Target/TargetMachine.h" 50 #include <algorithm> 51 #include <cassert> 52 #include <cstdint> 53 #include <limits> 54 #include <utility> 55 56 namespace llvm { 57 58 class Function; 59 class GlobalValue; 60 class LLVMContext; 61 class ScalarEvolution; 62 class SCEV; 63 class TargetMachine; 64 65 extern cl::opt<unsigned> PartialUnrollingThreshold; 66 67 /// Base class which can be used to help build a TTI implementation. 68 /// 69 /// This class provides as much implementation of the TTI interface as is 70 /// possible using the target independent parts of the code generator. 71 /// 72 /// In order to subclass it, your class must implement a getST() method to 73 /// return the subtarget, and a getTLI() method to return the target lowering. 74 /// We need these methods implemented in the derived class so that this class 75 /// doesn't have to duplicate storage for them. 76 template <typename T> 77 class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> { 78 private: 79 using BaseT = TargetTransformInfoImplCRTPBase<T>; 80 using TTI = TargetTransformInfo; 81 82 /// Helper function to access this as a T. thisT()83 T *thisT() { return static_cast<T *>(this); } 84 85 /// Estimate a cost of Broadcast as an extract and sequence of insert 86 /// operations. getBroadcastShuffleOverhead(FixedVectorType * VTy)87 InstructionCost getBroadcastShuffleOverhead(FixedVectorType *VTy) { 88 InstructionCost Cost = 0; 89 // Broadcast cost is equal to the cost of extracting the zero'th element 90 // plus the cost of inserting it into every element of the result vector. 
91 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, 0); 92 93 for (int i = 0, e = VTy->getNumElements(); i < e; ++i) { 94 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, i); 95 } 96 return Cost; 97 } 98 99 /// Estimate a cost of shuffle as a sequence of extract and insert 100 /// operations. getPermuteShuffleOverhead(FixedVectorType * VTy)101 InstructionCost getPermuteShuffleOverhead(FixedVectorType *VTy) { 102 InstructionCost Cost = 0; 103 // Shuffle cost is equal to the cost of extracting element from its argument 104 // plus the cost of inserting them onto the result vector. 105 106 // e.g. <4 x float> has a mask of <0,5,2,7> i.e we need to extract from 107 // index 0 of first vector, index 1 of second vector,index 2 of first 108 // vector and finally index 3 of second vector and insert them at index 109 // <0,1,2,3> of result vector. 110 for (int i = 0, e = VTy->getNumElements(); i < e; ++i) { 111 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, i); 112 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, i); 113 } 114 return Cost; 115 } 116 117 /// Estimate a cost of subvector extraction as a sequence of extract and 118 /// insert operations. getExtractSubvectorOverhead(VectorType * VTy,int Index,FixedVectorType * SubVTy)119 InstructionCost getExtractSubvectorOverhead(VectorType *VTy, int Index, 120 FixedVectorType *SubVTy) { 121 assert(VTy && SubVTy && 122 "Can only extract subvectors from vectors"); 123 int NumSubElts = SubVTy->getNumElements(); 124 assert((!isa<FixedVectorType>(VTy) || 125 (Index + NumSubElts) <= 126 (int)cast<FixedVectorType>(VTy)->getNumElements()) && 127 "SK_ExtractSubvector index out of range"); 128 129 InstructionCost Cost = 0; 130 // Subvector extraction cost is equal to the cost of extracting element from 131 // the source type plus the cost of inserting them into the result vector 132 // type. 133 for (int i = 0; i != NumSubElts; ++i) { 134 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, 135 i + Index); 136 Cost += 137 thisT()->getVectorInstrCost(Instruction::InsertElement, SubVTy, i); 138 } 139 return Cost; 140 } 141 142 /// Estimate a cost of subvector insertion as a sequence of extract and 143 /// insert operations. getInsertSubvectorOverhead(VectorType * VTy,int Index,FixedVectorType * SubVTy)144 InstructionCost getInsertSubvectorOverhead(VectorType *VTy, int Index, 145 FixedVectorType *SubVTy) { 146 assert(VTy && SubVTy && 147 "Can only insert subvectors into vectors"); 148 int NumSubElts = SubVTy->getNumElements(); 149 assert((!isa<FixedVectorType>(VTy) || 150 (Index + NumSubElts) <= 151 (int)cast<FixedVectorType>(VTy)->getNumElements()) && 152 "SK_InsertSubvector index out of range"); 153 154 InstructionCost Cost = 0; 155 // Subvector insertion cost is equal to the cost of extracting element from 156 // the source type plus the cost of inserting them into the result vector 157 // type. 158 for (int i = 0; i != NumSubElts; ++i) { 159 Cost += 160 thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVTy, i); 161 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, 162 i + Index); 163 } 164 return Cost; 165 } 166 167 /// Local query method delegates up to T which *must* implement this! getST()168 const TargetSubtargetInfo *getST() const { 169 return static_cast<const T *>(this)->getST(); 170 } 171 172 /// Local query method delegates up to T which *must* implement this! 
getTLI()173 const TargetLoweringBase *getTLI() const { 174 return static_cast<const T *>(this)->getTLI(); 175 } 176 getISDIndexedMode(TTI::MemIndexedMode M)177 static ISD::MemIndexedMode getISDIndexedMode(TTI::MemIndexedMode M) { 178 switch (M) { 179 case TTI::MIM_Unindexed: 180 return ISD::UNINDEXED; 181 case TTI::MIM_PreInc: 182 return ISD::PRE_INC; 183 case TTI::MIM_PreDec: 184 return ISD::PRE_DEC; 185 case TTI::MIM_PostInc: 186 return ISD::POST_INC; 187 case TTI::MIM_PostDec: 188 return ISD::POST_DEC; 189 } 190 llvm_unreachable("Unexpected MemIndexedMode"); 191 } 192 getCommonMaskedMemoryOpCost(unsigned Opcode,Type * DataTy,Align Alignment,bool VariableMask,bool IsGatherScatter,TTI::TargetCostKind CostKind)193 InstructionCost getCommonMaskedMemoryOpCost(unsigned Opcode, Type *DataTy, 194 Align Alignment, 195 bool VariableMask, 196 bool IsGatherScatter, 197 TTI::TargetCostKind CostKind) { 198 auto *VT = cast<FixedVectorType>(DataTy); 199 // Assume the target does not have support for gather/scatter operations 200 // and provide a rough estimate. 201 // 202 // First, compute the cost of the individual memory operations. 203 InstructionCost AddrExtractCost = 204 IsGatherScatter 205 ? getVectorInstrCost(Instruction::ExtractElement, 206 FixedVectorType::get( 207 PointerType::get(VT->getElementType(), 0), 208 VT->getNumElements()), 209 -1) 210 : 0; 211 InstructionCost LoadCost = 212 VT->getNumElements() * 213 (AddrExtractCost + 214 getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind)); 215 216 // Next, compute the cost of packing the result in a vector. 217 InstructionCost PackingCost = getScalarizationOverhead( 218 VT, Opcode != Instruction::Store, Opcode == Instruction::Store); 219 220 InstructionCost ConditionalCost = 0; 221 if (VariableMask) { 222 // Compute the cost of conditionally executing the memory operations with 223 // variable masks. This includes extracting the individual conditions, a 224 // branches and PHIs to combine the results. 225 // NOTE: Estimating the cost of conditionally executing the memory 226 // operations accurately is quite difficult and the current solution 227 // provides a very rough estimate only. 
228 ConditionalCost = 229 VT->getNumElements() * 230 (getVectorInstrCost( 231 Instruction::ExtractElement, 232 FixedVectorType::get(Type::getInt1Ty(DataTy->getContext()), 233 VT->getNumElements()), 234 -1) + 235 getCFInstrCost(Instruction::Br, CostKind) + 236 getCFInstrCost(Instruction::PHI, CostKind)); 237 } 238 239 return LoadCost + PackingCost + ConditionalCost; 240 } 241 242 protected: BasicTTIImplBase(const TargetMachine * TM,const DataLayout & DL)243 explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL) 244 : BaseT(DL) {} 245 virtual ~BasicTTIImplBase() = default; 246 247 using TargetTransformInfoImplBase::DL; 248 249 public: 250 /// \name Scalar TTI Implementations 251 /// @{ allowsMisalignedMemoryAccesses(LLVMContext & Context,unsigned BitWidth,unsigned AddressSpace,Align Alignment,bool * Fast)252 bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth, 253 unsigned AddressSpace, Align Alignment, 254 bool *Fast) const { 255 EVT E = EVT::getIntegerVT(Context, BitWidth); 256 return getTLI()->allowsMisalignedMemoryAccesses( 257 E, AddressSpace, Alignment, MachineMemOperand::MONone, Fast); 258 } 259 hasBranchDivergence()260 bool hasBranchDivergence() { return false; } 261 useGPUDivergenceAnalysis()262 bool useGPUDivergenceAnalysis() { return false; } 263 isSourceOfDivergence(const Value * V)264 bool isSourceOfDivergence(const Value *V) { return false; } 265 isAlwaysUniform(const Value * V)266 bool isAlwaysUniform(const Value *V) { return false; } 267 getFlatAddressSpace()268 unsigned getFlatAddressSpace() { 269 // Return an invalid address space. 270 return -1; 271 } 272 collectFlatAddressOperands(SmallVectorImpl<int> & OpIndexes,Intrinsic::ID IID)273 bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes, 274 Intrinsic::ID IID) const { 275 return false; 276 } 277 isNoopAddrSpaceCast(unsigned FromAS,unsigned ToAS)278 bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const { 279 return getTLI()->getTargetMachine().isNoopAddrSpaceCast(FromAS, ToAS); 280 } 281 getAssumedAddrSpace(const Value * V)282 unsigned getAssumedAddrSpace(const Value *V) const { 283 return getTLI()->getTargetMachine().getAssumedAddrSpace(V); 284 } 285 rewriteIntrinsicWithAddressSpace(IntrinsicInst * II,Value * OldV,Value * NewV)286 Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV, 287 Value *NewV) const { 288 return nullptr; 289 } 290 isLegalAddImmediate(int64_t imm)291 bool isLegalAddImmediate(int64_t imm) { 292 return getTLI()->isLegalAddImmediate(imm); 293 } 294 isLegalICmpImmediate(int64_t imm)295 bool isLegalICmpImmediate(int64_t imm) { 296 return getTLI()->isLegalICmpImmediate(imm); 297 } 298 299 bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, 300 bool HasBaseReg, int64_t Scale, 301 unsigned AddrSpace, Instruction *I = nullptr) { 302 TargetLoweringBase::AddrMode AM; 303 AM.BaseGV = BaseGV; 304 AM.BaseOffs = BaseOffset; 305 AM.HasBaseReg = HasBaseReg; 306 AM.Scale = Scale; 307 return getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace, I); 308 } 309 isIndexedLoadLegal(TTI::MemIndexedMode M,Type * Ty,const DataLayout & DL)310 bool isIndexedLoadLegal(TTI::MemIndexedMode M, Type *Ty, 311 const DataLayout &DL) const { 312 EVT VT = getTLI()->getValueType(DL, Ty); 313 return getTLI()->isIndexedLoadLegal(getISDIndexedMode(M), VT); 314 } 315 isIndexedStoreLegal(TTI::MemIndexedMode M,Type * Ty,const DataLayout & DL)316 bool isIndexedStoreLegal(TTI::MemIndexedMode M, Type *Ty, 317 const DataLayout &DL) const { 318 
EVT VT = getTLI()->getValueType(DL, Ty); 319 return getTLI()->isIndexedStoreLegal(getISDIndexedMode(M), VT); 320 } 321 isLSRCostLess(TTI::LSRCost C1,TTI::LSRCost C2)322 bool isLSRCostLess(TTI::LSRCost C1, TTI::LSRCost C2) { 323 return TargetTransformInfoImplBase::isLSRCostLess(C1, C2); 324 } 325 isNumRegsMajorCostOfLSR()326 bool isNumRegsMajorCostOfLSR() { 327 return TargetTransformInfoImplBase::isNumRegsMajorCostOfLSR(); 328 } 329 isProfitableLSRChainElement(Instruction * I)330 bool isProfitableLSRChainElement(Instruction *I) { 331 return TargetTransformInfoImplBase::isProfitableLSRChainElement(I); 332 } 333 getScalingFactorCost(Type * Ty,GlobalValue * BaseGV,int64_t BaseOffset,bool HasBaseReg,int64_t Scale,unsigned AddrSpace)334 InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, 335 int64_t BaseOffset, bool HasBaseReg, 336 int64_t Scale, unsigned AddrSpace) { 337 TargetLoweringBase::AddrMode AM; 338 AM.BaseGV = BaseGV; 339 AM.BaseOffs = BaseOffset; 340 AM.HasBaseReg = HasBaseReg; 341 AM.Scale = Scale; 342 return getTLI()->getScalingFactorCost(DL, AM, Ty, AddrSpace); 343 } 344 isTruncateFree(Type * Ty1,Type * Ty2)345 bool isTruncateFree(Type *Ty1, Type *Ty2) { 346 return getTLI()->isTruncateFree(Ty1, Ty2); 347 } 348 isProfitableToHoist(Instruction * I)349 bool isProfitableToHoist(Instruction *I) { 350 return getTLI()->isProfitableToHoist(I); 351 } 352 useAA()353 bool useAA() const { return getST()->useAA(); } 354 isTypeLegal(Type * Ty)355 bool isTypeLegal(Type *Ty) { 356 EVT VT = getTLI()->getValueType(DL, Ty); 357 return getTLI()->isTypeLegal(VT); 358 } 359 getRegUsageForType(Type * Ty)360 InstructionCost getRegUsageForType(Type *Ty) { 361 InstructionCost Val = getTLI()->getTypeLegalizationCost(DL, Ty).first; 362 assert(Val >= 0 && "Negative cost!"); 363 return Val; 364 } 365 getGEPCost(Type * PointeeType,const Value * Ptr,ArrayRef<const Value * > Operands,TTI::TargetCostKind CostKind)366 InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, 367 ArrayRef<const Value *> Operands, 368 TTI::TargetCostKind CostKind) { 369 return BaseT::getGEPCost(PointeeType, Ptr, Operands, CostKind); 370 } 371 getEstimatedNumberOfCaseClusters(const SwitchInst & SI,unsigned & JumpTableSize,ProfileSummaryInfo * PSI,BlockFrequencyInfo * BFI)372 unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI, 373 unsigned &JumpTableSize, 374 ProfileSummaryInfo *PSI, 375 BlockFrequencyInfo *BFI) { 376 /// Try to find the estimated number of clusters. Note that the number of 377 /// clusters identified in this function could be different from the actual 378 /// numbers found in lowering. This function ignore switches that are 379 /// lowered with a mix of jump table / bit test / BTree. This function was 380 /// initially intended to be used when estimating the cost of switch in 381 /// inline cost heuristic, but it's a generic cost model to be used in other 382 /// places (e.g., in loop unrolling). 383 unsigned N = SI.getNumCases(); 384 const TargetLoweringBase *TLI = getTLI(); 385 const DataLayout &DL = this->getDataLayout(); 386 387 JumpTableSize = 0; 388 bool IsJTAllowed = TLI->areJTsAllowed(SI.getParent()->getParent()); 389 390 // Early exit if both a jump table and bit test are not allowed. 
391 if (N < 1 || (!IsJTAllowed && DL.getIndexSizeInBits(0u) < N)) 392 return N; 393 394 APInt MaxCaseVal = SI.case_begin()->getCaseValue()->getValue(); 395 APInt MinCaseVal = MaxCaseVal; 396 for (auto CI : SI.cases()) { 397 const APInt &CaseVal = CI.getCaseValue()->getValue(); 398 if (CaseVal.sgt(MaxCaseVal)) 399 MaxCaseVal = CaseVal; 400 if (CaseVal.slt(MinCaseVal)) 401 MinCaseVal = CaseVal; 402 } 403 404 // Check if suitable for a bit test 405 if (N <= DL.getIndexSizeInBits(0u)) { 406 SmallPtrSet<const BasicBlock *, 4> Dests; 407 for (auto I : SI.cases()) 408 Dests.insert(I.getCaseSuccessor()); 409 410 if (TLI->isSuitableForBitTests(Dests.size(), N, MinCaseVal, MaxCaseVal, 411 DL)) 412 return 1; 413 } 414 415 // Check if suitable for a jump table. 416 if (IsJTAllowed) { 417 if (N < 2 || N < TLI->getMinimumJumpTableEntries()) 418 return N; 419 uint64_t Range = 420 (MaxCaseVal - MinCaseVal) 421 .getLimitedValue(std::numeric_limits<uint64_t>::max() - 1) + 1; 422 // Check whether a range of clusters is dense enough for a jump table 423 if (TLI->isSuitableForJumpTable(&SI, N, Range, PSI, BFI)) { 424 JumpTableSize = Range; 425 return 1; 426 } 427 } 428 return N; 429 } 430 shouldBuildLookupTables()431 bool shouldBuildLookupTables() { 432 const TargetLoweringBase *TLI = getTLI(); 433 return TLI->isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) || 434 TLI->isOperationLegalOrCustom(ISD::BRIND, MVT::Other); 435 } 436 shouldBuildRelLookupTables()437 bool shouldBuildRelLookupTables() const { 438 const TargetMachine &TM = getTLI()->getTargetMachine(); 439 // If non-PIC mode, do not generate a relative lookup table. 440 if (!TM.isPositionIndependent()) 441 return false; 442 443 /// Relative lookup table entries consist of 32-bit offsets. 444 /// Do not generate relative lookup tables for large code models 445 /// in 64-bit achitectures where 32-bit offsets might not be enough. 446 if (TM.getCodeModel() == CodeModel::Medium || 447 TM.getCodeModel() == CodeModel::Large) 448 return false; 449 450 Triple TargetTriple = TM.getTargetTriple(); 451 if (!TargetTriple.isArch64Bit()) 452 return false; 453 454 // TODO: Triggers issues on aarch64 on darwin, so temporarily disable it 455 // there. 456 if (TargetTriple.getArch() == Triple::aarch64 && TargetTriple.isOSDarwin()) 457 return false; 458 459 return true; 460 } 461 haveFastSqrt(Type * Ty)462 bool haveFastSqrt(Type *Ty) { 463 const TargetLoweringBase *TLI = getTLI(); 464 EVT VT = TLI->getValueType(DL, Ty); 465 return TLI->isTypeLegal(VT) && 466 TLI->isOperationLegalOrCustom(ISD::FSQRT, VT); 467 } 468 isFCmpOrdCheaperThanFCmpZero(Type * Ty)469 bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) { 470 return true; 471 } 472 getFPOpCost(Type * Ty)473 InstructionCost getFPOpCost(Type *Ty) { 474 // Check whether FADD is available, as a proxy for floating-point in 475 // general. 
476 const TargetLoweringBase *TLI = getTLI(); 477 EVT VT = TLI->getValueType(DL, Ty); 478 if (TLI->isOperationLegalOrCustomOrPromote(ISD::FADD, VT)) 479 return TargetTransformInfo::TCC_Basic; 480 return TargetTransformInfo::TCC_Expensive; 481 } 482 getInliningThresholdMultiplier()483 unsigned getInliningThresholdMultiplier() { return 1; } adjustInliningThreshold(const CallBase * CB)484 unsigned adjustInliningThreshold(const CallBase *CB) { return 0; } 485 getInlinerVectorBonusPercent()486 int getInlinerVectorBonusPercent() { return 150; } 487 getUnrollingPreferences(Loop * L,ScalarEvolution & SE,TTI::UnrollingPreferences & UP,OptimizationRemarkEmitter * ORE)488 void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, 489 TTI::UnrollingPreferences &UP, 490 OptimizationRemarkEmitter *ORE) { 491 // This unrolling functionality is target independent, but to provide some 492 // motivation for its intended use, for x86: 493 494 // According to the Intel 64 and IA-32 Architectures Optimization Reference 495 // Manual, Intel Core models and later have a loop stream detector (and 496 // associated uop queue) that can benefit from partial unrolling. 497 // The relevant requirements are: 498 // - The loop must have no more than 4 (8 for Nehalem and later) branches 499 // taken, and none of them may be calls. 500 // - The loop can have no more than 18 (28 for Nehalem and later) uops. 501 502 // According to the Software Optimization Guide for AMD Family 15h 503 // Processors, models 30h-4fh (Steamroller and later) have a loop predictor 504 // and loop buffer which can benefit from partial unrolling. 505 // The relevant requirements are: 506 // - The loop must have fewer than 16 branches 507 // - The loop must have less than 40 uops in all executed loop branches 508 509 // The number of taken branches in a loop is hard to estimate here, and 510 // benchmarking has revealed that it is better not to be conservative when 511 // estimating the branch count. As a result, we'll ignore the branch limits 512 // until someone finds a case where it matters in practice. 513 514 unsigned MaxOps; 515 const TargetSubtargetInfo *ST = getST(); 516 if (PartialUnrollingThreshold.getNumOccurrences() > 0) 517 MaxOps = PartialUnrollingThreshold; 518 else if (ST->getSchedModel().LoopMicroOpBufferSize > 0) 519 MaxOps = ST->getSchedModel().LoopMicroOpBufferSize; 520 else 521 return; 522 523 // Scan the loop: don't unroll loops with calls. 524 for (BasicBlock *BB : L->blocks()) { 525 for (Instruction &I : *BB) { 526 if (isa<CallInst>(I) || isa<InvokeInst>(I)) { 527 if (const Function *F = cast<CallBase>(I).getCalledFunction()) { 528 if (!thisT()->isLoweredToCall(F)) 529 continue; 530 } 531 532 if (ORE) { 533 ORE->emit([&]() { 534 return OptimizationRemark("TTI", "DontUnroll", L->getStartLoc(), 535 L->getHeader()) 536 << "advising against unrolling the loop because it " 537 "contains a " 538 << ore::NV("Call", &I); 539 }); 540 } 541 return; 542 } 543 } 544 } 545 546 // Enable runtime and partial unrolling up to the specified size. 547 // Enable using trip count upper bound to unroll loops. 548 UP.Partial = UP.Runtime = UP.UpperBound = true; 549 UP.PartialThreshold = MaxOps; 550 551 // Avoid unrolling when optimizing for size. 552 UP.OptSizeThreshold = 0; 553 UP.PartialOptSizeThreshold = 0; 554 555 // Set number of instructions optimized when "back edge" 556 // becomes "fall through" to default value of 2. 
557 UP.BEInsns = 2; 558 } 559 getPeelingPreferences(Loop * L,ScalarEvolution & SE,TTI::PeelingPreferences & PP)560 void getPeelingPreferences(Loop *L, ScalarEvolution &SE, 561 TTI::PeelingPreferences &PP) { 562 PP.PeelCount = 0; 563 PP.AllowPeeling = true; 564 PP.AllowLoopNestsPeeling = false; 565 PP.PeelProfiledIterations = true; 566 } 567 isHardwareLoopProfitable(Loop * L,ScalarEvolution & SE,AssumptionCache & AC,TargetLibraryInfo * LibInfo,HardwareLoopInfo & HWLoopInfo)568 bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, 569 AssumptionCache &AC, 570 TargetLibraryInfo *LibInfo, 571 HardwareLoopInfo &HWLoopInfo) { 572 return BaseT::isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo); 573 } 574 preferPredicateOverEpilogue(Loop * L,LoopInfo * LI,ScalarEvolution & SE,AssumptionCache & AC,TargetLibraryInfo * TLI,DominatorTree * DT,const LoopAccessInfo * LAI)575 bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE, 576 AssumptionCache &AC, TargetLibraryInfo *TLI, 577 DominatorTree *DT, 578 const LoopAccessInfo *LAI) { 579 return BaseT::preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LAI); 580 } 581 emitGetActiveLaneMask()582 bool emitGetActiveLaneMask() { 583 return BaseT::emitGetActiveLaneMask(); 584 } 585 instCombineIntrinsic(InstCombiner & IC,IntrinsicInst & II)586 Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC, 587 IntrinsicInst &II) { 588 return BaseT::instCombineIntrinsic(IC, II); 589 } 590 simplifyDemandedUseBitsIntrinsic(InstCombiner & IC,IntrinsicInst & II,APInt DemandedMask,KnownBits & Known,bool & KnownBitsComputed)591 Optional<Value *> simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, 592 IntrinsicInst &II, 593 APInt DemandedMask, 594 KnownBits &Known, 595 bool &KnownBitsComputed) { 596 return BaseT::simplifyDemandedUseBitsIntrinsic(IC, II, DemandedMask, Known, 597 KnownBitsComputed); 598 } 599 simplifyDemandedVectorEltsIntrinsic(InstCombiner & IC,IntrinsicInst & II,APInt DemandedElts,APInt & UndefElts,APInt & UndefElts2,APInt & UndefElts3,std::function<void (Instruction *,unsigned,APInt,APInt &)> SimplifyAndSetOp)600 Optional<Value *> simplifyDemandedVectorEltsIntrinsic( 601 InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, 602 APInt &UndefElts2, APInt &UndefElts3, 603 std::function<void(Instruction *, unsigned, APInt, APInt &)> 604 SimplifyAndSetOp) { 605 return BaseT::simplifyDemandedVectorEltsIntrinsic( 606 IC, II, DemandedElts, UndefElts, UndefElts2, UndefElts3, 607 SimplifyAndSetOp); 608 } 609 getInstructionLatency(const Instruction * I)610 InstructionCost getInstructionLatency(const Instruction *I) { 611 if (isa<LoadInst>(I)) 612 return getST()->getSchedModel().DefaultLoadLatency; 613 614 return BaseT::getInstructionLatency(I); 615 } 616 617 virtual Optional<unsigned> getCacheSize(TargetTransformInfo::CacheLevel Level)618 getCacheSize(TargetTransformInfo::CacheLevel Level) const { 619 return Optional<unsigned>( 620 getST()->getCacheSize(static_cast<unsigned>(Level))); 621 } 622 623 virtual Optional<unsigned> getCacheAssociativity(TargetTransformInfo::CacheLevel Level)624 getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const { 625 Optional<unsigned> TargetResult = 626 getST()->getCacheAssociativity(static_cast<unsigned>(Level)); 627 628 if (TargetResult) 629 return TargetResult; 630 631 return BaseT::getCacheAssociativity(Level); 632 } 633 getCacheLineSize()634 virtual unsigned getCacheLineSize() const { 635 return getST()->getCacheLineSize(); 636 } 637 getPrefetchDistance()638 
virtual unsigned getPrefetchDistance() const { 639 return getST()->getPrefetchDistance(); 640 } 641 getMinPrefetchStride(unsigned NumMemAccesses,unsigned NumStridedMemAccesses,unsigned NumPrefetches,bool HasCall)642 virtual unsigned getMinPrefetchStride(unsigned NumMemAccesses, 643 unsigned NumStridedMemAccesses, 644 unsigned NumPrefetches, 645 bool HasCall) const { 646 return getST()->getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses, 647 NumPrefetches, HasCall); 648 } 649 getMaxPrefetchIterationsAhead()650 virtual unsigned getMaxPrefetchIterationsAhead() const { 651 return getST()->getMaxPrefetchIterationsAhead(); 652 } 653 enableWritePrefetching()654 virtual bool enableWritePrefetching() const { 655 return getST()->enableWritePrefetching(); 656 } 657 658 /// @} 659 660 /// \name Vector TTI Implementations 661 /// @{ 662 getRegisterBitWidth(TargetTransformInfo::RegisterKind K)663 TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const { 664 return TypeSize::getFixed(32); 665 } 666 getMaxVScale()667 Optional<unsigned> getMaxVScale() const { return None; } 668 669 /// Estimate the overhead of scalarizing an instruction. Insert and Extract 670 /// are set if the demanded result elements need to be inserted and/or 671 /// extracted from vectors. getScalarizationOverhead(VectorType * InTy,const APInt & DemandedElts,bool Insert,bool Extract)672 InstructionCost getScalarizationOverhead(VectorType *InTy, 673 const APInt &DemandedElts, 674 bool Insert, bool Extract) { 675 /// FIXME: a bitfield is not a reasonable abstraction for talking about 676 /// which elements are needed from a scalable vector 677 auto *Ty = cast<FixedVectorType>(InTy); 678 679 assert(DemandedElts.getBitWidth() == Ty->getNumElements() && 680 "Vector size mismatch"); 681 682 InstructionCost Cost = 0; 683 684 for (int i = 0, e = Ty->getNumElements(); i < e; ++i) { 685 if (!DemandedElts[i]) 686 continue; 687 if (Insert) 688 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, Ty, i); 689 if (Extract) 690 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, i); 691 } 692 693 return Cost; 694 } 695 696 /// Helper wrapper for the DemandedElts variant of getScalarizationOverhead. getScalarizationOverhead(VectorType * InTy,bool Insert,bool Extract)697 InstructionCost getScalarizationOverhead(VectorType *InTy, bool Insert, 698 bool Extract) { 699 auto *Ty = cast<FixedVectorType>(InTy); 700 701 APInt DemandedElts = APInt::getAllOnes(Ty->getNumElements()); 702 return thisT()->getScalarizationOverhead(Ty, DemandedElts, Insert, Extract); 703 } 704 705 /// Estimate the overhead of scalarizing an instructions unique 706 /// non-constant operands. The (potentially vector) types to use for each of 707 /// argument are passes via Tys. getOperandsScalarizationOverhead(ArrayRef<const Value * > Args,ArrayRef<Type * > Tys)708 InstructionCost getOperandsScalarizationOverhead(ArrayRef<const Value *> Args, 709 ArrayRef<Type *> Tys) { 710 assert(Args.size() == Tys.size() && "Expected matching Args and Tys"); 711 712 InstructionCost Cost = 0; 713 SmallPtrSet<const Value*, 4> UniqueOperands; 714 for (int I = 0, E = Args.size(); I != E; I++) { 715 // Disregard things like metadata arguments. 
716 const Value *A = Args[I]; 717 Type *Ty = Tys[I]; 718 if (!Ty->isIntOrIntVectorTy() && !Ty->isFPOrFPVectorTy() && 719 !Ty->isPtrOrPtrVectorTy()) 720 continue; 721 722 if (!isa<Constant>(A) && UniqueOperands.insert(A).second) { 723 if (auto *VecTy = dyn_cast<VectorType>(Ty)) 724 Cost += getScalarizationOverhead(VecTy, false, true); 725 } 726 } 727 728 return Cost; 729 } 730 731 /// Estimate the overhead of scalarizing the inputs and outputs of an 732 /// instruction, with return type RetTy and arguments Args of type Tys. If 733 /// Args are unknown (empty), then the cost associated with one argument is 734 /// added as a heuristic. getScalarizationOverhead(VectorType * RetTy,ArrayRef<const Value * > Args,ArrayRef<Type * > Tys)735 InstructionCost getScalarizationOverhead(VectorType *RetTy, 736 ArrayRef<const Value *> Args, 737 ArrayRef<Type *> Tys) { 738 InstructionCost Cost = getScalarizationOverhead(RetTy, true, false); 739 if (!Args.empty()) 740 Cost += getOperandsScalarizationOverhead(Args, Tys); 741 else 742 // When no information on arguments is provided, we add the cost 743 // associated with one argument as a heuristic. 744 Cost += getScalarizationOverhead(RetTy, false, true); 745 746 return Cost; 747 } 748 getMaxInterleaveFactor(unsigned VF)749 unsigned getMaxInterleaveFactor(unsigned VF) { return 1; } 750 751 InstructionCost getArithmeticInstrCost( 752 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, 753 TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, 754 TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, 755 TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, 756 TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, 757 ArrayRef<const Value *> Args = ArrayRef<const Value *>(), 758 const Instruction *CxtI = nullptr) { 759 // Check if any of the operands are vector operands. 760 const TargetLoweringBase *TLI = getTLI(); 761 int ISD = TLI->InstructionOpcodeToISD(Opcode); 762 assert(ISD && "Invalid opcode"); 763 764 // TODO: Handle more cost kinds. 765 if (CostKind != TTI::TCK_RecipThroughput) 766 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, 767 Opd1Info, Opd2Info, 768 Opd1PropInfo, Opd2PropInfo, 769 Args, CxtI); 770 771 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); 772 773 bool IsFloat = Ty->isFPOrFPVectorTy(); 774 // Assume that floating point arithmetic operations cost twice as much as 775 // integer operations. 776 InstructionCost OpCost = (IsFloat ? 2 : 1); 777 778 if (TLI->isOperationLegalOrPromote(ISD, LT.second)) { 779 // The operation is legal. Assume it costs 1. 780 // TODO: Once we have extract/insert subvector cost we need to use them. 781 return LT.first * OpCost; 782 } 783 784 if (!TLI->isOperationExpand(ISD, LT.second)) { 785 // If the operation is custom lowered, then assume that the code is twice 786 // as expensive. 787 return LT.first * 2 * OpCost; 788 } 789 790 // An 'Expand' of URem and SRem is special because it may default 791 // to expanding the operation into a sequence of sub-operations 792 // i.e. X % Y -> X-(X/Y)*Y. 793 if (ISD == ISD::UREM || ISD == ISD::SREM) { 794 bool IsSigned = ISD == ISD::SREM; 795 if (TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, 796 LT.second) || 797 TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIV : ISD::UDIV, 798 LT.second)) { 799 unsigned DivOpc = IsSigned ? 
Instruction::SDiv : Instruction::UDiv; 800 InstructionCost DivCost = thisT()->getArithmeticInstrCost( 801 DivOpc, Ty, CostKind, Opd1Info, Opd2Info, Opd1PropInfo, 802 Opd2PropInfo); 803 InstructionCost MulCost = 804 thisT()->getArithmeticInstrCost(Instruction::Mul, Ty, CostKind); 805 InstructionCost SubCost = 806 thisT()->getArithmeticInstrCost(Instruction::Sub, Ty, CostKind); 807 return DivCost + MulCost + SubCost; 808 } 809 } 810 811 // We cannot scalarize scalable vectors, so return Invalid. 812 if (isa<ScalableVectorType>(Ty)) 813 return InstructionCost::getInvalid(); 814 815 // Else, assume that we need to scalarize this op. 816 // TODO: If one of the types get legalized by splitting, handle this 817 // similarly to what getCastInstrCost() does. 818 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) { 819 InstructionCost Cost = thisT()->getArithmeticInstrCost( 820 Opcode, VTy->getScalarType(), CostKind, Opd1Info, Opd2Info, 821 Opd1PropInfo, Opd2PropInfo, Args, CxtI); 822 // Return the cost of multiple scalar invocation plus the cost of 823 // inserting and extracting the values. 824 SmallVector<Type *> Tys(Args.size(), Ty); 825 return getScalarizationOverhead(VTy, Args, Tys) + 826 VTy->getNumElements() * Cost; 827 } 828 829 // We don't know anything about this scalar instruction. 830 return OpCost; 831 } 832 improveShuffleKindFromMask(TTI::ShuffleKind Kind,ArrayRef<int> Mask)833 TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, 834 ArrayRef<int> Mask) const { 835 int Limit = Mask.size() * 2; 836 if (Mask.empty() || 837 // Extra check required by isSingleSourceMaskImpl function (called by 838 // ShuffleVectorInst::isSingleSourceMask). 839 any_of(Mask, [Limit](int I) { return I >= Limit; })) 840 return Kind; 841 switch (Kind) { 842 case TTI::SK_PermuteSingleSrc: 843 if (ShuffleVectorInst::isReverseMask(Mask)) 844 return TTI::SK_Reverse; 845 if (ShuffleVectorInst::isZeroEltSplatMask(Mask)) 846 return TTI::SK_Broadcast; 847 break; 848 case TTI::SK_PermuteTwoSrc: 849 if (ShuffleVectorInst::isSelectMask(Mask)) 850 return TTI::SK_Select; 851 if (ShuffleVectorInst::isTransposeMask(Mask)) 852 return TTI::SK_Transpose; 853 break; 854 case TTI::SK_Select: 855 case TTI::SK_Reverse: 856 case TTI::SK_Broadcast: 857 case TTI::SK_Transpose: 858 case TTI::SK_InsertSubvector: 859 case TTI::SK_ExtractSubvector: 860 case TTI::SK_Splice: 861 break; 862 } 863 return Kind; 864 } 865 getShuffleCost(TTI::ShuffleKind Kind,VectorType * Tp,ArrayRef<int> Mask,int Index,VectorType * SubTp)866 InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, 867 ArrayRef<int> Mask, int Index, 868 VectorType *SubTp) { 869 870 switch (improveShuffleKindFromMask(Kind, Mask)) { 871 case TTI::SK_Broadcast: 872 return getBroadcastShuffleOverhead(cast<FixedVectorType>(Tp)); 873 case TTI::SK_Select: 874 case TTI::SK_Splice: 875 case TTI::SK_Reverse: 876 case TTI::SK_Transpose: 877 case TTI::SK_PermuteSingleSrc: 878 case TTI::SK_PermuteTwoSrc: 879 return getPermuteShuffleOverhead(cast<FixedVectorType>(Tp)); 880 case TTI::SK_ExtractSubvector: 881 return getExtractSubvectorOverhead(Tp, Index, 882 cast<FixedVectorType>(SubTp)); 883 case TTI::SK_InsertSubvector: 884 return getInsertSubvectorOverhead(Tp, Index, 885 cast<FixedVectorType>(SubTp)); 886 } 887 llvm_unreachable("Unknown TTI::ShuffleKind"); 888 } 889 890 InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, 891 TTI::CastContextHint CCH, 892 TTI::TargetCostKind CostKind, 893 const Instruction *I = nullptr) { 894 if 
(BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I) == 0) 895 return 0; 896 897 const TargetLoweringBase *TLI = getTLI(); 898 int ISD = TLI->InstructionOpcodeToISD(Opcode); 899 assert(ISD && "Invalid opcode"); 900 std::pair<InstructionCost, MVT> SrcLT = 901 TLI->getTypeLegalizationCost(DL, Src); 902 std::pair<InstructionCost, MVT> DstLT = 903 TLI->getTypeLegalizationCost(DL, Dst); 904 905 TypeSize SrcSize = SrcLT.second.getSizeInBits(); 906 TypeSize DstSize = DstLT.second.getSizeInBits(); 907 bool IntOrPtrSrc = Src->isIntegerTy() || Src->isPointerTy(); 908 bool IntOrPtrDst = Dst->isIntegerTy() || Dst->isPointerTy(); 909 910 switch (Opcode) { 911 default: 912 break; 913 case Instruction::Trunc: 914 // Check for NOOP conversions. 915 if (TLI->isTruncateFree(SrcLT.second, DstLT.second)) 916 return 0; 917 LLVM_FALLTHROUGH; 918 case Instruction::BitCast: 919 // Bitcast between types that are legalized to the same type are free and 920 // assume int to/from ptr of the same size is also free. 921 if (SrcLT.first == DstLT.first && IntOrPtrSrc == IntOrPtrDst && 922 SrcSize == DstSize) 923 return 0; 924 break; 925 case Instruction::FPExt: 926 if (I && getTLI()->isExtFree(I)) 927 return 0; 928 break; 929 case Instruction::ZExt: 930 if (TLI->isZExtFree(SrcLT.second, DstLT.second)) 931 return 0; 932 LLVM_FALLTHROUGH; 933 case Instruction::SExt: 934 if (I && getTLI()->isExtFree(I)) 935 return 0; 936 937 // If this is a zext/sext of a load, return 0 if the corresponding 938 // extending load exists on target and the result type is legal. 939 if (CCH == TTI::CastContextHint::Normal) { 940 EVT ExtVT = EVT::getEVT(Dst); 941 EVT LoadVT = EVT::getEVT(Src); 942 unsigned LType = 943 ((Opcode == Instruction::ZExt) ? ISD::ZEXTLOAD : ISD::SEXTLOAD); 944 if (DstLT.first == SrcLT.first && 945 TLI->isLoadExtLegal(LType, ExtVT, LoadVT)) 946 return 0; 947 } 948 break; 949 case Instruction::AddrSpaceCast: 950 if (TLI->isFreeAddrSpaceCast(Src->getPointerAddressSpace(), 951 Dst->getPointerAddressSpace())) 952 return 0; 953 break; 954 } 955 956 auto *SrcVTy = dyn_cast<VectorType>(Src); 957 auto *DstVTy = dyn_cast<VectorType>(Dst); 958 959 // If the cast is marked as legal (or promote) then assume low cost. 960 if (SrcLT.first == DstLT.first && 961 TLI->isOperationLegalOrPromote(ISD, DstLT.second)) 962 return SrcLT.first; 963 964 // Handle scalar conversions. 965 if (!SrcVTy && !DstVTy) { 966 // Just check the op cost. If the operation is legal then assume it costs 967 // 1. 968 if (!TLI->isOperationExpand(ISD, DstLT.second)) 969 return 1; 970 971 // Assume that illegal scalar instruction are expensive. 972 return 4; 973 } 974 975 // Check vector-to-vector casts. 976 if (DstVTy && SrcVTy) { 977 // If the cast is between same-sized registers, then the check is simple. 978 if (SrcLT.first == DstLT.first && SrcSize == DstSize) { 979 980 // Assume that Zext is done using AND. 981 if (Opcode == Instruction::ZExt) 982 return SrcLT.first; 983 984 // Assume that sext is done using SHL and SRA. 985 if (Opcode == Instruction::SExt) 986 return SrcLT.first * 2; 987 988 // Just check the op cost. If the operation is legal then assume it 989 // costs 990 // 1 and multiply by the type-legalization overhead. 991 if (!TLI->isOperationExpand(ISD, DstLT.second)) 992 return SrcLT.first * 1; 993 } 994 995 // If we are legalizing by splitting, query the concrete TTI for the cost 996 // of casting the original vector twice. We also need to factor in the 997 // cost of the split itself. 
Count that as 1, to be consistent with 998 // TLI->getTypeLegalizationCost(). 999 bool SplitSrc = 1000 TLI->getTypeAction(Src->getContext(), TLI->getValueType(DL, Src)) == 1001 TargetLowering::TypeSplitVector; 1002 bool SplitDst = 1003 TLI->getTypeAction(Dst->getContext(), TLI->getValueType(DL, Dst)) == 1004 TargetLowering::TypeSplitVector; 1005 if ((SplitSrc || SplitDst) && SrcVTy->getElementCount().isVector() && 1006 DstVTy->getElementCount().isVector()) { 1007 Type *SplitDstTy = VectorType::getHalfElementsVectorType(DstVTy); 1008 Type *SplitSrcTy = VectorType::getHalfElementsVectorType(SrcVTy); 1009 T *TTI = static_cast<T *>(this); 1010 // If both types need to be split then the split is free. 1011 InstructionCost SplitCost = 1012 (!SplitSrc || !SplitDst) ? TTI->getVectorSplitCost() : 0; 1013 return SplitCost + 1014 (2 * TTI->getCastInstrCost(Opcode, SplitDstTy, SplitSrcTy, CCH, 1015 CostKind, I)); 1016 } 1017 1018 // Scalarization cost is Invalid, can't assume any num elements. 1019 if (isa<ScalableVectorType>(DstVTy)) 1020 return InstructionCost::getInvalid(); 1021 1022 // In other cases where the source or destination are illegal, assume 1023 // the operation will get scalarized. 1024 unsigned Num = cast<FixedVectorType>(DstVTy)->getNumElements(); 1025 InstructionCost Cost = thisT()->getCastInstrCost( 1026 Opcode, Dst->getScalarType(), Src->getScalarType(), CCH, CostKind, I); 1027 1028 // Return the cost of multiple scalar invocation plus the cost of 1029 // inserting and extracting the values. 1030 return getScalarizationOverhead(DstVTy, true, true) + Num * Cost; 1031 } 1032 1033 // We already handled vector-to-vector and scalar-to-scalar conversions. 1034 // This 1035 // is where we handle bitcast between vectors and scalars. We need to assume 1036 // that the conversion is scalarized in one way or another. 1037 if (Opcode == Instruction::BitCast) { 1038 // Illegal bitcasts are done by storing and loading from a stack slot. 1039 return (SrcVTy ? getScalarizationOverhead(SrcVTy, false, true) : 0) + 1040 (DstVTy ? getScalarizationOverhead(DstVTy, true, false) : 0); 1041 } 1042 1043 llvm_unreachable("Unhandled cast"); 1044 } 1045 getExtractWithExtendCost(unsigned Opcode,Type * Dst,VectorType * VecTy,unsigned Index)1046 InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, 1047 VectorType *VecTy, unsigned Index) { 1048 return thisT()->getVectorInstrCost(Instruction::ExtractElement, VecTy, 1049 Index) + 1050 thisT()->getCastInstrCost(Opcode, Dst, VecTy->getElementType(), 1051 TTI::CastContextHint::None, 1052 TTI::TCK_RecipThroughput); 1053 } 1054 1055 InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, 1056 const Instruction *I = nullptr) { 1057 return BaseT::getCFInstrCost(Opcode, CostKind, I); 1058 } 1059 1060 InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, 1061 CmpInst::Predicate VecPred, 1062 TTI::TargetCostKind CostKind, 1063 const Instruction *I = nullptr) { 1064 const TargetLoweringBase *TLI = getTLI(); 1065 int ISD = TLI->InstructionOpcodeToISD(Opcode); 1066 assert(ISD && "Invalid opcode"); 1067 1068 // TODO: Handle other cost kinds. 1069 if (CostKind != TTI::TCK_RecipThroughput) 1070 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, 1071 I); 1072 1073 // Selects on vectors are actually vector selects. 
1074 if (ISD == ISD::SELECT) { 1075 assert(CondTy && "CondTy must exist"); 1076 if (CondTy->isVectorTy()) 1077 ISD = ISD::VSELECT; 1078 } 1079 std::pair<InstructionCost, MVT> LT = 1080 TLI->getTypeLegalizationCost(DL, ValTy); 1081 1082 if (!(ValTy->isVectorTy() && !LT.second.isVector()) && 1083 !TLI->isOperationExpand(ISD, LT.second)) { 1084 // The operation is legal. Assume it costs 1. Multiply 1085 // by the type-legalization overhead. 1086 return LT.first * 1; 1087 } 1088 1089 // Otherwise, assume that the cast is scalarized. 1090 // TODO: If one of the types get legalized by splitting, handle this 1091 // similarly to what getCastInstrCost() does. 1092 if (auto *ValVTy = dyn_cast<VectorType>(ValTy)) { 1093 unsigned Num = cast<FixedVectorType>(ValVTy)->getNumElements(); 1094 if (CondTy) 1095 CondTy = CondTy->getScalarType(); 1096 InstructionCost Cost = thisT()->getCmpSelInstrCost( 1097 Opcode, ValVTy->getScalarType(), CondTy, VecPred, CostKind, I); 1098 1099 // Return the cost of multiple scalar invocation plus the cost of 1100 // inserting and extracting the values. 1101 return getScalarizationOverhead(ValVTy, true, false) + Num * Cost; 1102 } 1103 1104 // Unknown scalar opcode. 1105 return 1; 1106 } 1107 getVectorInstrCost(unsigned Opcode,Type * Val,unsigned Index)1108 InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, 1109 unsigned Index) { 1110 std::pair<InstructionCost, MVT> LT = 1111 getTLI()->getTypeLegalizationCost(DL, Val->getScalarType()); 1112 1113 return LT.first; 1114 } 1115 1116 InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, 1117 MaybeAlign Alignment, unsigned AddressSpace, 1118 TTI::TargetCostKind CostKind, 1119 const Instruction *I = nullptr) { 1120 assert(!Src->isVoidTy() && "Invalid type"); 1121 // Assume types, such as structs, are expensive. 1122 if (getTLI()->getValueType(DL, Src, true) == MVT::Other) 1123 return 4; 1124 std::pair<InstructionCost, MVT> LT = 1125 getTLI()->getTypeLegalizationCost(DL, Src); 1126 1127 // Assuming that all loads of legal types cost 1. 1128 InstructionCost Cost = LT.first; 1129 if (CostKind != TTI::TCK_RecipThroughput) 1130 return Cost; 1131 1132 if (Src->isVectorTy() && 1133 // In practice it's not currently possible to have a change in lane 1134 // length for extending loads or truncating stores so both types should 1135 // have the same scalable property. 1136 TypeSize::isKnownLT(Src->getPrimitiveSizeInBits(), 1137 LT.second.getSizeInBits())) { 1138 // This is a vector load that legalizes to a larger type than the vector 1139 // itself. Unless the corresponding extending load or truncating store is 1140 // legal, then this will scalarize. 1141 TargetLowering::LegalizeAction LA = TargetLowering::Expand; 1142 EVT MemVT = getTLI()->getValueType(DL, Src); 1143 if (Opcode == Instruction::Store) 1144 LA = getTLI()->getTruncStoreAction(LT.second, MemVT); 1145 else 1146 LA = getTLI()->getLoadExtAction(ISD::EXTLOAD, LT.second, MemVT); 1147 1148 if (LA != TargetLowering::Legal && LA != TargetLowering::Custom) { 1149 // This is a vector load/store for some illegal type that is scalarized. 1150 // We must account for the cost of building or decomposing the vector. 
1151 Cost += getScalarizationOverhead(cast<VectorType>(Src), 1152 Opcode != Instruction::Store, 1153 Opcode == Instruction::Store); 1154 } 1155 } 1156 1157 return Cost; 1158 } 1159 getMaskedMemoryOpCost(unsigned Opcode,Type * DataTy,Align Alignment,unsigned AddressSpace,TTI::TargetCostKind CostKind)1160 InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy, 1161 Align Alignment, unsigned AddressSpace, 1162 TTI::TargetCostKind CostKind) { 1163 return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, true, false, 1164 CostKind); 1165 } 1166 1167 InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, 1168 const Value *Ptr, bool VariableMask, 1169 Align Alignment, 1170 TTI::TargetCostKind CostKind, 1171 const Instruction *I = nullptr) { 1172 return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, VariableMask, 1173 true, CostKind); 1174 } 1175 1176 InstructionCost getInterleavedMemoryOpCost( 1177 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, 1178 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, 1179 bool UseMaskForCond = false, bool UseMaskForGaps = false) { 1180 auto *VT = cast<FixedVectorType>(VecTy); 1181 1182 unsigned NumElts = VT->getNumElements(); 1183 assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor"); 1184 1185 unsigned NumSubElts = NumElts / Factor; 1186 auto *SubVT = FixedVectorType::get(VT->getElementType(), NumSubElts); 1187 1188 // Firstly, the cost of load/store operation. 1189 InstructionCost Cost; 1190 if (UseMaskForCond || UseMaskForGaps) 1191 Cost = thisT()->getMaskedMemoryOpCost(Opcode, VecTy, Alignment, 1192 AddressSpace, CostKind); 1193 else 1194 Cost = thisT()->getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, 1195 CostKind); 1196 1197 // Legalize the vector type, and get the legalized and unlegalized type 1198 // sizes. 1199 MVT VecTyLT = getTLI()->getTypeLegalizationCost(DL, VecTy).second; 1200 unsigned VecTySize = thisT()->getDataLayout().getTypeStoreSize(VecTy); 1201 unsigned VecTyLTSize = VecTyLT.getStoreSize(); 1202 1203 // Scale the cost of the memory operation by the fraction of legalized 1204 // instructions that will actually be used. We shouldn't account for the 1205 // cost of dead instructions since they will be removed. 1206 // 1207 // E.g., An interleaved load of factor 8: 1208 // %vec = load <16 x i64>, <16 x i64>* %ptr 1209 // %v0 = shufflevector %vec, undef, <0, 8> 1210 // 1211 // If <16 x i64> is legalized to 8 v2i64 loads, only 2 of the loads will be 1212 // used (those corresponding to elements [0:1] and [8:9] of the unlegalized 1213 // type). The other loads are unused. 1214 // 1215 // TODO: Note that legalization can turn masked loads/stores into unmasked 1216 // (legalized) loads/stores. This can be reflected in the cost. 1217 if (VecTySize > VecTyLTSize) { 1218 // The number of loads of a legal type it will take to represent a load 1219 // of the unlegalized vector type. 1220 unsigned NumLegalInsts = divideCeil(VecTySize, VecTyLTSize); 1221 1222 // The number of elements of the unlegalized type that correspond to a 1223 // single legal instruction. 1224 unsigned NumEltsPerLegalInst = divideCeil(NumElts, NumLegalInsts); 1225 1226 // Determine which legal instructions will be used. 
1227 BitVector UsedInsts(NumLegalInsts, false); 1228 for (unsigned Index : Indices) 1229 for (unsigned Elt = 0; Elt < NumSubElts; ++Elt) 1230 UsedInsts.set((Index + Elt * Factor) / NumEltsPerLegalInst); 1231 1232 // Scale the cost of the load by the fraction of legal instructions that 1233 // will be used. 1234 Cost *= UsedInsts.count() / NumLegalInsts; 1235 } 1236 1237 // Then plus the cost of interleave operation. 1238 assert(Indices.size() <= Factor && 1239 "Interleaved memory op has too many members"); 1240 1241 APInt DemandedLoadStoreElts = APInt::getZero(NumElts); 1242 for (unsigned Index : Indices) { 1243 assert(Index < Factor && "Invalid index for interleaved memory op"); 1244 for (unsigned Elm = 0; Elm < NumSubElts; Elm++) 1245 DemandedLoadStoreElts.setBit(Index + Elm * Factor); 1246 } 1247 1248 if (Opcode == Instruction::Load) { 1249 // The interleave cost is similar to extract sub vectors' elements 1250 // from the wide vector, and insert them into sub vectors. 1251 // 1252 // E.g. An interleaved load of factor 2 (with one member of index 0): 1253 // %vec = load <8 x i32>, <8 x i32>* %ptr 1254 // %v0 = shuffle %vec, undef, <0, 2, 4, 6> ; Index 0 1255 // The cost is estimated as extract elements at 0, 2, 4, 6 from the 1256 // <8 x i32> vector and insert them into a <4 x i32> vector. 1257 InstructionCost InsSubCost = 1258 getScalarizationOverhead(SubVT, /*Insert*/ true, /*Extract*/ false); 1259 Cost += Indices.size() * InsSubCost; 1260 Cost += 1261 thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts, 1262 /*Insert*/ false, /*Extract*/ true); 1263 } else { 1264 // The interleave cost is extract elements from sub vectors, and 1265 // insert them into the wide vector. 1266 // 1267 // E.g. An interleaved store of factor 3 with 2 members at indices 0,1: 1268 // (using VF=4): 1269 // %v0_v1 = shuffle %v0, %v1, <0,4,undef,1,5,undef,2,6,undef,3,7,undef> 1270 // %gaps.mask = <true, true, false, true, true, false, 1271 // true, true, false, true, true, false> 1272 // call llvm.masked.store <12 x i32> %v0_v1, <12 x i32>* %ptr, 1273 // i32 Align, <12 x i1> %gaps.mask 1274 // The cost is estimated as extract all elements (of actual members, 1275 // excluding gaps) from both <4 x i32> vectors and insert into the <12 x 1276 // i32> vector. 1277 InstructionCost ExtSubCost = 1278 getScalarizationOverhead(SubVT, /*Insert*/ false, /*Extract*/ true); 1279 Cost += ExtSubCost * Indices.size(); 1280 Cost += thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts, 1281 /*Insert*/ true, 1282 /*Extract*/ false); 1283 } 1284 1285 if (!UseMaskForCond) 1286 return Cost; 1287 1288 Type *I8Type = Type::getInt8Ty(VT->getContext()); 1289 auto *MaskVT = FixedVectorType::get(I8Type, NumElts); 1290 SubVT = FixedVectorType::get(I8Type, NumSubElts); 1291 1292 // The Mask shuffling cost is extract all the elements of the Mask 1293 // and insert each of them Factor times into the wide vector: 1294 // 1295 // E.g. an interleaved group with factor 3: 1296 // %mask = icmp ult <8 x i32> %vec1, %vec2 1297 // %interleaved.mask = shufflevector <8 x i1> %mask, <8 x i1> undef, 1298 // <24 x i32> <0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7> 1299 // The cost is estimated as extract all mask elements from the <8xi1> mask 1300 // vector and insert them factor times into the <24xi1> shuffled mask 1301 // vector. 
1302 Cost += getScalarizationOverhead(SubVT, /*Insert*/ false, /*Extract*/ true); 1303 Cost += 1304 getScalarizationOverhead(MaskVT, /*Insert*/ true, /*Extract*/ false); 1305 1306 // The Gaps mask is invariant and created outside the loop, therefore the 1307 // cost of creating it is not accounted for here. However if we have both 1308 // a MaskForGaps and some other mask that guards the execution of the 1309 // memory access, we need to account for the cost of And-ing the two masks 1310 // inside the loop. 1311 if (UseMaskForGaps) 1312 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::And, MaskVT, 1313 CostKind); 1314 1315 return Cost; 1316 } 1317 1318 /// Get intrinsic cost based on arguments. getIntrinsicInstrCost(const IntrinsicCostAttributes & ICA,TTI::TargetCostKind CostKind)1319 InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, 1320 TTI::TargetCostKind CostKind) { 1321 // Check for generically free intrinsics. 1322 if (BaseT::getIntrinsicInstrCost(ICA, CostKind) == 0) 1323 return 0; 1324 1325 // Assume that target intrinsics are cheap. 1326 Intrinsic::ID IID = ICA.getID(); 1327 if (Function::isTargetIntrinsic(IID)) 1328 return TargetTransformInfo::TCC_Basic; 1329 1330 if (ICA.isTypeBasedOnly()) 1331 return getTypeBasedIntrinsicInstrCost(ICA, CostKind); 1332 1333 Type *RetTy = ICA.getReturnType(); 1334 1335 ElementCount RetVF = 1336 (RetTy->isVectorTy() ? cast<VectorType>(RetTy)->getElementCount() 1337 : ElementCount::getFixed(1)); 1338 const IntrinsicInst *I = ICA.getInst(); 1339 const SmallVectorImpl<const Value *> &Args = ICA.getArgs(); 1340 FastMathFlags FMF = ICA.getFlags(); 1341 switch (IID) { 1342 default: 1343 break; 1344 1345 case Intrinsic::cttz: 1346 // FIXME: If necessary, this should go in target-specific overrides. 1347 if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCttz()) 1348 return TargetTransformInfo::TCC_Basic; 1349 break; 1350 1351 case Intrinsic::ctlz: 1352 // FIXME: If necessary, this should go in target-specific overrides. 1353 if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCtlz()) 1354 return TargetTransformInfo::TCC_Basic; 1355 break; 1356 1357 case Intrinsic::memcpy: 1358 return thisT()->getMemcpyCost(ICA.getInst()); 1359 1360 case Intrinsic::masked_scatter: { 1361 const Value *Mask = Args[3]; 1362 bool VarMask = !isa<Constant>(Mask); 1363 Align Alignment = cast<ConstantInt>(Args[2])->getAlignValue(); 1364 return thisT()->getGatherScatterOpCost(Instruction::Store, 1365 ICA.getArgTypes()[0], Args[1], 1366 VarMask, Alignment, CostKind, I); 1367 } 1368 case Intrinsic::masked_gather: { 1369 const Value *Mask = Args[2]; 1370 bool VarMask = !isa<Constant>(Mask); 1371 Align Alignment = cast<ConstantInt>(Args[1])->getAlignValue(); 1372 return thisT()->getGatherScatterOpCost(Instruction::Load, RetTy, Args[0], 1373 VarMask, Alignment, CostKind, I); 1374 } 1375 case Intrinsic::experimental_stepvector: { 1376 if (isa<ScalableVectorType>(RetTy)) 1377 return BaseT::getIntrinsicInstrCost(ICA, CostKind); 1378 // The cost of materialising a constant integer vector. 
1379 return TargetTransformInfo::TCC_Basic; 1380 } 1381 case Intrinsic::experimental_vector_extract: { 1382 // FIXME: Handle case where a scalable vector is extracted from a scalable 1383 // vector 1384 if (isa<ScalableVectorType>(RetTy)) 1385 return BaseT::getIntrinsicInstrCost(ICA, CostKind); 1386 unsigned Index = cast<ConstantInt>(Args[1])->getZExtValue(); 1387 return thisT()->getShuffleCost(TTI::SK_ExtractSubvector, 1388 cast<VectorType>(Args[0]->getType()), None, 1389 Index, cast<VectorType>(RetTy)); 1390 } 1391 case Intrinsic::experimental_vector_insert: { 1392 // FIXME: Handle case where a scalable vector is inserted into a scalable 1393 // vector 1394 if (isa<ScalableVectorType>(Args[1]->getType())) 1395 return BaseT::getIntrinsicInstrCost(ICA, CostKind); 1396 unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue(); 1397 return thisT()->getShuffleCost( 1398 TTI::SK_InsertSubvector, cast<VectorType>(Args[0]->getType()), None, 1399 Index, cast<VectorType>(Args[1]->getType())); 1400 } 1401 case Intrinsic::experimental_vector_reverse: { 1402 return thisT()->getShuffleCost(TTI::SK_Reverse, 1403 cast<VectorType>(Args[0]->getType()), None, 1404 0, cast<VectorType>(RetTy)); 1405 } 1406 case Intrinsic::experimental_vector_splice: { 1407 unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue(); 1408 return thisT()->getShuffleCost(TTI::SK_Splice, 1409 cast<VectorType>(Args[0]->getType()), None, 1410 Index, cast<VectorType>(RetTy)); 1411 } 1412 case Intrinsic::vector_reduce_add: 1413 case Intrinsic::vector_reduce_mul: 1414 case Intrinsic::vector_reduce_and: 1415 case Intrinsic::vector_reduce_or: 1416 case Intrinsic::vector_reduce_xor: 1417 case Intrinsic::vector_reduce_smax: 1418 case Intrinsic::vector_reduce_smin: 1419 case Intrinsic::vector_reduce_fmax: 1420 case Intrinsic::vector_reduce_fmin: 1421 case Intrinsic::vector_reduce_umax: 1422 case Intrinsic::vector_reduce_umin: { 1423 IntrinsicCostAttributes Attrs(IID, RetTy, Args[0]->getType(), FMF, I, 1); 1424 return getTypeBasedIntrinsicInstrCost(Attrs, CostKind); 1425 } 1426 case Intrinsic::vector_reduce_fadd: 1427 case Intrinsic::vector_reduce_fmul: { 1428 IntrinsicCostAttributes Attrs( 1429 IID, RetTy, {Args[0]->getType(), Args[1]->getType()}, FMF, I, 1); 1430 return getTypeBasedIntrinsicInstrCost(Attrs, CostKind); 1431 } 1432 case Intrinsic::fshl: 1433 case Intrinsic::fshr: { 1434 if (isa<ScalableVectorType>(RetTy)) 1435 return BaseT::getIntrinsicInstrCost(ICA, CostKind); 1436 const Value *X = Args[0]; 1437 const Value *Y = Args[1]; 1438 const Value *Z = Args[2]; 1439 TTI::OperandValueProperties OpPropsX, OpPropsY, OpPropsZ, OpPropsBW; 1440 TTI::OperandValueKind OpKindX = TTI::getOperandInfo(X, OpPropsX); 1441 TTI::OperandValueKind OpKindY = TTI::getOperandInfo(Y, OpPropsY); 1442 TTI::OperandValueKind OpKindZ = TTI::getOperandInfo(Z, OpPropsZ); 1443 TTI::OperandValueKind OpKindBW = TTI::OK_UniformConstantValue; 1444 OpPropsBW = isPowerOf2_32(RetTy->getScalarSizeInBits()) ? 
TTI::OP_PowerOf2 1445 : TTI::OP_None;
1446 // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
1447 // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
1448 InstructionCost Cost = 0;
1449 Cost +=
1450 thisT()->getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind);
1451 Cost +=
1452 thisT()->getArithmeticInstrCost(BinaryOperator::Sub, RetTy, CostKind);
1453 Cost += thisT()->getArithmeticInstrCost(
1454 BinaryOperator::Shl, RetTy, CostKind, OpKindX, OpKindZ, OpPropsX);
1455 Cost += thisT()->getArithmeticInstrCost(
1456 BinaryOperator::LShr, RetTy, CostKind, OpKindY, OpKindZ, OpPropsY);
1457 // Non-constant shift amounts require a modulo.
1458 if (OpKindZ != TTI::OK_UniformConstantValue &&
1459 OpKindZ != TTI::OK_NonUniformConstantValue)
1460 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::URem, RetTy,
1461 CostKind, OpKindZ, OpKindBW,
1462 OpPropsZ, OpPropsBW);
1463 // For non-rotates (X != Y) we must add shift-by-zero handling costs.
1464 if (X != Y) {
1465 Type *CondTy = RetTy->getWithNewBitWidth(1);
1466 Cost +=
1467 thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
1468 CmpInst::ICMP_EQ, CostKind);
1469 Cost +=
1470 thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1471 CmpInst::ICMP_EQ, CostKind);
1472 }
1473 return Cost;
1474 }
1475 }
1476 
1477 // Assume that we need to scalarize this intrinsic.
1478 // Compute the scalarization overhead based on Args for a vector
1479 // intrinsic.
1480 InstructionCost ScalarizationCost = InstructionCost::getInvalid();
1481 if (RetVF.isVector() && !RetVF.isScalable()) {
1482 ScalarizationCost = 0;
1483 if (!RetTy->isVoidTy())
1484 ScalarizationCost +=
1485 getScalarizationOverhead(cast<VectorType>(RetTy), true, false);
1486 ScalarizationCost +=
1487 getOperandsScalarizationOverhead(Args, ICA.getArgTypes());
1488 }
1489 
1490 IntrinsicCostAttributes Attrs(IID, RetTy, ICA.getArgTypes(), FMF, I,
1491 ScalarizationCost);
1492 return thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
1493 }
1494 
1495 /// Get intrinsic cost based on argument types.
1496 /// If ScalarizationCostPassed is std::numeric_limits<unsigned>::max(), the
1497 /// cost of scalarizing the arguments and the return value will be computed
1498 /// based on types.
1499 InstructionCost getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes & ICA,TTI::TargetCostKind CostKind)1500 getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
1501 TTI::TargetCostKind CostKind) {
1502 Intrinsic::ID IID = ICA.getID();
1503 Type *RetTy = ICA.getReturnType();
1504 const SmallVectorImpl<Type *> &Tys = ICA.getArgTypes();
1505 FastMathFlags FMF = ICA.getFlags();
1506 InstructionCost ScalarizationCostPassed = ICA.getScalarizationCost();
1507 bool SkipScalarizationCost = ICA.skipScalarizationCost();
1508 
1509 VectorType *VecOpTy = nullptr;
1510 if (!Tys.empty()) {
1511 // The vector reduction operand is operand 0 except for fadd/fmul.
1512 // Their operand 0 is a scalar start value, so the vector op is operand 1.
1513 unsigned VecTyIndex = 0;
1514 if (IID == Intrinsic::vector_reduce_fadd ||
1515 IID == Intrinsic::vector_reduce_fmul)
1516 VecTyIndex = 1;
1517 assert(Tys.size() > VecTyIndex && "Unexpected IntrinsicCostAttributes");
1518 VecOpTy = dyn_cast<VectorType>(Tys[VecTyIndex]);
1519 }
1520 
1521 // Library call cost - other than size, make it expensive.
1522 unsigned SingleCallCost = CostKind == TTI::TCK_CodeSize ?
1 : 10; 1523 SmallVector<unsigned, 2> ISDs; 1524 switch (IID) { 1525 default: { 1526 // Scalable vectors cannot be scalarized, so return Invalid. 1527 if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) { 1528 return isa<ScalableVectorType>(Ty); 1529 })) 1530 return InstructionCost::getInvalid(); 1531 1532 // Assume that we need to scalarize this intrinsic. 1533 InstructionCost ScalarizationCost = 1534 SkipScalarizationCost ? ScalarizationCostPassed : 0; 1535 unsigned ScalarCalls = 1; 1536 Type *ScalarRetTy = RetTy; 1537 if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) { 1538 if (!SkipScalarizationCost) 1539 ScalarizationCost = getScalarizationOverhead(RetVTy, true, false); 1540 ScalarCalls = std::max(ScalarCalls, 1541 cast<FixedVectorType>(RetVTy)->getNumElements()); 1542 ScalarRetTy = RetTy->getScalarType(); 1543 } 1544 SmallVector<Type *, 4> ScalarTys; 1545 for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) { 1546 Type *Ty = Tys[i]; 1547 if (auto *VTy = dyn_cast<VectorType>(Ty)) { 1548 if (!SkipScalarizationCost) 1549 ScalarizationCost += getScalarizationOverhead(VTy, false, true); 1550 ScalarCalls = std::max(ScalarCalls, 1551 cast<FixedVectorType>(VTy)->getNumElements()); 1552 Ty = Ty->getScalarType(); 1553 } 1554 ScalarTys.push_back(Ty); 1555 } 1556 if (ScalarCalls == 1) 1557 return 1; // Return cost of a scalar intrinsic. Assume it to be cheap. 1558 1559 IntrinsicCostAttributes ScalarAttrs(IID, ScalarRetTy, ScalarTys, FMF); 1560 InstructionCost ScalarCost = 1561 thisT()->getIntrinsicInstrCost(ScalarAttrs, CostKind); 1562 1563 return ScalarCalls * ScalarCost + ScalarizationCost; 1564 } 1565 // Look for intrinsics that can be lowered directly or turned into a scalar 1566 // intrinsic call. 1567 case Intrinsic::sqrt: 1568 ISDs.push_back(ISD::FSQRT); 1569 break; 1570 case Intrinsic::sin: 1571 ISDs.push_back(ISD::FSIN); 1572 break; 1573 case Intrinsic::cos: 1574 ISDs.push_back(ISD::FCOS); 1575 break; 1576 case Intrinsic::exp: 1577 ISDs.push_back(ISD::FEXP); 1578 break; 1579 case Intrinsic::exp2: 1580 ISDs.push_back(ISD::FEXP2); 1581 break; 1582 case Intrinsic::log: 1583 ISDs.push_back(ISD::FLOG); 1584 break; 1585 case Intrinsic::log10: 1586 ISDs.push_back(ISD::FLOG10); 1587 break; 1588 case Intrinsic::log2: 1589 ISDs.push_back(ISD::FLOG2); 1590 break; 1591 case Intrinsic::fabs: 1592 ISDs.push_back(ISD::FABS); 1593 break; 1594 case Intrinsic::canonicalize: 1595 ISDs.push_back(ISD::FCANONICALIZE); 1596 break; 1597 case Intrinsic::minnum: 1598 ISDs.push_back(ISD::FMINNUM); 1599 break; 1600 case Intrinsic::maxnum: 1601 ISDs.push_back(ISD::FMAXNUM); 1602 break; 1603 case Intrinsic::minimum: 1604 ISDs.push_back(ISD::FMINIMUM); 1605 break; 1606 case Intrinsic::maximum: 1607 ISDs.push_back(ISD::FMAXIMUM); 1608 break; 1609 case Intrinsic::copysign: 1610 ISDs.push_back(ISD::FCOPYSIGN); 1611 break; 1612 case Intrinsic::floor: 1613 ISDs.push_back(ISD::FFLOOR); 1614 break; 1615 case Intrinsic::ceil: 1616 ISDs.push_back(ISD::FCEIL); 1617 break; 1618 case Intrinsic::trunc: 1619 ISDs.push_back(ISD::FTRUNC); 1620 break; 1621 case Intrinsic::nearbyint: 1622 ISDs.push_back(ISD::FNEARBYINT); 1623 break; 1624 case Intrinsic::rint: 1625 ISDs.push_back(ISD::FRINT); 1626 break; 1627 case Intrinsic::round: 1628 ISDs.push_back(ISD::FROUND); 1629 break; 1630 case Intrinsic::roundeven: 1631 ISDs.push_back(ISD::FROUNDEVEN); 1632 break; 1633 case Intrinsic::pow: 1634 ISDs.push_back(ISD::FPOW); 1635 break; 1636 case Intrinsic::fma: 1637 ISDs.push_back(ISD::FMA); 1638 break; 1639 case 
Intrinsic::fmuladd: 1640 ISDs.push_back(ISD::FMA); 1641 break; 1642 case Intrinsic::experimental_constrained_fmuladd: 1643 ISDs.push_back(ISD::STRICT_FMA); 1644 break; 1645 // FIXME: We should return 0 whenever getIntrinsicCost == TCC_Free. 1646 case Intrinsic::lifetime_start: 1647 case Intrinsic::lifetime_end: 1648 case Intrinsic::sideeffect: 1649 case Intrinsic::pseudoprobe: 1650 case Intrinsic::arithmetic_fence: 1651 return 0; 1652 case Intrinsic::masked_store: { 1653 Type *Ty = Tys[0]; 1654 Align TyAlign = thisT()->DL.getABITypeAlign(Ty); 1655 return thisT()->getMaskedMemoryOpCost(Instruction::Store, Ty, TyAlign, 0, 1656 CostKind); 1657 } 1658 case Intrinsic::masked_load: { 1659 Type *Ty = RetTy; 1660 Align TyAlign = thisT()->DL.getABITypeAlign(Ty); 1661 return thisT()->getMaskedMemoryOpCost(Instruction::Load, Ty, TyAlign, 0, 1662 CostKind); 1663 } 1664 case Intrinsic::vector_reduce_add: 1665 return thisT()->getArithmeticReductionCost(Instruction::Add, VecOpTy, 1666 None, CostKind); 1667 case Intrinsic::vector_reduce_mul: 1668 return thisT()->getArithmeticReductionCost(Instruction::Mul, VecOpTy, 1669 None, CostKind); 1670 case Intrinsic::vector_reduce_and: 1671 return thisT()->getArithmeticReductionCost(Instruction::And, VecOpTy, 1672 None, CostKind); 1673 case Intrinsic::vector_reduce_or: 1674 return thisT()->getArithmeticReductionCost(Instruction::Or, VecOpTy, None, 1675 CostKind); 1676 case Intrinsic::vector_reduce_xor: 1677 return thisT()->getArithmeticReductionCost(Instruction::Xor, VecOpTy, 1678 None, CostKind); 1679 case Intrinsic::vector_reduce_fadd: 1680 return thisT()->getArithmeticReductionCost(Instruction::FAdd, VecOpTy, 1681 FMF, CostKind); 1682 case Intrinsic::vector_reduce_fmul: 1683 return thisT()->getArithmeticReductionCost(Instruction::FMul, VecOpTy, 1684 FMF, CostKind); 1685 case Intrinsic::vector_reduce_smax: 1686 case Intrinsic::vector_reduce_smin: 1687 case Intrinsic::vector_reduce_fmax: 1688 case Intrinsic::vector_reduce_fmin: 1689 return thisT()->getMinMaxReductionCost( 1690 VecOpTy, cast<VectorType>(CmpInst::makeCmpResultType(VecOpTy)), 1691 /*IsUnsigned=*/false, CostKind); 1692 case Intrinsic::vector_reduce_umax: 1693 case Intrinsic::vector_reduce_umin: 1694 return thisT()->getMinMaxReductionCost( 1695 VecOpTy, cast<VectorType>(CmpInst::makeCmpResultType(VecOpTy)), 1696 /*IsUnsigned=*/true, CostKind); 1697 case Intrinsic::abs: { 1698 // abs(X) = select(icmp(X,0),X,sub(0,X)) 1699 Type *CondTy = RetTy->getWithNewBitWidth(1); 1700 CmpInst::Predicate Pred = CmpInst::ICMP_SGT; 1701 InstructionCost Cost = 0; 1702 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, 1703 Pred, CostKind); 1704 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, 1705 Pred, CostKind); 1706 // TODO: Should we add an OperandValueProperties::OP_Zero property? 1707 Cost += thisT()->getArithmeticInstrCost( 1708 BinaryOperator::Sub, RetTy, CostKind, TTI::OK_UniformConstantValue); 1709 return Cost; 1710 } 1711 case Intrinsic::smax: 1712 case Intrinsic::smin: 1713 case Intrinsic::umax: 1714 case Intrinsic::umin: { 1715 // minmax(X,Y) = select(icmp(X,Y),X,Y) 1716 Type *CondTy = RetTy->getWithNewBitWidth(1); 1717 bool IsUnsigned = IID == Intrinsic::umax || IID == Intrinsic::umin; 1718 CmpInst::Predicate Pred = 1719 IsUnsigned ? 
CmpInst::ICMP_UGT : CmpInst::ICMP_SGT; 1720 InstructionCost Cost = 0; 1721 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, 1722 Pred, CostKind); 1723 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, 1724 Pred, CostKind); 1725 return Cost; 1726 } 1727 case Intrinsic::sadd_sat: 1728 case Intrinsic::ssub_sat: { 1729 Type *CondTy = RetTy->getWithNewBitWidth(1); 1730 1731 Type *OpTy = StructType::create({RetTy, CondTy}); 1732 Intrinsic::ID OverflowOp = IID == Intrinsic::sadd_sat 1733 ? Intrinsic::sadd_with_overflow 1734 : Intrinsic::ssub_with_overflow; 1735 CmpInst::Predicate Pred = CmpInst::ICMP_SGT; 1736 1737 // SatMax -> Overflow && SumDiff < 0 1738 // SatMin -> Overflow && SumDiff >= 0 1739 InstructionCost Cost = 0; 1740 IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF, 1741 nullptr, ScalarizationCostPassed); 1742 Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind); 1743 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, 1744 Pred, CostKind); 1745 Cost += 2 * thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, 1746 CondTy, Pred, CostKind); 1747 return Cost; 1748 } 1749 case Intrinsic::uadd_sat: 1750 case Intrinsic::usub_sat: { 1751 Type *CondTy = RetTy->getWithNewBitWidth(1); 1752 1753 Type *OpTy = StructType::create({RetTy, CondTy}); 1754 Intrinsic::ID OverflowOp = IID == Intrinsic::uadd_sat 1755 ? Intrinsic::uadd_with_overflow 1756 : Intrinsic::usub_with_overflow; 1757 1758 InstructionCost Cost = 0; 1759 IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF, 1760 nullptr, ScalarizationCostPassed); 1761 Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind); 1762 Cost += 1763 thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, 1764 CmpInst::BAD_ICMP_PREDICATE, CostKind); 1765 return Cost; 1766 } 1767 case Intrinsic::smul_fix: 1768 case Intrinsic::umul_fix: { 1769 unsigned ExtSize = RetTy->getScalarSizeInBits() * 2; 1770 Type *ExtTy = RetTy->getWithNewBitWidth(ExtSize); 1771 1772 unsigned ExtOp = 1773 IID == Intrinsic::smul_fix ? Instruction::SExt : Instruction::ZExt; 1774 TTI::CastContextHint CCH = TTI::CastContextHint::None; 1775 1776 InstructionCost Cost = 0; 1777 Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, RetTy, CCH, CostKind); 1778 Cost += 1779 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind); 1780 Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, RetTy, ExtTy, 1781 CCH, CostKind); 1782 Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, RetTy, 1783 CostKind, TTI::OK_AnyValue, 1784 TTI::OK_UniformConstantValue); 1785 Cost += thisT()->getArithmeticInstrCost(Instruction::Shl, RetTy, CostKind, 1786 TTI::OK_AnyValue, 1787 TTI::OK_UniformConstantValue); 1788 Cost += thisT()->getArithmeticInstrCost(Instruction::Or, RetTy, CostKind); 1789 return Cost; 1790 } 1791 case Intrinsic::sadd_with_overflow: 1792 case Intrinsic::ssub_with_overflow: { 1793 Type *SumTy = RetTy->getContainedType(0); 1794 Type *OverflowTy = RetTy->getContainedType(1); 1795 unsigned Opcode = IID == Intrinsic::sadd_with_overflow 1796 ? 
BinaryOperator::Add 1797 : BinaryOperator::Sub; 1798 1799 // Add: 1800 // Overflow -> (Result < LHS) ^ (RHS < 0) 1801 // Sub: 1802 // Overflow -> (Result < LHS) ^ (RHS > 0) 1803 InstructionCost Cost = 0; 1804 Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind); 1805 Cost += 2 * thisT()->getCmpSelInstrCost( 1806 Instruction::ICmp, SumTy, OverflowTy, 1807 CmpInst::ICMP_SGT, CostKind); 1808 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::Xor, OverflowTy, 1809 CostKind); 1810 return Cost; 1811 } 1812 case Intrinsic::uadd_with_overflow: 1813 case Intrinsic::usub_with_overflow: { 1814 Type *SumTy = RetTy->getContainedType(0); 1815 Type *OverflowTy = RetTy->getContainedType(1); 1816 unsigned Opcode = IID == Intrinsic::uadd_with_overflow 1817 ? BinaryOperator::Add 1818 : BinaryOperator::Sub; 1819 CmpInst::Predicate Pred = IID == Intrinsic::uadd_with_overflow 1820 ? CmpInst::ICMP_ULT 1821 : CmpInst::ICMP_UGT; 1822 1823 InstructionCost Cost = 0; 1824 Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind); 1825 Cost += 1826 thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SumTy, OverflowTy, 1827 Pred, CostKind); 1828 return Cost; 1829 } 1830 case Intrinsic::smul_with_overflow: 1831 case Intrinsic::umul_with_overflow: { 1832 Type *MulTy = RetTy->getContainedType(0); 1833 Type *OverflowTy = RetTy->getContainedType(1); 1834 unsigned ExtSize = MulTy->getScalarSizeInBits() * 2; 1835 Type *ExtTy = MulTy->getWithNewBitWidth(ExtSize); 1836 bool IsSigned = IID == Intrinsic::smul_with_overflow; 1837 1838 unsigned ExtOp = IsSigned ? Instruction::SExt : Instruction::ZExt; 1839 TTI::CastContextHint CCH = TTI::CastContextHint::None; 1840 1841 InstructionCost Cost = 0; 1842 Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, MulTy, CCH, CostKind); 1843 Cost += 1844 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind); 1845 Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, MulTy, ExtTy, 1846 CCH, CostKind); 1847 Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, ExtTy, 1848 CostKind, TTI::OK_AnyValue, 1849 TTI::OK_UniformConstantValue); 1850 1851 if (IsSigned) 1852 Cost += thisT()->getArithmeticInstrCost(Instruction::AShr, MulTy, 1853 CostKind, TTI::OK_AnyValue, 1854 TTI::OK_UniformConstantValue); 1855 1856 Cost += thisT()->getCmpSelInstrCost( 1857 BinaryOperator::ICmp, MulTy, OverflowTy, CmpInst::ICMP_NE, CostKind); 1858 return Cost; 1859 } 1860 case Intrinsic::ctpop: 1861 ISDs.push_back(ISD::CTPOP); 1862 // In case of legalization use TCC_Expensive. This is cheaper than a 1863 // library call but still not a cheap instruction. 1864 SingleCallCost = TargetTransformInfo::TCC_Expensive; 1865 break; 1866 case Intrinsic::ctlz: 1867 ISDs.push_back(ISD::CTLZ); 1868 break; 1869 case Intrinsic::cttz: 1870 ISDs.push_back(ISD::CTTZ); 1871 break; 1872 case Intrinsic::bswap: 1873 ISDs.push_back(ISD::BSWAP); 1874 break; 1875 case Intrinsic::bitreverse: 1876 ISDs.push_back(ISD::BITREVERSE); 1877 break; 1878 } 1879 1880 const TargetLoweringBase *TLI = getTLI(); 1881 std::pair<InstructionCost, MVT> LT = 1882 TLI->getTypeLegalizationCost(DL, RetTy); 1883 1884 SmallVector<InstructionCost, 2> LegalCost; 1885 SmallVector<InstructionCost, 2> CustomCost; 1886 for (unsigned ISD : ISDs) { 1887 if (TLI->isOperationLegalOrPromote(ISD, LT.second)) { 1888 if (IID == Intrinsic::fabs && LT.second.isFloatingPoint() && 1889 TLI->isFAbsFree(LT.second)) { 1890 return 0; 1891 } 1892 1893 // The operation is legal. Assume it costs 1. 
1894 // If the type is split to multiple registers, assume that there is some 1895 // overhead to this. 1896 // TODO: Once we have extract/insert subvector cost we need to use them. 1897 if (LT.first > 1) 1898 LegalCost.push_back(LT.first * 2); 1899 else 1900 LegalCost.push_back(LT.first * 1); 1901 } else if (!TLI->isOperationExpand(ISD, LT.second)) { 1902 // If the operation is custom lowered then assume 1903 // that the code is twice as expensive. 1904 CustomCost.push_back(LT.first * 2); 1905 } 1906 } 1907 1908 auto *MinLegalCostI = std::min_element(LegalCost.begin(), LegalCost.end()); 1909 if (MinLegalCostI != LegalCost.end()) 1910 return *MinLegalCostI; 1911 1912 auto MinCustomCostI = 1913 std::min_element(CustomCost.begin(), CustomCost.end()); 1914 if (MinCustomCostI != CustomCost.end()) 1915 return *MinCustomCostI; 1916 1917 // If we can't lower fmuladd into an FMA estimate the cost as a floating 1918 // point mul followed by an add. 1919 if (IID == Intrinsic::fmuladd) 1920 return thisT()->getArithmeticInstrCost(BinaryOperator::FMul, RetTy, 1921 CostKind) + 1922 thisT()->getArithmeticInstrCost(BinaryOperator::FAdd, RetTy, 1923 CostKind); 1924 if (IID == Intrinsic::experimental_constrained_fmuladd) { 1925 IntrinsicCostAttributes FMulAttrs( 1926 Intrinsic::experimental_constrained_fmul, RetTy, Tys); 1927 IntrinsicCostAttributes FAddAttrs( 1928 Intrinsic::experimental_constrained_fadd, RetTy, Tys); 1929 return thisT()->getIntrinsicInstrCost(FMulAttrs, CostKind) + 1930 thisT()->getIntrinsicInstrCost(FAddAttrs, CostKind); 1931 } 1932 1933 // Else, assume that we need to scalarize this intrinsic. For math builtins 1934 // this will emit a costly libcall, adding call overhead and spills. Make it 1935 // very expensive. 1936 if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) { 1937 // Scalable vectors cannot be scalarized, so return Invalid. 1938 if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) { 1939 return isa<ScalableVectorType>(Ty); 1940 })) 1941 return InstructionCost::getInvalid(); 1942 1943 InstructionCost ScalarizationCost = 1944 SkipScalarizationCost ? ScalarizationCostPassed 1945 : getScalarizationOverhead(RetVTy, true, false); 1946 1947 unsigned ScalarCalls = cast<FixedVectorType>(RetVTy)->getNumElements(); 1948 SmallVector<Type *, 4> ScalarTys; 1949 for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) { 1950 Type *Ty = Tys[i]; 1951 if (Ty->isVectorTy()) 1952 Ty = Ty->getScalarType(); 1953 ScalarTys.push_back(Ty); 1954 } 1955 IntrinsicCostAttributes Attrs(IID, RetTy->getScalarType(), ScalarTys, FMF); 1956 InstructionCost ScalarCost = 1957 thisT()->getIntrinsicInstrCost(Attrs, CostKind); 1958 for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) { 1959 if (auto *VTy = dyn_cast<VectorType>(Tys[i])) { 1960 if (!ICA.skipScalarizationCost()) 1961 ScalarizationCost += getScalarizationOverhead(VTy, false, true); 1962 ScalarCalls = std::max(ScalarCalls, 1963 cast<FixedVectorType>(VTy)->getNumElements()); 1964 } 1965 } 1966 return ScalarCalls * ScalarCost + ScalarizationCost; 1967 } 1968 1969 // This is going to be turned into a library call, make it expensive. 1970 return SingleCallCost; 1971 } 1972 1973 /// Compute a cost of the given call instruction. 1974 /// 1975 /// Compute the cost of calling function F with return type RetTy and 1976 /// argument types Tys. F might be nullptr, in this case the cost of an 1977 /// arbitrary call with the specified signature will be returned. 
1978 /// This is used, for instance, when we estimate call of a vector
1979 /// counterpart of the given function.
1980 /// \param F Called function, might be nullptr.
1981 /// \param RetTy Return value types.
1982 /// \param Tys Argument types.
1983 /// \returns The cost of Call instruction. getCallInstrCost(Function * F,Type * RetTy,ArrayRef<Type * > Tys,TTI::TargetCostKind CostKind)1984 InstructionCost getCallInstrCost(Function *F, Type *RetTy,
1985 ArrayRef<Type *> Tys,
1986 TTI::TargetCostKind CostKind) {
1987 return 10;
1988 }
1989 getNumberOfParts(Type * Tp)1990 unsigned getNumberOfParts(Type *Tp) {
1991 std::pair<InstructionCost, MVT> LT =
1992 getTLI()->getTypeLegalizationCost(DL, Tp);
1993 return *LT.first.getValue();
1994 }
1995 getAddressComputationCost(Type * Ty,ScalarEvolution *,const SCEV *)1996 InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *,
1997 const SCEV *) {
1998 return 0;
1999 }
2000 
2001 /// Try to calculate arithmetic and shuffle op costs for reduction intrinsics.
2002 /// We're assuming that reduction operations are performed in the following way:
2003 ///
2004 /// %val1 = shufflevector<n x t> %val, <n x t> %undef,
2005 /// <n x i32> <i32 n/2, i32 n/2 + 1, ..., i32 n - 1, i32 undef, ..., i32 undef>
2006 /// \----------------v-------------/ \----------v------------/
2007 /// n/2 elements n/2 elements
2008 /// %red1 = op <n x t> %val, <n x t> %val1
2009 /// After this operation we have a vector %red1 where only the first n/2
2010 /// elements are meaningful, the second n/2 elements are undefined and can be
2011 /// dropped. All other operations are actually working with the vector of
2012 /// length n/2, not n, though the real vector length is still n.
2013 /// %val2 = shufflevector<n x t> %red1, <n x t> %undef,
2014 /// <n x i32> <i32 n/4, i32 n/4 + 1, ..., i32 n/2 - 1, i32 undef, ..., i32 undef>
2015 /// \----------------v-------------/ \----------v------------/
2016 /// n/4 elements 3*n/4 elements
2017 /// %red2 = op <n x t> %red1, <n x t> %val2 - working with the vector of
2018 /// length n/2, the resulting vector has length n/4 etc.
2019 ///
2020 /// The cost model should take into account that the actual length of the
2021 /// vector is reduced on each iteration.
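///
/// As a rough illustration (assuming a plain add reduction of an <8 x i32>
/// vector and no legalization splitting), the sequence costed below is:
///   %r1 = add <8 x i32> %val, shufflevector(%val, <4,5,6,7,undef,...>)
///   %r2 = add <8 x i32> %r1, shufflevector(%r1, <2,3,undef,...>)
///   %r3 = add <8 x i32> %r2, shufflevector(%r2, <1,undef,...>)
/// i.e. log2(8) = 3 shuffle/add levels plus a final extractelement of lane 0.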
getTreeReductionCost(unsigned Opcode,VectorType * Ty,TTI::TargetCostKind CostKind)2022 InstructionCost getTreeReductionCost(unsigned Opcode, VectorType *Ty, 2023 TTI::TargetCostKind CostKind) { 2024 Type *ScalarTy = Ty->getElementType(); 2025 unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements(); 2026 if ((Opcode == Instruction::Or || Opcode == Instruction::And) && 2027 ScalarTy == IntegerType::getInt1Ty(Ty->getContext()) && 2028 NumVecElts >= 2) { 2029 // Or reduction for i1 is represented as: 2030 // %val = bitcast <ReduxWidth x i1> to iReduxWidth 2031 // %res = cmp ne iReduxWidth %val, 0 2032 // And reduction for i1 is represented as: 2033 // %val = bitcast <ReduxWidth x i1> to iReduxWidth 2034 // %res = cmp eq iReduxWidth %val, 11111 2035 Type *ValTy = IntegerType::get(Ty->getContext(), NumVecElts); 2036 return thisT()->getCastInstrCost(Instruction::BitCast, ValTy, Ty, 2037 TTI::CastContextHint::None, CostKind) + 2038 thisT()->getCmpSelInstrCost(Instruction::ICmp, ValTy, 2039 CmpInst::makeCmpResultType(ValTy), 2040 CmpInst::BAD_ICMP_PREDICATE, CostKind); 2041 } 2042 unsigned NumReduxLevels = Log2_32(NumVecElts); 2043 InstructionCost ArithCost = 0; 2044 InstructionCost ShuffleCost = 0; 2045 std::pair<InstructionCost, MVT> LT = 2046 thisT()->getTLI()->getTypeLegalizationCost(DL, Ty); 2047 unsigned LongVectorCount = 0; 2048 unsigned MVTLen = 2049 LT.second.isVector() ? LT.second.getVectorNumElements() : 1; 2050 while (NumVecElts > MVTLen) { 2051 NumVecElts /= 2; 2052 VectorType *SubTy = FixedVectorType::get(ScalarTy, NumVecElts); 2053 ShuffleCost += thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, None, 2054 NumVecElts, SubTy); 2055 ArithCost += thisT()->getArithmeticInstrCost(Opcode, SubTy, CostKind); 2056 Ty = SubTy; 2057 ++LongVectorCount; 2058 } 2059 2060 NumReduxLevels -= LongVectorCount; 2061 2062 // The minimal length of the vector is limited by the real length of vector 2063 // operations performed on the current platform. That's why several final 2064 // reduction operations are performed on the vectors with the same 2065 // architecture-dependent length. 2066 2067 // By default reductions need one shuffle per reduction level. 2068 ShuffleCost += NumReduxLevels * thisT()->getShuffleCost( 2069 TTI::SK_PermuteSingleSrc, Ty, None, 0, Ty); 2070 ArithCost += 2071 NumReduxLevels * thisT()->getArithmeticInstrCost(Opcode, Ty, CostKind); 2072 return ShuffleCost + ArithCost + 2073 thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 0); 2074 } 2075 2076 /// Try to calculate the cost of performing strict (in-order) reductions, 2077 /// which involves doing a sequence of floating point additions in lane 2078 /// order, starting with an initial value. For example, consider a scalar 2079 /// initial value 'InitVal' of type float and a vector of type <4 x float>: 2080 /// 2081 /// Vector = <float %v0, float %v1, float %v2, float %v3> 2082 /// 2083 /// %add1 = %InitVal + %v0 2084 /// %add2 = %add1 + %v1 2085 /// %add3 = %add2 + %v2 2086 /// %add4 = %add3 + %v3 2087 /// 2088 /// As a simple estimate we can say the cost of such a reduction is 4 times 2089 /// the cost of a scalar FP addition. We can only estimate the costs for 2090 /// fixed-width vectors here because for scalable vectors we do not know the 2091 /// runtime number of operations. 
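/// Besides the scalar additions, the estimate below also charges for
/// extracting each lane, so the <4 x float> example above comes out as
/// roughly 4 * cost(extractelement) + 4 * cost(scalar fadd).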
getOrderedReductionCost(unsigned Opcode,VectorType * Ty,TTI::TargetCostKind CostKind)2092 InstructionCost getOrderedReductionCost(unsigned Opcode, VectorType *Ty,
2093 TTI::TargetCostKind CostKind) {
2094 // Targets must implement a default value for the scalable case, since
2095 // we don't know how many lanes the vector has.
2096 if (isa<ScalableVectorType>(Ty))
2097 return InstructionCost::getInvalid();
2098 
2099 auto *VTy = cast<FixedVectorType>(Ty);
2100 InstructionCost ExtractCost =
2101 getScalarizationOverhead(VTy, /*Insert=*/false, /*Extract=*/true);
2102 InstructionCost ArithCost = thisT()->getArithmeticInstrCost(
2103 Opcode, VTy->getElementType(), CostKind);
2104 ArithCost *= VTy->getNumElements();
2105 
2106 return ExtractCost + ArithCost;
2107 }
2108 getArithmeticReductionCost(unsigned Opcode,VectorType * Ty,Optional<FastMathFlags> FMF,TTI::TargetCostKind CostKind)2109 InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
2110 Optional<FastMathFlags> FMF,
2111 TTI::TargetCostKind CostKind) {
2112 if (TTI::requiresOrderedReduction(FMF))
2113 return getOrderedReductionCost(Opcode, Ty, CostKind);
2114 return getTreeReductionCost(Opcode, Ty, CostKind);
2115 }
2116 
2117 /// Try to calculate op costs for min/max reduction operations.
2118 /// \param CondTy Conditional type for the Select instruction. getMinMaxReductionCost(VectorType * Ty,VectorType * CondTy,bool IsUnsigned,TTI::TargetCostKind CostKind)2119 InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
2120 bool IsUnsigned,
2121 TTI::TargetCostKind CostKind) {
2122 Type *ScalarTy = Ty->getElementType();
2123 Type *ScalarCondTy = CondTy->getElementType();
2124 unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
2125 unsigned NumReduxLevels = Log2_32(NumVecElts);
2126 unsigned CmpOpcode;
2127 if (Ty->isFPOrFPVectorTy()) {
2128 CmpOpcode = Instruction::FCmp;
2129 } else {
2130 assert(Ty->isIntOrIntVectorTy() &&
2131 "expecting floating point or integer type for min/max reduction");
2132 CmpOpcode = Instruction::ICmp;
2133 }
2134 InstructionCost MinMaxCost = 0;
2135 InstructionCost ShuffleCost = 0;
2136 std::pair<InstructionCost, MVT> LT =
2137 thisT()->getTLI()->getTypeLegalizationCost(DL, Ty);
2138 unsigned LongVectorCount = 0;
2139 unsigned MVTLen =
2140 LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
2141 while (NumVecElts > MVTLen) {
2142 NumVecElts /= 2;
2143 auto *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
2144 CondTy = FixedVectorType::get(ScalarCondTy, NumVecElts);
2145 
2146 ShuffleCost += thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, None,
2147 NumVecElts, SubTy);
2148 MinMaxCost +=
2149 thisT()->getCmpSelInstrCost(CmpOpcode, SubTy, CondTy,
2150 CmpInst::BAD_ICMP_PREDICATE, CostKind) +
2151 thisT()->getCmpSelInstrCost(Instruction::Select, SubTy, CondTy,
2152 CmpInst::BAD_ICMP_PREDICATE, CostKind);
2153 Ty = SubTy;
2154 ++LongVectorCount;
2155 }
2156 
2157 NumReduxLevels -= LongVectorCount;
2158 
2159 // The minimal length of the vector is limited by the real length of vector
2160 // operations performed on the current platform. That's why several final
2161 // reduction operations are performed on the vectors with the same
2162 // architecture-dependent length.
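// For example (illustrative): a min/max reduction of <16 x i32> on a target
// whose widest legal vector is <4 x i32> peels two subvector-extract levels
// in the loop above (16 -> 8 -> 4), and the remaining two reduction levels
// below are costed on the legal <4 x i32> type.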
2163 ShuffleCost += NumReduxLevels * thisT()->getShuffleCost( 2164 TTI::SK_PermuteSingleSrc, Ty, None, 0, Ty); 2165 MinMaxCost += 2166 NumReduxLevels * 2167 (thisT()->getCmpSelInstrCost(CmpOpcode, Ty, CondTy, 2168 CmpInst::BAD_ICMP_PREDICATE, CostKind) + 2169 thisT()->getCmpSelInstrCost(Instruction::Select, Ty, CondTy, 2170 CmpInst::BAD_ICMP_PREDICATE, CostKind)); 2171 // The last min/max should be in vector registers and we counted it above. 2172 // So just need a single extractelement. 2173 return ShuffleCost + MinMaxCost + 2174 thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 0); 2175 } 2176 getExtendedAddReductionCost(bool IsMLA,bool IsUnsigned,Type * ResTy,VectorType * Ty,TTI::TargetCostKind CostKind)2177 InstructionCost getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned, 2178 Type *ResTy, VectorType *Ty, 2179 TTI::TargetCostKind CostKind) { 2180 // Without any native support, this is equivalent to the cost of 2181 // vecreduce.add(ext) or if IsMLA vecreduce.add(mul(ext, ext)) 2182 VectorType *ExtTy = VectorType::get(ResTy, Ty); 2183 InstructionCost RedCost = thisT()->getArithmeticReductionCost( 2184 Instruction::Add, ExtTy, None, CostKind); 2185 InstructionCost MulCost = 0; 2186 InstructionCost ExtCost = thisT()->getCastInstrCost( 2187 IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty, 2188 TTI::CastContextHint::None, CostKind); 2189 if (IsMLA) { 2190 MulCost = 2191 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind); 2192 ExtCost *= 2; 2193 } 2194 2195 return RedCost + MulCost + ExtCost; 2196 } 2197 getVectorSplitCost()2198 InstructionCost getVectorSplitCost() { return 1; } 2199 2200 /// @} 2201 }; 2202 2203 /// Concrete BasicTTIImpl that can be used if no further customization 2204 /// is needed. 2205 class BasicTTIImpl : public BasicTTIImplBase<BasicTTIImpl> { 2206 using BaseT = BasicTTIImplBase<BasicTTIImpl>; 2207 2208 friend class BasicTTIImplBase<BasicTTIImpl>; 2209 2210 const TargetSubtargetInfo *ST; 2211 const TargetLoweringBase *TLI; 2212 getST()2213 const TargetSubtargetInfo *getST() const { return ST; } getTLI()2214 const TargetLoweringBase *getTLI() const { return TLI; } 2215 2216 public: 2217 explicit BasicTTIImpl(const TargetMachine *TM, const Function &F); 2218 }; 2219 2220 } // end namespace llvm 2221 2222 #endif // LLVM_CODEGEN_BASICTTIIMPL_H 2223