//===- BasicTTIImpl.h -------------------------------------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file provides a helper that implements much of the TTI interface in
/// terms of the target-independent code generator and TargetLowering
/// interfaces.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_CODEGEN_BASICTTIIMPL_H
#define LLVM_CODEGEN_BASICTTIIMPL_H

#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/TargetTransformInfoImpl.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetMachine.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <limits>
#include <utility>

namespace llvm {

class Function;
class GlobalValue;
class LLVMContext;
class ScalarEvolution;
class SCEV;
class TargetMachine;

extern cl::opt<unsigned> PartialUnrollingThreshold;

/// Base class which can be used to help build a TTI implementation.
///
/// This class provides as much implementation of the TTI interface as is
/// possible using the target independent parts of the code generator.
///
/// In order to subclass it, your class must implement a getST() method to
/// return the subtarget, and a getTLI() method to return the target lowering.
/// We need these methods implemented in the derived class so that this class
/// doesn't have to duplicate storage for them.
template <typename T>
class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
private:
  using BaseT = TargetTransformInfoImplCRTPBase<T>;
  using TTI = TargetTransformInfo;

  /// Helper function to access this as a T.
  T *thisT() { return static_cast<T *>(this); }

  /// Estimate a cost of Broadcast as an extract and sequence of insert
  /// operations.
  InstructionCost getBroadcastShuffleOverhead(FixedVectorType *VTy) {
    InstructionCost Cost = 0;
    // Broadcast cost is equal to the cost of extracting the zero'th element
    // plus the cost of inserting it into every element of the result vector.
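    // For example, broadcasting a <4 x float> is costed below as one
    // extractelement plus four insertelements.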
    Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, 0);

    for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
      Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, i);
    }
    return Cost;
  }

  /// Estimate a cost of shuffle as a sequence of extract and insert
  /// operations.
  InstructionCost getPermuteShuffleOverhead(FixedVectorType *VTy) {
    InstructionCost Cost = 0;
    // Shuffle cost is equal to the cost of extracting elements from its
    // argument plus the cost of inserting them into the result vector.

    // e.g. <4 x float> has a mask of <0,5,2,7>, i.e. we need to extract from
    // index 0 of the first vector, index 1 of the second vector, index 2 of
    // the first vector and finally index 3 of the second vector, and insert
    // them at indices <0,1,2,3> of the result vector.
    for (int i = 0, e = VTy->getNumElements(); i < e; ++i) {
      Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, i);
      Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, i);
    }
    return Cost;
  }

  /// Estimate a cost of subvector extraction as a sequence of extract and
  /// insert operations.
  InstructionCost getExtractSubvectorOverhead(VectorType *VTy, int Index,
                                              FixedVectorType *SubVTy) {
    assert(VTy && SubVTy &&
           "Can only extract subvectors from vectors");
    int NumSubElts = SubVTy->getNumElements();
    assert((!isa<FixedVectorType>(VTy) ||
            (Index + NumSubElts) <=
                (int)cast<FixedVectorType>(VTy)->getNumElements()) &&
           "SK_ExtractSubvector index out of range");

    InstructionCost Cost = 0;
    // Subvector extraction cost is equal to the cost of extracting elements
    // from the source type plus the cost of inserting them into the result
    // vector type.
    for (int i = 0; i != NumSubElts; ++i) {
      Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy,
                                          i + Index);
      Cost +=
          thisT()->getVectorInstrCost(Instruction::InsertElement, SubVTy, i);
    }
    return Cost;
  }

  /// Estimate a cost of subvector insertion as a sequence of extract and
  /// insert operations.
  InstructionCost getInsertSubvectorOverhead(VectorType *VTy, int Index,
                                             FixedVectorType *SubVTy) {
    assert(VTy && SubVTy &&
           "Can only insert subvectors into vectors");
    int NumSubElts = SubVTy->getNumElements();
    assert((!isa<FixedVectorType>(VTy) ||
            (Index + NumSubElts) <=
                (int)cast<FixedVectorType>(VTy)->getNumElements()) &&
           "SK_InsertSubvector index out of range");

    InstructionCost Cost = 0;
    // Subvector insertion cost is equal to the cost of extracting elements
    // from the source type plus the cost of inserting them into the result
    // vector type.
    for (int i = 0; i != NumSubElts; ++i) {
      Cost +=
          thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVTy, i);
      Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy,
                                          i + Index);
    }
    return Cost;
  }

  /// Local query method delegates up to T which *must* implement this!
  const TargetSubtargetInfo *getST() const {
    return static_cast<const T *>(this)->getST();
  }
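  // getST() above and getTLI() below form the CRTP contract described in the
  // class comment. A minimal derived class might look like this (hypothetical
  // target, for illustration only):
  //
  //   class MyTTIImpl : public BasicTTIImplBase<MyTTIImpl> {
  //     const MySubtarget *ST;
  //     const MyTargetLowering *TLI;
  //   public:
  //     const TargetSubtargetInfo *getST() const { return ST; }
  //     const TargetLoweringBase *getTLI() const { return TLI; }
  //   };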
  /// Local query method delegates up to T which *must* implement this!
  const TargetLoweringBase *getTLI() const {
    return static_cast<const T *>(this)->getTLI();
  }

  static ISD::MemIndexedMode getISDIndexedMode(TTI::MemIndexedMode M) {
    switch (M) {
    case TTI::MIM_Unindexed:
      return ISD::UNINDEXED;
    case TTI::MIM_PreInc:
      return ISD::PRE_INC;
    case TTI::MIM_PreDec:
      return ISD::PRE_DEC;
    case TTI::MIM_PostInc:
      return ISD::POST_INC;
    case TTI::MIM_PostDec:
      return ISD::POST_DEC;
    }
    llvm_unreachable("Unexpected MemIndexedMode");
  }

  InstructionCost getCommonMaskedMemoryOpCost(unsigned Opcode, Type *DataTy,
                                              Align Alignment,
                                              bool VariableMask,
                                              bool IsGatherScatter,
                                              TTI::TargetCostKind CostKind) {
    auto *VT = cast<FixedVectorType>(DataTy);
    // Assume the target does not have support for gather/scatter operations
    // and provide a rough estimate.
    //
    // First, compute the cost of the individual memory operations.
    InstructionCost AddrExtractCost =
        IsGatherScatter
            ? getVectorInstrCost(Instruction::ExtractElement,
                                 FixedVectorType::get(
                                     PointerType::get(VT->getElementType(), 0),
                                     VT->getNumElements()),
                                 -1)
            : 0;
    InstructionCost LoadCost =
        VT->getNumElements() *
        (AddrExtractCost +
         getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind));

    // Next, compute the cost of packing the result in a vector.
    InstructionCost PackingCost = getScalarizationOverhead(
        VT, Opcode != Instruction::Store, Opcode == Instruction::Store);

    InstructionCost ConditionalCost = 0;
    if (VariableMask) {
      // Compute the cost of conditionally executing the memory operations with
      // variable masks. This includes extracting the individual conditions,
      // plus branches and PHIs to combine the results.
      // NOTE: Estimating the cost of conditionally executing the memory
      // operations accurately is quite difficult and the current solution
      // provides a very rough estimate only.
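      // The estimate below charges, per vector lane: one extract of the i1
      // mask element, one branch, and one PHI to merge the result.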
      ConditionalCost =
          VT->getNumElements() *
          (getVectorInstrCost(
               Instruction::ExtractElement,
               FixedVectorType::get(Type::getInt1Ty(DataTy->getContext()),
                                    VT->getNumElements()),
               -1) +
           getCFInstrCost(Instruction::Br, CostKind) +
           getCFInstrCost(Instruction::PHI, CostKind));
    }

    return LoadCost + PackingCost + ConditionalCost;
  }

protected:
  explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL)
      : BaseT(DL) {}
  virtual ~BasicTTIImplBase() = default;

  using TargetTransformInfoImplBase::DL;

public:
  /// \name Scalar TTI Implementations
  /// @{
  bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth,
                                      unsigned AddressSpace, Align Alignment,
                                      bool *Fast) const {
    EVT E = EVT::getIntegerVT(Context, BitWidth);
    return getTLI()->allowsMisalignedMemoryAccesses(
        E, AddressSpace, Alignment, MachineMemOperand::MONone, Fast);
  }

  bool hasBranchDivergence() { return false; }

  bool useGPUDivergenceAnalysis() { return false; }

  bool isSourceOfDivergence(const Value *V) { return false; }

  bool isAlwaysUniform(const Value *V) { return false; }

  unsigned getFlatAddressSpace() {
    // Return an invalid address space.
    return -1;
  }

  bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
                                  Intrinsic::ID IID) const {
    return false;
  }

  bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const {
    return getTLI()->getTargetMachine().isNoopAddrSpaceCast(FromAS, ToAS);
  }

  unsigned getAssumedAddrSpace(const Value *V) const {
    return getTLI()->getTargetMachine().getAssumedAddrSpace(V);
  }

  Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
                                          Value *NewV) const {
    return nullptr;
  }

  bool isLegalAddImmediate(int64_t imm) {
    return getTLI()->isLegalAddImmediate(imm);
  }

  bool isLegalICmpImmediate(int64_t imm) {
    return getTLI()->isLegalICmpImmediate(imm);
  }

  bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset,
                             bool HasBaseReg, int64_t Scale,
                             unsigned AddrSpace, Instruction *I = nullptr) {
    TargetLoweringBase::AddrMode AM;
    AM.BaseGV = BaseGV;
    AM.BaseOffs = BaseOffset;
    AM.HasBaseReg = HasBaseReg;
    AM.Scale = Scale;
    return getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace, I);
  }

  bool isIndexedLoadLegal(TTI::MemIndexedMode M, Type *Ty,
                          const DataLayout &DL) const {
    EVT VT = getTLI()->getValueType(DL, Ty);
    return getTLI()->isIndexedLoadLegal(getISDIndexedMode(M), VT);
  }

  bool isIndexedStoreLegal(TTI::MemIndexedMode M, Type *Ty,
                           const DataLayout &DL) const {
    EVT VT = getTLI()->getValueType(DL, Ty);
    return getTLI()->isIndexedStoreLegal(getISDIndexedMode(M), VT);
  }

  bool isLSRCostLess(TTI::LSRCost C1, TTI::LSRCost C2) {
    return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
  }

  bool isNumRegsMajorCostOfLSR() {
    return TargetTransformInfoImplBase::isNumRegsMajorCostOfLSR();
  }

  bool isProfitableLSRChainElement(Instruction *I) {
    return TargetTransformInfoImplBase::isProfitableLSRChainElement(I);
  }

  InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
                                       int64_t BaseOffset, bool HasBaseReg,
                                       int64_t Scale, unsigned AddrSpace) {
    TargetLoweringBase::AddrMode AM;
    AM.BaseGV = BaseGV;
    AM.BaseOffs = BaseOffset;
    AM.HasBaseReg = HasBaseReg;
    AM.Scale = Scale;
    return getTLI()->getScalingFactorCost(DL, AM, Ty, AddrSpace);
  }

  bool isTruncateFree(Type *Ty1, Type *Ty2) {
    return getTLI()->isTruncateFree(Ty1, Ty2);
  }

  bool isProfitableToHoist(Instruction *I) {
    return getTLI()->isProfitableToHoist(I);
  }

  bool useAA() const { return getST()->useAA(); }

  bool isTypeLegal(Type *Ty) {
    EVT VT = getTLI()->getValueType(DL, Ty);
    return getTLI()->isTypeLegal(VT);
  }

  InstructionCost getRegUsageForType(Type *Ty) {
    InstructionCost Val = getTLI()->getTypeLegalizationCost(DL, Ty).first;
    assert(Val >= 0 && "Negative cost!");
    return Val;
  }

  InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr,
                             ArrayRef<const Value *> Operands) {
    return BaseT::getGEPCost(PointeeType, Ptr, Operands);
  }

  unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI,
                                            unsigned &JumpTableSize,
                                            ProfileSummaryInfo *PSI,
                                            BlockFrequencyInfo *BFI) {
    /// Try to find the estimated number of clusters. Note that the number of
    /// clusters identified in this function could be different from the actual
    /// numbers found in lowering. This function ignores switches that are
    /// lowered with a mix of jump table / bit test / BTree. This function was
    /// initially intended to be used when estimating the cost of a switch in
    /// the inline cost heuristic, but it's a generic cost model to be used in
    /// other places (e.g., in loop unrolling).
    unsigned N = SI.getNumCases();
    const TargetLoweringBase *TLI = getTLI();
    const DataLayout &DL = this->getDataLayout();

    JumpTableSize = 0;
    bool IsJTAllowed = TLI->areJTsAllowed(SI.getParent()->getParent());

    // Early exit if both a jump table and bit test are not allowed.
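    // (If jump tables are disallowed and there are more cases than bits in
    // the index type, a single bit test cannot cover them either, so assume
    // one cluster per case.)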
    if (N < 1 || (!IsJTAllowed && DL.getIndexSizeInBits(0u) < N))
      return N;

    APInt MaxCaseVal = SI.case_begin()->getCaseValue()->getValue();
    APInt MinCaseVal = MaxCaseVal;
    for (auto CI : SI.cases()) {
      const APInt &CaseVal = CI.getCaseValue()->getValue();
      if (CaseVal.sgt(MaxCaseVal))
        MaxCaseVal = CaseVal;
      if (CaseVal.slt(MinCaseVal))
        MinCaseVal = CaseVal;
    }

    // Check if suitable for a bit test
    if (N <= DL.getIndexSizeInBits(0u)) {
      SmallPtrSet<const BasicBlock *, 4> Dests;
      for (auto I : SI.cases())
        Dests.insert(I.getCaseSuccessor());

      if (TLI->isSuitableForBitTests(Dests.size(), N, MinCaseVal, MaxCaseVal,
                                     DL))
        return 1;
    }

    // Check if suitable for a jump table.
    if (IsJTAllowed) {
      if (N < 2 || N < TLI->getMinimumJumpTableEntries())
        return N;
      uint64_t Range =
          (MaxCaseVal - MinCaseVal)
              .getLimitedValue(std::numeric_limits<uint64_t>::max() - 1) +
          1;
      // Check whether a range of clusters is dense enough for a jump table
      if (TLI->isSuitableForJumpTable(&SI, N, Range, PSI, BFI)) {
        JumpTableSize = Range;
        return 1;
      }
    }
    return N;
  }

  bool shouldBuildLookupTables() {
    const TargetLoweringBase *TLI = getTLI();
    return TLI->isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) ||
           TLI->isOperationLegalOrCustom(ISD::BRIND, MVT::Other);
  }

  bool shouldBuildRelLookupTables() const {
    const TargetMachine &TM = getTLI()->getTargetMachine();
    // If non-PIC mode, do not generate a relative lookup table.
    if (!TM.isPositionIndependent())
      return false;

    /// Relative lookup table entries consist of 32-bit offsets.
    /// Do not generate relative lookup tables for large code models
    /// in 64-bit architectures where 32-bit offsets might not be enough.
    if (TM.getCodeModel() == CodeModel::Medium ||
        TM.getCodeModel() == CodeModel::Large)
      return false;

    Triple TargetTriple = TM.getTargetTriple();
    if (!TargetTriple.isArch64Bit())
      return false;

    // TODO: Triggers issues on aarch64 on darwin, so temporarily disable it
    // there.
    if (TargetTriple.getArch() == Triple::aarch64 && TargetTriple.isOSDarwin())
      return false;

    return true;
  }

  bool haveFastSqrt(Type *Ty) {
    const TargetLoweringBase *TLI = getTLI();
    EVT VT = TLI->getValueType(DL, Ty);
    return TLI->isTypeLegal(VT) &&
           TLI->isOperationLegalOrCustom(ISD::FSQRT, VT);
  }

  bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
    return true;
  }

  InstructionCost getFPOpCost(Type *Ty) {
    // Check whether FADD is available, as a proxy for floating-point in
    // general.
    const TargetLoweringBase *TLI = getTLI();
    EVT VT = TLI->getValueType(DL, Ty);
    if (TLI->isOperationLegalOrCustomOrPromote(ISD::FADD, VT))
      return TargetTransformInfo::TCC_Basic;
    return TargetTransformInfo::TCC_Expensive;
  }

  unsigned getInliningThresholdMultiplier() { return 1; }
  unsigned adjustInliningThreshold(const CallBase *CB) { return 0; }

  int getInlinerVectorBonusPercent() { return 150; }

  void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                               TTI::UnrollingPreferences &UP) {
    // This unrolling functionality is target independent, but to provide some
    // motivation for its intended use, for x86:

    // According to the Intel 64 and IA-32 Architectures Optimization Reference
    // Manual, Intel Core models and later have a loop stream detector (and
    // associated uop queue) that can benefit from partial unrolling.
    // The relevant requirements are:
    //  - The loop must have no more than 4 (8 for Nehalem and later) branches
    //    taken, and none of them may be calls.
    //  - The loop can have no more than 18 (28 for Nehalem and later) uops.

    // According to the Software Optimization Guide for AMD Family 15h
    // Processors, models 30h-4fh (Steamroller and later) have a loop predictor
    // and loop buffer which can benefit from partial unrolling.
    // The relevant requirements are:
    //  - The loop must have fewer than 16 branches
    //  - The loop must have less than 40 uops in all executed loop branches

    // The number of taken branches in a loop is hard to estimate here, and
    // benchmarking has revealed that it is better not to be conservative when
    // estimating the branch count. As a result, we'll ignore the branch limits
    // until someone finds a case where it matters in practice.

    unsigned MaxOps;
    const TargetSubtargetInfo *ST = getST();
    if (PartialUnrollingThreshold.getNumOccurrences() > 0)
      MaxOps = PartialUnrollingThreshold;
    else if (ST->getSchedModel().LoopMicroOpBufferSize > 0)
      MaxOps = ST->getSchedModel().LoopMicroOpBufferSize;
    else
      return;

    // Scan the loop: don't unroll loops with calls.
    for (BasicBlock *BB : L->blocks()) {
      for (Instruction &I : *BB) {
        if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
          if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
            if (!thisT()->isLoweredToCall(F))
              continue;
          }

          return;
        }
      }
    }

    // Enable runtime and partial unrolling up to the specified size.
    // Enable using trip count upper bound to unroll loops.
    UP.Partial = UP.Runtime = UP.UpperBound = true;
    UP.PartialThreshold = MaxOps;

    // Avoid unrolling when optimizing for size.
    UP.OptSizeThreshold = 0;
    UP.PartialOptSizeThreshold = 0;

    // Set number of instructions optimized when "back edge"
    // becomes "fall through" to default value of 2.
    UP.BEInsns = 2;
  }

  void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                             TTI::PeelingPreferences &PP) {
    PP.PeelCount = 0;
    PP.AllowPeeling = true;
    PP.AllowLoopNestsPeeling = false;
    PP.PeelProfiledIterations = true;
  }

  bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
                                AssumptionCache &AC,
                                TargetLibraryInfo *LibInfo,
                                HardwareLoopInfo &HWLoopInfo) {
    return BaseT::isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
  }

  bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
                                   AssumptionCache &AC, TargetLibraryInfo *TLI,
                                   DominatorTree *DT,
                                   const LoopAccessInfo *LAI) {
    return BaseT::preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LAI);
  }

  bool emitGetActiveLaneMask() {
    return BaseT::emitGetActiveLaneMask();
  }

  Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
                                               IntrinsicInst &II) {
    return BaseT::instCombineIntrinsic(IC, II);
  }

  Optional<Value *> simplifyDemandedUseBitsIntrinsic(InstCombiner &IC,
                                                     IntrinsicInst &II,
                                                     APInt DemandedMask,
                                                     KnownBits &Known,
                                                     bool &KnownBitsComputed) {
    return BaseT::simplifyDemandedUseBitsIntrinsic(IC, II, DemandedMask, Known,
                                                   KnownBitsComputed);
  }

  Optional<Value *> simplifyDemandedVectorEltsIntrinsic(
      InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
      APInt &UndefElts2, APInt &UndefElts3,
      std::function<void(Instruction *, unsigned, APInt, APInt &)>
          SimplifyAndSetOp) {
    return BaseT::simplifyDemandedVectorEltsIntrinsic(
        IC, II, DemandedElts, UndefElts, UndefElts2, UndefElts3,
        SimplifyAndSetOp);
  }

  InstructionCost getInstructionLatency(const Instruction *I) {
    if (isa<LoadInst>(I))
      return getST()->getSchedModel().DefaultLoadLatency;

    return BaseT::getInstructionLatency(I);
  }

  virtual Optional<unsigned>
  getCacheSize(TargetTransformInfo::CacheLevel Level) const {
    return Optional<unsigned>(
        getST()->getCacheSize(static_cast<unsigned>(Level)));
  }

  virtual Optional<unsigned>
  getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const {
    Optional<unsigned> TargetResult =
        getST()->getCacheAssociativity(static_cast<unsigned>(Level));

    if (TargetResult)
      return TargetResult;

    return BaseT::getCacheAssociativity(Level);
  }

  virtual unsigned getCacheLineSize() const {
    return getST()->getCacheLineSize();
  }
  virtual unsigned getPrefetchDistance() const {
    return getST()->getPrefetchDistance();
  }

  virtual unsigned getMinPrefetchStride(unsigned NumMemAccesses,
                                        unsigned NumStridedMemAccesses,
                                        unsigned NumPrefetches,
                                        bool HasCall) const {
    return getST()->getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses,
                                         NumPrefetches, HasCall);
  }

  virtual unsigned getMaxPrefetchIterationsAhead() const {
    return getST()->getMaxPrefetchIterationsAhead();
  }

  virtual bool enableWritePrefetching() const {
    return getST()->enableWritePrefetching();
  }

  /// @}

  /// \name Vector TTI Implementations
  /// @{

  TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
    return TypeSize::getFixed(32);
  }

  Optional<unsigned> getMaxVScale() const { return None; }

  /// Estimate the overhead of scalarizing an instruction. Insert and Extract
  /// are set if the demanded result elements need to be inserted and/or
  /// extracted from vectors.
  InstructionCost getScalarizationOverhead(VectorType *InTy,
                                           const APInt &DemandedElts,
                                           bool Insert, bool Extract) {
    /// FIXME: a bitfield is not a reasonable abstraction for talking about
    /// which elements are needed from a scalable vector
    auto *Ty = cast<FixedVectorType>(InTy);

    assert(DemandedElts.getBitWidth() == Ty->getNumElements() &&
           "Vector size mismatch");

    InstructionCost Cost = 0;

    for (int i = 0, e = Ty->getNumElements(); i < e; ++i) {
      if (!DemandedElts[i])
        continue;
      if (Insert)
        Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, Ty, i);
      if (Extract)
        Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, i);
    }

    return Cost;
  }

  /// Helper wrapper for the DemandedElts variant of getScalarizationOverhead.
  InstructionCost getScalarizationOverhead(VectorType *InTy, bool Insert,
                                           bool Extract) {
    auto *Ty = cast<FixedVectorType>(InTy);

    APInt DemandedElts = APInt::getAllOnesValue(Ty->getNumElements());
    return thisT()->getScalarizationOverhead(Ty, DemandedElts, Insert, Extract);
  }

  /// Estimate the overhead of scalarizing an instruction's unique
  /// non-constant operands. The (potentially vector) types to use for each
  /// argument are passed via Tys.
  InstructionCost getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
                                                   ArrayRef<Type *> Tys) {
    assert(Args.size() == Tys.size() && "Expected matching Args and Tys");

    InstructionCost Cost = 0;
    SmallPtrSet<const Value *, 4> UniqueOperands;
    for (int I = 0, E = Args.size(); I != E; I++) {
      // Disregard things like metadata arguments.
      const Value *A = Args[I];
      Type *Ty = Tys[I];
      if (!Ty->isIntOrIntVectorTy() && !Ty->isFPOrFPVectorTy() &&
          !Ty->isPtrOrPtrVectorTy())
        continue;

      if (!isa<Constant>(A) && UniqueOperands.insert(A).second) {
        if (auto *VecTy = dyn_cast<VectorType>(Ty))
          Cost += getScalarizationOverhead(VecTy, false, true);
      }
    }

    return Cost;
  }

  /// Estimate the overhead of scalarizing the inputs and outputs of an
  /// instruction, with return type RetTy and arguments Args of type Tys. If
  /// Args are unknown (empty), then the cost associated with one argument is
  /// added as a heuristic.
  InstructionCost getScalarizationOverhead(VectorType *RetTy,
                                           ArrayRef<const Value *> Args,
                                           ArrayRef<Type *> Tys) {
    InstructionCost Cost = getScalarizationOverhead(RetTy, true, false);
    if (!Args.empty())
      Cost += getOperandsScalarizationOverhead(Args, Tys);
    else
      // When no information on arguments is provided, we add the cost
      // associated with one argument as a heuristic.
      Cost += getScalarizationOverhead(RetTy, false, true);

    return Cost;
  }

  unsigned getMaxInterleaveFactor(unsigned VF) { return 1; }

  InstructionCost getArithmeticInstrCost(
      unsigned Opcode, Type *Ty,
      TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
      TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
      TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
      TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
      TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
      ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
      const Instruction *CxtI = nullptr) {
    // Check if any of the operands are vector operands.
    const TargetLoweringBase *TLI = getTLI();
    int ISD = TLI->InstructionOpcodeToISD(Opcode);
    assert(ISD && "Invalid opcode");

    // TODO: Handle more cost kinds.
    if (CostKind != TTI::TCK_RecipThroughput)
      return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind,
                                           Opd1Info, Opd2Info,
                                           Opd1PropInfo, Opd2PropInfo,
                                           Args, CxtI);

    std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);

    bool IsFloat = Ty->isFPOrFPVectorTy();
    // Assume that floating point arithmetic operations cost twice as much as
    // integer operations.
    InstructionCost OpCost = (IsFloat ? 2 : 1);

    if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
      // The operation is legal. Assume it costs 1.
      // TODO: Once we have extract/insert subvector cost we need to use them.
      return LT.first * OpCost;
    }

    if (!TLI->isOperationExpand(ISD, LT.second)) {
      // If the operation is custom lowered, then assume that the code is twice
      // as expensive.
      return LT.first * 2 * OpCost;
    }

    // An 'Expand' of URem and SRem is special because it may default
    // to expanding the operation into a sequence of sub-operations
    // i.e. X % Y -> X-(X/Y)*Y.
    if (ISD == ISD::UREM || ISD == ISD::SREM) {
      bool IsSigned = ISD == ISD::SREM;
      if (TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIVREM : ISD::UDIVREM,
                                        LT.second) ||
          TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIV : ISD::UDIV,
                                        LT.second)) {
        unsigned DivOpc = IsSigned ? Instruction::SDiv : Instruction::UDiv;
        InstructionCost DivCost = thisT()->getArithmeticInstrCost(
            DivOpc, Ty, CostKind, Opd1Info, Opd2Info, Opd1PropInfo,
            Opd2PropInfo);
        InstructionCost MulCost =
            thisT()->getArithmeticInstrCost(Instruction::Mul, Ty, CostKind);
        InstructionCost SubCost =
            thisT()->getArithmeticInstrCost(Instruction::Sub, Ty, CostKind);
        return DivCost + MulCost + SubCost;
      }
    }

    // We cannot scalarize scalable vectors, so return Invalid.
    if (isa<ScalableVectorType>(Ty))
      return InstructionCost::getInvalid();

    // Else, assume that we need to scalarize this op.
    // TODO: If one of the types get legalized by splitting, handle this
    // similarly to what getCastInstrCost() does.
    if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
      InstructionCost Cost = thisT()->getArithmeticInstrCost(
          Opcode, VTy->getScalarType(), CostKind, Opd1Info, Opd2Info,
          Opd1PropInfo, Opd2PropInfo, Args, CxtI);
      // Return the cost of multiple scalar invocations plus the cost of
      // inserting and extracting the values.
      SmallVector<Type *> Tys(Args.size(), Ty);
      return getScalarizationOverhead(VTy, Args, Tys) +
             VTy->getNumElements() * Cost;
    }

    // We don't know anything about this scalar instruction.
    return OpCost;
  }

  TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind,
                                              ArrayRef<int> Mask) const {
    int Limit = Mask.size() * 2;
    if (Mask.empty() ||
        // Extra check required by isSingleSourceMaskImpl function (called by
        // ShuffleVectorInst::isSingleSourceMask).
        any_of(Mask, [Limit](int I) { return I >= Limit; }))
      return Kind;
    switch (Kind) {
    case TTI::SK_PermuteSingleSrc:
      if (ShuffleVectorInst::isReverseMask(Mask))
        return TTI::SK_Reverse;
      if (ShuffleVectorInst::isZeroEltSplatMask(Mask))
        return TTI::SK_Broadcast;
      break;
    case TTI::SK_PermuteTwoSrc:
      if (ShuffleVectorInst::isSelectMask(Mask))
        return TTI::SK_Select;
      if (ShuffleVectorInst::isTransposeMask(Mask))
        return TTI::SK_Transpose;
      break;
    case TTI::SK_Select:
    case TTI::SK_Reverse:
    case TTI::SK_Broadcast:
    case TTI::SK_Transpose:
    case TTI::SK_InsertSubvector:
    case TTI::SK_ExtractSubvector:
    case TTI::SK_Splice:
      break;
    }
    return Kind;
  }

  InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
                                 ArrayRef<int> Mask, int Index,
                                 VectorType *SubTp) {

    switch (improveShuffleKindFromMask(Kind, Mask)) {
    case TTI::SK_Broadcast:
      return getBroadcastShuffleOverhead(cast<FixedVectorType>(Tp));
    case TTI::SK_Select:
    case TTI::SK_Splice:
    case TTI::SK_Reverse:
    case TTI::SK_Transpose:
    case TTI::SK_PermuteSingleSrc:
    case TTI::SK_PermuteTwoSrc:
      return getPermuteShuffleOverhead(cast<FixedVectorType>(Tp));
    case TTI::SK_ExtractSubvector:
      return getExtractSubvectorOverhead(Tp, Index,
                                         cast<FixedVectorType>(SubTp));
    case TTI::SK_InsertSubvector:
      return getInsertSubvectorOverhead(Tp, Index,
                                        cast<FixedVectorType>(SubTp));
    }
    llvm_unreachable("Unknown TTI::ShuffleKind");
  }

  InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
                                   TTI::CastContextHint CCH,
                                   TTI::TargetCostKind CostKind,
                                   const Instruction *I = nullptr) {
    if (BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I) == 0)
      return 0;

    const TargetLoweringBase *TLI = getTLI();
    int ISD = TLI->InstructionOpcodeToISD(Opcode);
    assert(ISD && "Invalid opcode");
    std::pair<InstructionCost, MVT> SrcLT =
        TLI->getTypeLegalizationCost(DL, Src);
    std::pair<InstructionCost, MVT> DstLT =
        TLI->getTypeLegalizationCost(DL, Dst);

    TypeSize SrcSize = SrcLT.second.getSizeInBits();
    TypeSize DstSize = DstLT.second.getSizeInBits();
    bool IntOrPtrSrc = Src->isIntegerTy() || Src->isPointerTy();
    bool IntOrPtrDst = Dst->isIntegerTy() || Dst->isPointerTy();

    switch (Opcode) {
    default:
      break;
    case Instruction::Trunc:
      // Check for NOOP conversions.
      if (TLI->isTruncateFree(SrcLT.second, DstLT.second))
        return 0;
      LLVM_FALLTHROUGH;
    case Instruction::BitCast:
      // Bitcast between types that are legalized to the same type are free and
      // assume int to/from ptr of the same size is also free.
      if (SrcLT.first == DstLT.first && IntOrPtrSrc == IntOrPtrDst &&
          SrcSize == DstSize)
        return 0;
      break;
    case Instruction::FPExt:
      if (I && getTLI()->isExtFree(I))
        return 0;
      break;
    case Instruction::ZExt:
      if (TLI->isZExtFree(SrcLT.second, DstLT.second))
        return 0;
      LLVM_FALLTHROUGH;
    case Instruction::SExt:
      if (I && getTLI()->isExtFree(I))
        return 0;

      // If this is a zext/sext of a load, return 0 if the corresponding
      // extending load exists on target and the result type is legal.
      if (CCH == TTI::CastContextHint::Normal) {
        EVT ExtVT = EVT::getEVT(Dst);
        EVT LoadVT = EVT::getEVT(Src);
        unsigned LType =
            ((Opcode == Instruction::ZExt) ? ISD::ZEXTLOAD : ISD::SEXTLOAD);
        if (DstLT.first == SrcLT.first &&
            TLI->isLoadExtLegal(LType, ExtVT, LoadVT))
          return 0;
      }
      break;
    case Instruction::AddrSpaceCast:
      if (TLI->isFreeAddrSpaceCast(Src->getPointerAddressSpace(),
                                   Dst->getPointerAddressSpace()))
        return 0;
      break;
    }

    auto *SrcVTy = dyn_cast<VectorType>(Src);
    auto *DstVTy = dyn_cast<VectorType>(Dst);

    // If the cast is marked as legal (or promote) then assume low cost.
    if (SrcLT.first == DstLT.first &&
        TLI->isOperationLegalOrPromote(ISD, DstLT.second))
      return SrcLT.first;

    // Handle scalar conversions.
    if (!SrcVTy && !DstVTy) {
      // Just check the op cost. If the operation is legal then assume it
      // costs 1.
      if (!TLI->isOperationExpand(ISD, DstLT.second))
        return 1;

      // Assume that illegal scalar instructions are expensive.
      return 4;
    }

    // Check vector-to-vector casts.
    if (DstVTy && SrcVTy) {
      // If the cast is between same-sized registers, then the check is simple.
      if (SrcLT.first == DstLT.first && SrcSize == DstSize) {

        // Assume that Zext is done using AND.
        if (Opcode == Instruction::ZExt)
          return SrcLT.first;

        // Assume that sext is done using SHL and SRA.
        if (Opcode == Instruction::SExt)
          return SrcLT.first * 2;

        // Just check the op cost. If the operation is legal then assume it
        // costs 1 and multiply by the type-legalization overhead.
        if (!TLI->isOperationExpand(ISD, DstLT.second))
          return SrcLT.first * 1;
      }

      // If we are legalizing by splitting, query the concrete TTI for the cost
      // of casting the original vector twice. We also need to factor in the
      // cost of the split itself.
      // Count that as 1, to be consistent with
      // TLI->getTypeLegalizationCost().
      bool SplitSrc =
          TLI->getTypeAction(Src->getContext(), TLI->getValueType(DL, Src)) ==
          TargetLowering::TypeSplitVector;
      bool SplitDst =
          TLI->getTypeAction(Dst->getContext(), TLI->getValueType(DL, Dst)) ==
          TargetLowering::TypeSplitVector;
      if ((SplitSrc || SplitDst) && SrcVTy->getElementCount().isVector() &&
          DstVTy->getElementCount().isVector()) {
        Type *SplitDstTy = VectorType::getHalfElementsVectorType(DstVTy);
        Type *SplitSrcTy = VectorType::getHalfElementsVectorType(SrcVTy);
        T *TTI = static_cast<T *>(this);
        // If both types need to be split then the split is free.
        InstructionCost SplitCost =
            (!SplitSrc || !SplitDst) ? TTI->getVectorSplitCost() : 0;
        return SplitCost +
               (2 * TTI->getCastInstrCost(Opcode, SplitDstTy, SplitSrcTy, CCH,
                                          CostKind, I));
      }

      // Scalarization cost is Invalid, can't assume any num elements.
      if (isa<ScalableVectorType>(DstVTy))
        return InstructionCost::getInvalid();

      // In other cases where the source or destination are illegal, assume
      // the operation will get scalarized.
      unsigned Num = cast<FixedVectorType>(DstVTy)->getNumElements();
      InstructionCost Cost = thisT()->getCastInstrCost(
          Opcode, Dst->getScalarType(), Src->getScalarType(), CCH, CostKind, I);

      // Return the cost of multiple scalar invocations plus the cost of
      // inserting and extracting the values.
      return getScalarizationOverhead(DstVTy, true, true) + Num * Cost;
    }

    // We already handled vector-to-vector and scalar-to-scalar conversions.
    // This is where we handle bitcast between vectors and scalars. We need to
    // assume that the conversion is scalarized in one way or another.
    if (Opcode == Instruction::BitCast) {
      // Illegal bitcasts are done by storing and loading from a stack slot.
      return (SrcVTy ? getScalarizationOverhead(SrcVTy, false, true) : 0) +
             (DstVTy ? getScalarizationOverhead(DstVTy, true, false) : 0);
    }

    llvm_unreachable("Unhandled cast");
  }

  InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst,
                                           VectorType *VecTy, unsigned Index) {
    return thisT()->getVectorInstrCost(Instruction::ExtractElement, VecTy,
                                       Index) +
           thisT()->getCastInstrCost(Opcode, Dst, VecTy->getElementType(),
                                     TTI::CastContextHint::None,
                                     TTI::TCK_RecipThroughput);
  }

  InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
                                 const Instruction *I = nullptr) {
    return BaseT::getCFInstrCost(Opcode, CostKind, I);
  }

  InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
                                     CmpInst::Predicate VecPred,
                                     TTI::TargetCostKind CostKind,
                                     const Instruction *I = nullptr) {
    const TargetLoweringBase *TLI = getTLI();
    int ISD = TLI->InstructionOpcodeToISD(Opcode);
    assert(ISD && "Invalid opcode");

    // TODO: Handle other cost kinds.
    if (CostKind != TTI::TCK_RecipThroughput)
      return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
                                       I);

    // Selects on vectors are actually vector selects.
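    // (ISD::SELECT takes a single scalar condition; ISD::VSELECT takes a
    // vector condition and selects lane-wise.)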
    if (ISD == ISD::SELECT) {
      assert(CondTy && "CondTy must exist");
      if (CondTy->isVectorTy())
        ISD = ISD::VSELECT;
    }
    std::pair<InstructionCost, MVT> LT =
        TLI->getTypeLegalizationCost(DL, ValTy);

    if (!(ValTy->isVectorTy() && !LT.second.isVector()) &&
        !TLI->isOperationExpand(ISD, LT.second)) {
      // The operation is legal. Assume it costs 1. Multiply
      // by the type-legalization overhead.
      return LT.first * 1;
    }

    // Otherwise, assume that the cast is scalarized.
    // TODO: If one of the types get legalized by splitting, handle this
    // similarly to what getCastInstrCost() does.
    if (auto *ValVTy = dyn_cast<VectorType>(ValTy)) {
      unsigned Num = cast<FixedVectorType>(ValVTy)->getNumElements();
      if (CondTy)
        CondTy = CondTy->getScalarType();
      InstructionCost Cost = thisT()->getCmpSelInstrCost(
          Opcode, ValVTy->getScalarType(), CondTy, VecPred, CostKind, I);

      // Return the cost of multiple scalar invocations plus the cost of
      // inserting and extracting the values.
      return getScalarizationOverhead(ValVTy, true, false) + Num * Cost;
    }

    // Unknown scalar opcode.
    return 1;
  }

  InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val,
                                     unsigned Index) {
    std::pair<InstructionCost, MVT> LT =
        getTLI()->getTypeLegalizationCost(DL, Val->getScalarType());

    return LT.first;
  }

  InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src,
                                  MaybeAlign Alignment, unsigned AddressSpace,
                                  TTI::TargetCostKind CostKind,
                                  const Instruction *I = nullptr) {
    assert(!Src->isVoidTy() && "Invalid type");
    // Assume types, such as structs, are expensive.
    if (getTLI()->getValueType(DL, Src, true) == MVT::Other)
      return 4;
    std::pair<InstructionCost, MVT> LT =
        getTLI()->getTypeLegalizationCost(DL, Src);

    // Assuming that all loads of legal types cost 1.
    InstructionCost Cost = LT.first;
    if (CostKind != TTI::TCK_RecipThroughput)
      return Cost;

    if (Src->isVectorTy() &&
        // In practice it's not currently possible to have a change in lane
        // length for extending loads or truncating stores so both types should
        // have the same scalable property.
        TypeSize::isKnownLT(Src->getPrimitiveSizeInBits(),
                            LT.second.getSizeInBits())) {
      // This is a vector load that legalizes to a larger type than the vector
      // itself. Unless the corresponding extending load or truncating store is
      // legal, then this will scalarize.
      TargetLowering::LegalizeAction LA = TargetLowering::Expand;
      EVT MemVT = getTLI()->getValueType(DL, Src);
      if (Opcode == Instruction::Store)
        LA = getTLI()->getTruncStoreAction(LT.second, MemVT);
      else
        LA = getTLI()->getLoadExtAction(ISD::EXTLOAD, LT.second, MemVT);

      if (LA != TargetLowering::Legal && LA != TargetLowering::Custom) {
        // This is a vector load/store for some illegal type that is
        // scalarized. We must account for the cost of building or decomposing
        // the vector.
        Cost += getScalarizationOverhead(cast<VectorType>(Src),
                                         Opcode != Instruction::Store,
                                         Opcode == Instruction::Store);
      }
    }

    return Cost;
  }

  InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy,
                                        Align Alignment, unsigned AddressSpace,
                                        TTI::TargetCostKind CostKind) {
    return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, true, false,
                                       CostKind);
  }

  InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
                                         const Value *Ptr, bool VariableMask,
                                         Align Alignment,
                                         TTI::TargetCostKind CostKind,
                                         const Instruction *I = nullptr) {
    return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, VariableMask,
                                       true, CostKind);
  }

  InstructionCost getInterleavedMemoryOpCost(
      unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
      Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
      bool UseMaskForCond = false, bool UseMaskForGaps = false) {
    auto *VT = cast<FixedVectorType>(VecTy);

    unsigned NumElts = VT->getNumElements();
    assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");

    unsigned NumSubElts = NumElts / Factor;
    auto *SubVT = FixedVectorType::get(VT->getElementType(), NumSubElts);

    // Firstly, the cost of load/store operation.
    InstructionCost Cost;
    if (UseMaskForCond || UseMaskForGaps)
      Cost = thisT()->getMaskedMemoryOpCost(Opcode, VecTy, Alignment,
                                            AddressSpace, CostKind);
    else
      Cost = thisT()->getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace,
                                      CostKind);

    // Legalize the vector type, and get the legalized and unlegalized type
    // sizes.
    MVT VecTyLT = getTLI()->getTypeLegalizationCost(DL, VecTy).second;
    unsigned VecTySize = thisT()->getDataLayout().getTypeStoreSize(VecTy);
    unsigned VecTyLTSize = VecTyLT.getStoreSize();

    // Scale the cost of the memory operation by the fraction of legalized
    // instructions that will actually be used. We shouldn't account for the
    // cost of dead instructions since they will be removed.
    //
    // E.g., An interleaved load of factor 8:
    //       %vec = load <16 x i64>, <16 x i64>* %ptr
    //       %v0 = shufflevector %vec, undef, <0, 8>
    //
    // If <16 x i64> is legalized to 8 v2i64 loads, only 2 of the loads will be
    // used (those corresponding to elements [0:1] and [8:9] of the unlegalized
    // type). The other loads are unused.
    //
    // We only scale the cost of loads since interleaved store groups aren't
    // allowed to have gaps.
    if (Opcode == Instruction::Load && VecTySize > VecTyLTSize) {
      // The number of loads of a legal type it will take to represent a load
      // of the unlegalized vector type.
      unsigned NumLegalInsts = divideCeil(VecTySize, VecTyLTSize);

      // The number of elements of the unlegalized type that correspond to a
      // single legal instruction.
      unsigned NumEltsPerLegalInst = divideCeil(NumElts, NumLegalInsts);

      // Determine which legal instructions will be used.
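      // Continuing the factor-8 example above: NumLegalInsts = 8,
      // NumEltsPerLegalInst = 2, and index 0 marks legal loads 0 and 4 as
      // used, so the memory cost is scaled by 2/8.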
      BitVector UsedInsts(NumLegalInsts, false);
      for (unsigned Index : Indices)
        for (unsigned Elt = 0; Elt < NumSubElts; ++Elt)
          UsedInsts.set((Index + Elt * Factor) / NumEltsPerLegalInst);

      // Scale the cost of the load by the fraction of legal instructions that
      // will be used. Multiply before dividing so integer division does not
      // truncate the fraction to zero.
      Cost = Cost * UsedInsts.count() / NumLegalInsts;
    }

    // Then plus the cost of interleave operation.
    if (Opcode == Instruction::Load) {
      // The interleave cost is similar to extracting sub vectors' elements
      // from the wide vector, and inserting them into sub vectors.
      //
      // E.g. An interleaved load of factor 2 (with one member of index 0):
      //      %vec = load <8 x i32>, <8 x i32>* %ptr
      //      %v0 = shuffle %vec, undef, <0, 2, 4, 6>   ; Index 0
      // The cost is estimated as extract elements at 0, 2, 4, 6 from the
      // <8 x i32> vector and insert them into a <4 x i32> vector.

      assert(Indices.size() <= Factor &&
             "Interleaved memory op has too many members");

      for (unsigned Index : Indices) {
        assert(Index < Factor && "Invalid index for interleaved memory op");

        // Extract elements from loaded vector for each sub vector.
        for (unsigned i = 0; i < NumSubElts; i++)
          Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VT,
                                              Index + i * Factor);
      }

      InstructionCost InsSubCost = 0;
      for (unsigned i = 0; i < NumSubElts; i++)
        InsSubCost +=
            thisT()->getVectorInstrCost(Instruction::InsertElement, SubVT, i);

      Cost += Indices.size() * InsSubCost;
    } else {
      // The interleave cost is extract all elements from sub vectors, and
      // insert them into the wide vector.
      //
      // E.g. An interleaved store of factor 2:
      //      %v0_v1 = shuffle %v0, %v1, <0, 4, 1, 5, 2, 6, 3, 7>
      //      store <8 x i32> %interleaved.vec, <8 x i32>* %ptr
      // The cost is estimated as extract all elements from both <4 x i32>
      // vectors and insert into the <8 x i32> vector.

      InstructionCost ExtSubCost = 0;
      for (unsigned i = 0; i < NumSubElts; i++)
        ExtSubCost +=
            thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVT, i);
      Cost += ExtSubCost * Factor;

      for (unsigned i = 0; i < NumElts; i++)
        Cost += static_cast<T *>(this)
                    ->getVectorInstrCost(Instruction::InsertElement, VT, i);
    }

    if (!UseMaskForCond)
      return Cost;

    Type *I8Type = Type::getInt8Ty(VT->getContext());
    auto *MaskVT = FixedVectorType::get(I8Type, NumElts);
    SubVT = FixedVectorType::get(I8Type, NumSubElts);

    // The Mask shuffling cost is extract all the elements of the Mask
    // and insert each of them Factor times into the wide vector:
    //
    // E.g. an interleaved group with factor 3:
    //    %mask = icmp ult <8 x i32> %vec1, %vec2
    //    %interleaved.mask = shufflevector <8 x i1> %mask, <8 x i1> undef,
    //      <24 x i32> <0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7>
    // The cost is estimated as extract all mask elements from the <8xi1> mask
    // vector and insert them factor times into the <24xi1> shuffled mask
    // vector.
    for (unsigned i = 0; i < NumSubElts; i++)
      Cost +=
          thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVT, i);

    for (unsigned i = 0; i < NumElts; i++)
      Cost +=
          thisT()->getVectorInstrCost(Instruction::InsertElement, MaskVT, i);

    // The Gaps mask is invariant and created outside the loop, therefore the
    // cost of creating it is not accounted for here. However, if we have both
    // a MaskForGaps and some other mask that guards the execution of the
    // memory access, we need to account for the cost of And-ing the two masks
    // inside the loop.
    if (UseMaskForGaps)
      Cost += thisT()->getArithmeticInstrCost(BinaryOperator::And, MaskVT,
                                              CostKind);

    return Cost;
  }

  /// Get intrinsic cost based on arguments.
  InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                        TTI::TargetCostKind CostKind) {
    // Check for generically free intrinsics.
    if (BaseT::getIntrinsicInstrCost(ICA, CostKind) == 0)
      return 0;

    // Assume that target intrinsics are cheap.
    Intrinsic::ID IID = ICA.getID();
    if (Function::isTargetIntrinsic(IID))
      return TargetTransformInfo::TCC_Basic;

    if (ICA.isTypeBasedOnly())
      return getTypeBasedIntrinsicInstrCost(ICA, CostKind);

    Type *RetTy = ICA.getReturnType();

    ElementCount RetVF =
        (RetTy->isVectorTy() ? cast<VectorType>(RetTy)->getElementCount()
                             : ElementCount::getFixed(1));
    const IntrinsicInst *I = ICA.getInst();
    const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
    FastMathFlags FMF = ICA.getFlags();
    switch (IID) {
    default:
      break;

    case Intrinsic::cttz:
      // FIXME: If necessary, this should go in target-specific overrides.
      if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCttz())
        return TargetTransformInfo::TCC_Basic;
      break;

    case Intrinsic::ctlz:
      // FIXME: If necessary, this should go in target-specific overrides.
      if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCtlz())
        return TargetTransformInfo::TCC_Basic;
      break;

    case Intrinsic::memcpy:
      return thisT()->getMemcpyCost(ICA.getInst());

    case Intrinsic::masked_scatter: {
      const Value *Mask = Args[3];
      bool VarMask = !isa<Constant>(Mask);
      Align Alignment = cast<ConstantInt>(Args[2])->getAlignValue();
      return thisT()->getGatherScatterOpCost(Instruction::Store,
                                             ICA.getArgTypes()[0], Args[1],
                                             VarMask, Alignment, CostKind, I);
    }
    case Intrinsic::masked_gather: {
      const Value *Mask = Args[2];
      bool VarMask = !isa<Constant>(Mask);
      Align Alignment = cast<ConstantInt>(Args[1])->getAlignValue();
      return thisT()->getGatherScatterOpCost(Instruction::Load, RetTy, Args[0],
                                             VarMask, Alignment, CostKind, I);
    }
    case Intrinsic::experimental_stepvector: {
      if (isa<ScalableVectorType>(RetTy))
        return BaseT::getIntrinsicInstrCost(ICA, CostKind);
      // The cost of materialising a constant integer vector.
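      // (A fixed-length step vector is simply the constant <0, 1, 2, ...>.)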
      return TargetTransformInfo::TCC_Basic;
    }
    case Intrinsic::experimental_vector_extract: {
      // FIXME: Handle case where a scalable vector is extracted from a
      // scalable vector
      if (isa<ScalableVectorType>(RetTy))
        return BaseT::getIntrinsicInstrCost(ICA, CostKind);
      unsigned Index = cast<ConstantInt>(Args[1])->getZExtValue();
      return thisT()->getShuffleCost(TTI::SK_ExtractSubvector,
                                     cast<VectorType>(Args[0]->getType()), None,
                                     Index, cast<VectorType>(RetTy));
    }
    case Intrinsic::experimental_vector_insert: {
      // FIXME: Handle case where a scalable vector is inserted into a scalable
      // vector
      if (isa<ScalableVectorType>(Args[1]->getType()))
        return BaseT::getIntrinsicInstrCost(ICA, CostKind);
      unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue();
      return thisT()->getShuffleCost(
          TTI::SK_InsertSubvector, cast<VectorType>(Args[0]->getType()), None,
          Index, cast<VectorType>(Args[1]->getType()));
    }
    case Intrinsic::experimental_vector_reverse: {
      return thisT()->getShuffleCost(TTI::SK_Reverse,
                                     cast<VectorType>(Args[0]->getType()), None,
                                     0, cast<VectorType>(RetTy));
    }
    case Intrinsic::experimental_vector_splice: {
      unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue();
      return thisT()->getShuffleCost(TTI::SK_Splice,
                                     cast<VectorType>(Args[0]->getType()), None,
                                     Index, cast<VectorType>(RetTy));
    }
    case Intrinsic::vector_reduce_add:
    case Intrinsic::vector_reduce_mul:
    case Intrinsic::vector_reduce_and:
    case Intrinsic::vector_reduce_or:
    case Intrinsic::vector_reduce_xor:
    case Intrinsic::vector_reduce_smax:
    case Intrinsic::vector_reduce_smin:
    case Intrinsic::vector_reduce_fmax:
    case Intrinsic::vector_reduce_fmin:
    case Intrinsic::vector_reduce_umax:
    case Intrinsic::vector_reduce_umin: {
      IntrinsicCostAttributes Attrs(IID, RetTy, Args[0]->getType(), FMF, I, 1);
      return getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
    }
    case Intrinsic::vector_reduce_fadd:
    case Intrinsic::vector_reduce_fmul: {
      IntrinsicCostAttributes Attrs(
          IID, RetTy, {Args[0]->getType(), Args[1]->getType()}, FMF, I, 1);
      return getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
    }
    case Intrinsic::fshl:
    case Intrinsic::fshr: {
      if (isa<ScalableVectorType>(RetTy))
        return BaseT::getIntrinsicInstrCost(ICA, CostKind);
      const Value *X = Args[0];
      const Value *Y = Args[1];
      const Value *Z = Args[2];
      TTI::OperandValueProperties OpPropsX, OpPropsY, OpPropsZ, OpPropsBW;
      TTI::OperandValueKind OpKindX = TTI::getOperandInfo(X, OpPropsX);
      TTI::OperandValueKind OpKindY = TTI::getOperandInfo(Y, OpPropsY);
      TTI::OperandValueKind OpKindZ = TTI::getOperandInfo(Z, OpPropsZ);
      TTI::OperandValueKind OpKindBW = TTI::OK_UniformConstantValue;
    case Intrinsic::fshl:
    case Intrinsic::fshr: {
      if (isa<ScalableVectorType>(RetTy))
        return BaseT::getIntrinsicInstrCost(ICA, CostKind);
      const Value *X = Args[0];
      const Value *Y = Args[1];
      const Value *Z = Args[2];
      TTI::OperandValueProperties OpPropsX, OpPropsY, OpPropsZ, OpPropsBW;
      TTI::OperandValueKind OpKindX = TTI::getOperandInfo(X, OpPropsX);
      TTI::OperandValueKind OpKindY = TTI::getOperandInfo(Y, OpPropsY);
      TTI::OperandValueKind OpKindZ = TTI::getOperandInfo(Z, OpPropsZ);
      TTI::OperandValueKind OpKindBW = TTI::OK_UniformConstantValue;
      OpPropsBW = isPowerOf2_32(RetTy->getScalarSizeInBits()) ? TTI::OP_PowerOf2
                                                              : TTI::OP_None;
      // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
      // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
      InstructionCost Cost = 0;
      Cost +=
          thisT()->getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind);
      Cost +=
          thisT()->getArithmeticInstrCost(BinaryOperator::Sub, RetTy, CostKind);
      Cost += thisT()->getArithmeticInstrCost(
          BinaryOperator::Shl, RetTy, CostKind, OpKindX, OpKindZ, OpPropsX);
      Cost += thisT()->getArithmeticInstrCost(
          BinaryOperator::LShr, RetTy, CostKind, OpKindY, OpKindZ, OpPropsY);
      // Non-constant shift amounts require a modulo.
      if (OpKindZ != TTI::OK_UniformConstantValue &&
          OpKindZ != TTI::OK_NonUniformConstantValue)
        Cost += thisT()->getArithmeticInstrCost(BinaryOperator::URem, RetTy,
                                                CostKind, OpKindZ, OpKindBW,
                                                OpPropsZ, OpPropsBW);
      // For non-rotates (X != Y) we must add shift-by-zero handling costs.
      if (X != Y) {
        Type *CondTy = RetTy->getWithNewBitWidth(1);
        Cost +=
            thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
                                        CmpInst::BAD_ICMP_PREDICATE, CostKind);
        Cost +=
            thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
                                        CmpInst::BAD_ICMP_PREDICATE, CostKind);
      }
      return Cost;
    }
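    // As a worked sketch of the case above: a rotate (X == Y) by a constant
    // amount is costed as or + sub + shl + lshr, while a funnel shift by a
    // variable amount additionally pays for the urem plus the icmp/select
    // pair that guards the shift-by-zero case.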
    }

    // Assume that we need to scalarize this intrinsic.
    // Compute the scalarization overhead based on Args for a vector
    // intrinsic.
    InstructionCost ScalarizationCost = InstructionCost::getInvalid();
    if (RetVF.isVector() && !RetVF.isScalable()) {
      ScalarizationCost = 0;
      if (!RetTy->isVoidTy())
        ScalarizationCost +=
            getScalarizationOverhead(cast<VectorType>(RetTy), true, false);
      ScalarizationCost +=
          getOperandsScalarizationOverhead(Args, ICA.getArgTypes());
    }

    IntrinsicCostAttributes Attrs(IID, RetTy, ICA.getArgTypes(), FMF, I,
                                  ScalarizationCost);
    return thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind);
  }

  /// Get intrinsic cost based on argument types.
  /// If ScalarizationCostPassed is std::numeric_limits<unsigned>::max(), the
  /// cost of scalarizing the arguments and the return value will be computed
  /// based on types.
  InstructionCost
  getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                 TTI::TargetCostKind CostKind) {
    Intrinsic::ID IID = ICA.getID();
    Type *RetTy = ICA.getReturnType();
    const SmallVectorImpl<Type *> &Tys = ICA.getArgTypes();
    FastMathFlags FMF = ICA.getFlags();
    InstructionCost ScalarizationCostPassed = ICA.getScalarizationCost();
    bool SkipScalarizationCost = ICA.skipScalarizationCost();

    VectorType *VecOpTy = nullptr;
    if (!Tys.empty()) {
      // The vector reduction operand is operand 0 except for fadd/fmul.
      // Their operand 0 is a scalar start value, so the vector op is operand 1.
      unsigned VecTyIndex = 0;
      if (IID == Intrinsic::vector_reduce_fadd ||
          IID == Intrinsic::vector_reduce_fmul)
        VecTyIndex = 1;
      assert(Tys.size() > VecTyIndex && "Unexpected IntrinsicCostAttributes");
      VecOpTy = dyn_cast<VectorType>(Tys[VecTyIndex]);
    }

    // Library call cost - other than size, make it expensive.
    unsigned SingleCallCost = CostKind == TTI::TCK_CodeSize ? 1 : 10;
    SmallVector<unsigned, 2> ISDs;
    switch (IID) {
    default: {
      // Scalable vectors cannot be scalarized, so return Invalid.
      if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) {
            return isa<ScalableVectorType>(Ty);
          }))
        return InstructionCost::getInvalid();

      // Assume that we need to scalarize this intrinsic.
      InstructionCost ScalarizationCost =
          SkipScalarizationCost ? ScalarizationCostPassed : 0;
      unsigned ScalarCalls = 1;
      Type *ScalarRetTy = RetTy;
      if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
        if (!SkipScalarizationCost)
          ScalarizationCost = getScalarizationOverhead(RetVTy, true, false);
        ScalarCalls = std::max(
            ScalarCalls, cast<FixedVectorType>(RetVTy)->getNumElements());
        ScalarRetTy = RetTy->getScalarType();
      }
      SmallVector<Type *, 4> ScalarTys;
      for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
        Type *Ty = Tys[i];
        if (auto *VTy = dyn_cast<VectorType>(Ty)) {
          if (!SkipScalarizationCost)
            ScalarizationCost += getScalarizationOverhead(VTy, false, true);
          ScalarCalls = std::max(
              ScalarCalls, cast<FixedVectorType>(VTy)->getNumElements());
          Ty = Ty->getScalarType();
        }
        ScalarTys.push_back(Ty);
      }
      if (ScalarCalls == 1)
        return 1; // Return cost of a scalar intrinsic. Assume it to be cheap.

      IntrinsicCostAttributes ScalarAttrs(IID, ScalarRetTy, ScalarTys, FMF);
      InstructionCost ScalarCost =
          thisT()->getIntrinsicInstrCost(ScalarAttrs, CostKind);

      return ScalarCalls * ScalarCost + ScalarizationCost;
    }
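    // For illustration: an unhandled intrinsic returning <4 x i32> with one
    // <4 x i32> argument makes four scalar calls, so the default estimate
    // above is
    //   4 * ScalarCost + (insert overhead for the result +
    //                     extract overhead for the operand).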
    // Look for intrinsics that can be lowered directly or turned into a scalar
    // intrinsic call.
    case Intrinsic::sqrt:
      ISDs.push_back(ISD::FSQRT);
      break;
    case Intrinsic::sin:
      ISDs.push_back(ISD::FSIN);
      break;
    case Intrinsic::cos:
      ISDs.push_back(ISD::FCOS);
      break;
    case Intrinsic::exp:
      ISDs.push_back(ISD::FEXP);
      break;
    case Intrinsic::exp2:
      ISDs.push_back(ISD::FEXP2);
      break;
    case Intrinsic::log:
      ISDs.push_back(ISD::FLOG);
      break;
    case Intrinsic::log10:
      ISDs.push_back(ISD::FLOG10);
      break;
    case Intrinsic::log2:
      ISDs.push_back(ISD::FLOG2);
      break;
    case Intrinsic::fabs:
      ISDs.push_back(ISD::FABS);
      break;
    case Intrinsic::canonicalize:
      ISDs.push_back(ISD::FCANONICALIZE);
      break;
    case Intrinsic::minnum:
      ISDs.push_back(ISD::FMINNUM);
      break;
    case Intrinsic::maxnum:
      ISDs.push_back(ISD::FMAXNUM);
      break;
    case Intrinsic::minimum:
      ISDs.push_back(ISD::FMINIMUM);
      break;
    case Intrinsic::maximum:
      ISDs.push_back(ISD::FMAXIMUM);
      break;
    case Intrinsic::copysign:
      ISDs.push_back(ISD::FCOPYSIGN);
      break;
    case Intrinsic::floor:
      ISDs.push_back(ISD::FFLOOR);
      break;
    case Intrinsic::ceil:
      ISDs.push_back(ISD::FCEIL);
      break;
    case Intrinsic::trunc:
      ISDs.push_back(ISD::FTRUNC);
      break;
    case Intrinsic::nearbyint:
      ISDs.push_back(ISD::FNEARBYINT);
      break;
    case Intrinsic::rint:
      ISDs.push_back(ISD::FRINT);
      break;
    case Intrinsic::round:
      ISDs.push_back(ISD::FROUND);
      break;
    case Intrinsic::roundeven:
      ISDs.push_back(ISD::FROUNDEVEN);
      break;
    case Intrinsic::pow:
      ISDs.push_back(ISD::FPOW);
      break;
    case Intrinsic::fma:
      ISDs.push_back(ISD::FMA);
      break;
    case Intrinsic::fmuladd:
      ISDs.push_back(ISD::FMA);
      break;
    case Intrinsic::experimental_constrained_fmuladd:
      ISDs.push_back(ISD::STRICT_FMA);
      break;
    // FIXME: We should return 0 whenever getIntrinsicCost == TCC_Free.
    case Intrinsic::lifetime_start:
    case Intrinsic::lifetime_end:
    case Intrinsic::sideeffect:
    case Intrinsic::pseudoprobe:
    case Intrinsic::arithmetic_fence:
      return 0;
    case Intrinsic::masked_store: {
      Type *Ty = Tys[0];
      Align TyAlign = thisT()->DL.getABITypeAlign(Ty);
      return thisT()->getMaskedMemoryOpCost(Instruction::Store, Ty, TyAlign, 0,
                                            CostKind);
    }
    case Intrinsic::masked_load: {
      Type *Ty = RetTy;
      Align TyAlign = thisT()->DL.getABITypeAlign(Ty);
      return thisT()->getMaskedMemoryOpCost(Instruction::Load, Ty, TyAlign, 0,
                                            CostKind);
    }
    case Intrinsic::vector_reduce_add:
      return thisT()->getArithmeticReductionCost(Instruction::Add, VecOpTy,
                                                 None, CostKind);
    case Intrinsic::vector_reduce_mul:
      return thisT()->getArithmeticReductionCost(Instruction::Mul, VecOpTy,
                                                 None, CostKind);
    case Intrinsic::vector_reduce_and:
      return thisT()->getArithmeticReductionCost(Instruction::And, VecOpTy,
                                                 None, CostKind);
    case Intrinsic::vector_reduce_or:
      return thisT()->getArithmeticReductionCost(Instruction::Or, VecOpTy,
                                                 None, CostKind);
    case Intrinsic::vector_reduce_xor:
      return thisT()->getArithmeticReductionCost(Instruction::Xor, VecOpTy,
                                                 None, CostKind);
    case Intrinsic::vector_reduce_fadd:
      return thisT()->getArithmeticReductionCost(Instruction::FAdd, VecOpTy,
                                                 FMF, CostKind);
    case Intrinsic::vector_reduce_fmul:
      return thisT()->getArithmeticReductionCost(Instruction::FMul, VecOpTy,
                                                 FMF, CostKind);
    case Intrinsic::vector_reduce_smax:
    case Intrinsic::vector_reduce_smin:
    case Intrinsic::vector_reduce_fmax:
    case Intrinsic::vector_reduce_fmin:
      return thisT()->getMinMaxReductionCost(
          VecOpTy, cast<VectorType>(CmpInst::makeCmpResultType(VecOpTy)),
          /*IsUnsigned=*/false, CostKind);
    case Intrinsic::vector_reduce_umax:
    case Intrinsic::vector_reduce_umin:
      return thisT()->getMinMaxReductionCost(
          VecOpTy, cast<VectorType>(CmpInst::makeCmpResultType(VecOpTy)),
          /*IsUnsigned=*/true, CostKind);
    case Intrinsic::abs:
    case Intrinsic::smax:
    case Intrinsic::smin:
    case Intrinsic::umax:
    case Intrinsic::umin: {
      // abs(X) = select(icmp(X,0),X,sub(0,X))
      // minmax(X,Y) = select(icmp(X,Y),X,Y)
      Type *CondTy = RetTy->getWithNewBitWidth(1);
      InstructionCost Cost = 0;
      // TODO: Ideally getCmpSelInstrCost would accept an icmp condition code.
      Cost +=
          thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
                                      CmpInst::BAD_ICMP_PREDICATE, CostKind);
      Cost +=
          thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
                                      CmpInst::BAD_ICMP_PREDICATE, CostKind);
      // TODO: Should we add an OperandValueProperties::OP_Zero property?
      if (IID == Intrinsic::abs)
        Cost += thisT()->getArithmeticInstrCost(
            BinaryOperator::Sub, RetTy, CostKind,
            TTI::OK_UniformConstantValue);
      return Cost;
    }
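    // For illustration: on a target where icmp and select each cost 1, the
    // estimate above gives smax(X,Y) a cost of 2 and abs(X) a cost of 3
    // (icmp + select + sub).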
    case Intrinsic::sadd_sat:
    case Intrinsic::ssub_sat: {
      Type *CondTy = RetTy->getWithNewBitWidth(1);

      Type *OpTy = StructType::create({RetTy, CondTy});
      Intrinsic::ID OverflowOp = IID == Intrinsic::sadd_sat
                                     ? Intrinsic::sadd_with_overflow
                                     : Intrinsic::ssub_with_overflow;

      // SatMax -> Overflow && SumDiff < 0
      // SatMin -> Overflow && SumDiff >= 0
      InstructionCost Cost = 0;
      IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
                                    nullptr, ScalarizationCostPassed);
      Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
      Cost +=
          thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy,
                                      CmpInst::BAD_ICMP_PREDICATE, CostKind);
      Cost += 2 * thisT()->getCmpSelInstrCost(
                      BinaryOperator::Select, RetTy, CondTy,
                      CmpInst::BAD_ICMP_PREDICATE, CostKind);
      return Cost;
    }
    case Intrinsic::uadd_sat:
    case Intrinsic::usub_sat: {
      Type *CondTy = RetTy->getWithNewBitWidth(1);

      Type *OpTy = StructType::create({RetTy, CondTy});
      Intrinsic::ID OverflowOp = IID == Intrinsic::uadd_sat
                                     ? Intrinsic::uadd_with_overflow
                                     : Intrinsic::usub_with_overflow;

      InstructionCost Cost = 0;
      IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF,
                                    nullptr, ScalarizationCostPassed);
      Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind);
      Cost +=
          thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
                                      CmpInst::BAD_ICMP_PREDICATE, CostKind);
      return Cost;
    }
    case Intrinsic::smul_fix:
    case Intrinsic::umul_fix: {
      unsigned ExtSize = RetTy->getScalarSizeInBits() * 2;
      Type *ExtTy = RetTy->getWithNewBitWidth(ExtSize);

      unsigned ExtOp =
          IID == Intrinsic::smul_fix ? Instruction::SExt : Instruction::ZExt;
      TTI::CastContextHint CCH = TTI::CastContextHint::None;

      InstructionCost Cost = 0;
      Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, RetTy, CCH, CostKind);
      Cost +=
          thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
      Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, RetTy, ExtTy,
                                            CCH, CostKind);
      Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, RetTy,
                                              CostKind, TTI::OK_AnyValue,
                                              TTI::OK_UniformConstantValue);
      Cost += thisT()->getArithmeticInstrCost(Instruction::Shl, RetTy,
                                              CostKind, TTI::OK_AnyValue,
                                              TTI::OK_UniformConstantValue);
      Cost += thisT()->getArithmeticInstrCost(Instruction::Or, RetTy, CostKind);
      return Cost;
    }
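    // For illustration: an i32 smul.fix is modelled above as a sign extend of
    // both operands to i64, a 64-bit multiply, two truncates covering the low
    // and high halves, a constant lshr/shl pair to realign by the scale, and
    // an or to combine the pieces.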
    case Intrinsic::sadd_with_overflow:
    case Intrinsic::ssub_with_overflow: {
      Type *SumTy = RetTy->getContainedType(0);
      Type *OverflowTy = RetTy->getContainedType(1);
      unsigned Opcode = IID == Intrinsic::sadd_with_overflow
                            ? BinaryOperator::Add
                            : BinaryOperator::Sub;

      // LHSSign -> LHS >= 0
      // RHSSign -> RHS >= 0
      // SumSign -> Sum >= 0
      //
      // Add:
      // Overflow -> (LHSSign == RHSSign) && (LHSSign != SumSign)
      // Sub:
      // Overflow -> (LHSSign != RHSSign) && (LHSSign != SumSign)
      InstructionCost Cost = 0;
      Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind);
      Cost += 3 * thisT()->getCmpSelInstrCost(
                      Instruction::ICmp, SumTy, OverflowTy,
                      CmpInst::BAD_ICMP_PREDICATE, CostKind);
      Cost += 2 * thisT()->getCmpSelInstrCost(
                      Instruction::Select, OverflowTy, OverflowTy,
                      CmpInst::BAD_ICMP_PREDICATE, CostKind);
      Cost += thisT()->getArithmeticInstrCost(BinaryOperator::And, OverflowTy,
                                              CostKind);
      return Cost;
    }
    case Intrinsic::uadd_with_overflow:
    case Intrinsic::usub_with_overflow: {
      Type *SumTy = RetTy->getContainedType(0);
      Type *OverflowTy = RetTy->getContainedType(1);
      unsigned Opcode = IID == Intrinsic::uadd_with_overflow
                            ? BinaryOperator::Add
                            : BinaryOperator::Sub;

      InstructionCost Cost = 0;
      Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind);
      Cost +=
          thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SumTy, OverflowTy,
                                      CmpInst::BAD_ICMP_PREDICATE, CostKind);
      return Cost;
    }
    case Intrinsic::smul_with_overflow:
    case Intrinsic::umul_with_overflow: {
      Type *MulTy = RetTy->getContainedType(0);
      Type *OverflowTy = RetTy->getContainedType(1);
      unsigned ExtSize = MulTy->getScalarSizeInBits() * 2;
      Type *ExtTy = MulTy->getWithNewBitWidth(ExtSize);

      unsigned ExtOp = IID == Intrinsic::smul_with_overflow
                           ? Instruction::SExt
                           : Instruction::ZExt;
      TTI::CastContextHint CCH = TTI::CastContextHint::None;

      InstructionCost Cost = 0;
      Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, MulTy, CCH, CostKind);
      Cost +=
          thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
      Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, MulTy, ExtTy,
                                            CCH, CostKind);
      Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, MulTy,
                                              CostKind, TTI::OK_AnyValue,
                                              TTI::OK_UniformConstantValue);

      if (IID == Intrinsic::smul_with_overflow)
        Cost += thisT()->getArithmeticInstrCost(Instruction::AShr, MulTy,
                                                CostKind, TTI::OK_AnyValue,
                                                TTI::OK_UniformConstantValue);

      Cost +=
          thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, MulTy, OverflowTy,
                                      CmpInst::BAD_ICMP_PREDICATE, CostKind);
      return Cost;
    }
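    // For illustration: an i32 smul.with.overflow is modelled above as two
    // sign extends to i64, a 64-bit multiply, two truncates, a constant lshr
    // to recover the high half, a constant ashr to recompute the sign bits of
    // the low half, and an icmp comparing the two.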
    case Intrinsic::ctpop:
      ISDs.push_back(ISD::CTPOP);
      // In case of legalization use TCC_Expensive. This is cheaper than a
      // library call but still not a cheap instruction.
      SingleCallCost = TargetTransformInfo::TCC_Expensive;
      break;
    case Intrinsic::ctlz:
      ISDs.push_back(ISD::CTLZ);
      break;
    case Intrinsic::cttz:
      ISDs.push_back(ISD::CTTZ);
      break;
    case Intrinsic::bswap:
      ISDs.push_back(ISD::BSWAP);
      break;
    case Intrinsic::bitreverse:
      ISDs.push_back(ISD::BITREVERSE);
      break;
    }

    const TargetLoweringBase *TLI = getTLI();
    std::pair<InstructionCost, MVT> LT =
        TLI->getTypeLegalizationCost(DL, RetTy);

    SmallVector<InstructionCost, 2> LegalCost;
    SmallVector<InstructionCost, 2> CustomCost;
    for (unsigned ISD : ISDs) {
      if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
        if (IID == Intrinsic::fabs && LT.second.isFloatingPoint() &&
            TLI->isFAbsFree(LT.second)) {
          return 0;
        }

        // The operation is legal. Assume it costs 1.
        // If the type is split to multiple registers, assume that there is
        // some overhead to this.
        // TODO: Once we have extract/insert subvector cost we need to use them.
        if (LT.first > 1)
          LegalCost.push_back(LT.first * 2);
        else
          LegalCost.push_back(LT.first * 1);
      } else if (!TLI->isOperationExpand(ISD, LT.second)) {
        // If the operation is custom lowered then assume
        // that the code is twice as expensive.
        CustomCost.push_back(LT.first * 2);
      }
    }

    auto *MinLegalCostI = std::min_element(LegalCost.begin(), LegalCost.end());
    if (MinLegalCostI != LegalCost.end())
      return *MinLegalCostI;

    auto MinCustomCostI =
        std::min_element(CustomCost.begin(), CustomCost.end());
    if (MinCustomCostI != CustomCost.end())
      return *MinCustomCostI;

    // If we can't lower fmuladd into an FMA, estimate the cost as a floating
    // point mul followed by an add.
    if (IID == Intrinsic::fmuladd)
      return thisT()->getArithmeticInstrCost(BinaryOperator::FMul, RetTy,
                                             CostKind) +
             thisT()->getArithmeticInstrCost(BinaryOperator::FAdd, RetTy,
                                             CostKind);
    if (IID == Intrinsic::experimental_constrained_fmuladd) {
      IntrinsicCostAttributes FMulAttrs(
          Intrinsic::experimental_constrained_fmul, RetTy, Tys);
      IntrinsicCostAttributes FAddAttrs(
          Intrinsic::experimental_constrained_fadd, RetTy, Tys);
      return thisT()->getIntrinsicInstrCost(FMulAttrs, CostKind) +
             thisT()->getIntrinsicInstrCost(FAddAttrs, CostKind);
    }
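    // For illustration: if FSQRT is legal for the legalized type and that
    // type splits into two registers (LT.first == 2), the legal-cost estimate
    // above is 4; a custom-lowered single-register operation is likewise
    // estimated at 2.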
    // Else, assume that we need to scalarize this intrinsic. For math builtins
    // this will emit a costly libcall, adding call overhead and spills. Make it
    // very expensive.
    if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) {
      // Scalable vectors cannot be scalarized, so return Invalid.
      if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) {
            return isa<ScalableVectorType>(Ty);
          }))
        return InstructionCost::getInvalid();

      InstructionCost ScalarizationCost =
          SkipScalarizationCost
              ? ScalarizationCostPassed
              : getScalarizationOverhead(RetVTy, true, false);

      unsigned ScalarCalls = cast<FixedVectorType>(RetVTy)->getNumElements();
      SmallVector<Type *, 4> ScalarTys;
      for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
        Type *Ty = Tys[i];
        if (Ty->isVectorTy())
          Ty = Ty->getScalarType();
        ScalarTys.push_back(Ty);
      }
      IntrinsicCostAttributes Attrs(IID, RetTy->getScalarType(), ScalarTys,
                                    FMF);
      InstructionCost ScalarCost =
          thisT()->getIntrinsicInstrCost(Attrs, CostKind);
      for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
        if (auto *VTy = dyn_cast<VectorType>(Tys[i])) {
          if (!ICA.skipScalarizationCost())
            ScalarizationCost += getScalarizationOverhead(VTy, false, true);
          ScalarCalls = std::max(
              ScalarCalls, cast<FixedVectorType>(VTy)->getNumElements());
        }
      }
      return ScalarCalls * ScalarCost + ScalarizationCost;
    }

    // This is going to be turned into a library call, make it expensive.
    return SingleCallCost;
  }

  /// Compute a cost of the given call instruction.
  ///
  /// Compute the cost of calling function F with return type RetTy and
  /// argument types Tys. F might be nullptr; in that case the cost of an
  /// arbitrary call with the specified signature will be returned.
  /// This is used, for instance, when we estimate a call to a vector
  /// counterpart of the given function.
  /// \param F Called function, might be nullptr.
  /// \param RetTy Return value type.
  /// \param Tys Argument types.
  /// \returns The cost of Call instruction.
  InstructionCost
  getCallInstrCost(Function *F, Type *RetTy, ArrayRef<Type *> Tys,
                   TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency) {
    return 10;
  }

  unsigned getNumberOfParts(Type *Tp) {
    std::pair<InstructionCost, MVT> LT =
        getTLI()->getTypeLegalizationCost(DL, Tp);
    return *LT.first.getValue();
  }

  InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *,
                                            const SCEV *) {
    return 0;
  }
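  // For illustration: on a target with 128-bit vector registers, an <8 x i32>
  // legalizes to two registers, so getNumberOfParts above returns 2.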
  /// Try to calculate arithmetic and shuffle op costs for reduction intrinsics.
  /// We're assuming that reduction operations are performed in the following
  /// way:
  ///
  /// %val1 = shufflevector<n x t> %val, <n x t> %undef,
  /// <n x i32> <i32 n/2, i32 n/2 + 1, ..., i32 n, i32 undef, ..., i32 undef>
  /// \----------------v-------------/  \----------v------------/
  ///                n/2 elements               n/2 elements
  /// %red1 = op <n x t> %val, <n x t> %val1
  /// After this operation we have a vector %red1 where only the first n/2
  /// elements are meaningful; the second n/2 elements are undefined and can be
  /// dropped. All other operations are actually working with the vector of
  /// length n/2, not n, though the real vector length is still n.
  /// %val2 = shufflevector<n x t> %red1, <n x t> %undef,
  /// <n x i32> <i32 n/4, i32 n/4 + 1, ..., i32 n/2, i32 undef, ..., i32 undef>
  /// \----------------v-------------/  \----------v------------/
  ///                n/4 elements              3*n/4 elements
  /// %red2 = op <n x t> %red1, <n x t> %val2 - working with the vector of
  /// length n/2; the resulting vector has length n/4, etc.
  ///
  /// The cost model should take into account that the actual length of the
  /// vector is reduced on each iteration.
  InstructionCost getTreeReductionCost(unsigned Opcode, VectorType *Ty,
                                       TTI::TargetCostKind CostKind) {
    Type *ScalarTy = Ty->getElementType();
    unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
    if ((Opcode == Instruction::Or || Opcode == Instruction::And) &&
        ScalarTy == IntegerType::getInt1Ty(Ty->getContext()) &&
        NumVecElts >= 2) {
      // Or reduction for i1 is represented as:
      // %val = bitcast <ReduxWidth x i1> to iReduxWidth
      // %res = cmp ne iReduxWidth %val, 0
      // And reduction for i1 is represented as:
      // %val = bitcast <ReduxWidth x i1> to iReduxWidth
      // %res = cmp eq iReduxWidth %val, 11111 (i.e. all ones)
      Type *ValTy = IntegerType::get(Ty->getContext(), NumVecElts);
      return thisT()->getCastInstrCost(Instruction::BitCast, ValTy, Ty,
                                       TTI::CastContextHint::None, CostKind) +
             thisT()->getCmpSelInstrCost(Instruction::ICmp, ValTy,
                                         CmpInst::makeCmpResultType(ValTy),
                                         CmpInst::BAD_ICMP_PREDICATE,
                                         CostKind);
    }
    unsigned NumReduxLevels = Log2_32(NumVecElts);
    InstructionCost ArithCost = 0;
    InstructionCost ShuffleCost = 0;
    std::pair<InstructionCost, MVT> LT =
        thisT()->getTLI()->getTypeLegalizationCost(DL, Ty);
    unsigned LongVectorCount = 0;
    unsigned MVTLen =
        LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
    while (NumVecElts > MVTLen) {
      NumVecElts /= 2;
      VectorType *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
      ShuffleCost += thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty,
                                             None, NumVecElts, SubTy);
      ArithCost += thisT()->getArithmeticInstrCost(Opcode, SubTy, CostKind);
      Ty = SubTy;
      ++LongVectorCount;
    }

    NumReduxLevels -= LongVectorCount;

    // The minimal length of the vector is limited by the real length of vector
    // operations performed on the current platform. That's why several final
    // reduction operations are performed on vectors with the same
    // architecture-dependent length.

    // By default reductions need one shuffle per reduction level.
    ShuffleCost +=
        NumReduxLevels * thisT()->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty,
                                                 None, 0, Ty);
    ArithCost += NumReduxLevels * thisT()->getArithmeticInstrCost(Opcode, Ty);
    return ShuffleCost + ArithCost +
           thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
  }
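  // For illustration: reducing <8 x float> with fadd on a target whose widest
  // legal vector is <4 x float> is costed above as one extract-subvector
  // shuffle plus one <4 x float> fadd to halve the width, then two remaining
  // levels of permute shuffle + fadd on <4 x float>, plus a final
  // extractelement of lane 0.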
  /// Try to calculate the cost of performing strict (in-order) reductions,
  /// which involves doing a sequence of floating point additions in lane
  /// order, starting with an initial value. For example, consider a scalar
  /// initial value 'InitVal' of type float and a vector of type <4 x float>:
  ///
  /// Vector = <float %v0, float %v1, float %v2, float %v3>
  ///
  /// %add1 = %InitVal + %v0
  /// %add2 = %add1 + %v1
  /// %add3 = %add2 + %v2
  /// %add4 = %add3 + %v3
  ///
  /// As a simple estimate we can say the cost of such a reduction is 4 times
  /// the cost of a scalar FP addition. We can only estimate the costs for
  /// fixed-width vectors here because for scalable vectors we do not know the
  /// runtime number of operations.
  InstructionCost getOrderedReductionCost(unsigned Opcode, VectorType *Ty,
                                          TTI::TargetCostKind CostKind) {
    // Targets must implement a default value for the scalable case, since
    // we don't know how many lanes the vector has.
    if (isa<ScalableVectorType>(Ty))
      return InstructionCost::getInvalid();

    auto *VTy = cast<FixedVectorType>(Ty);
    InstructionCost ExtractCost =
        getScalarizationOverhead(VTy, /*Insert=*/false, /*Extract=*/true);
    InstructionCost ArithCost = thisT()->getArithmeticInstrCost(
        Opcode, VTy->getElementType(), CostKind);
    ArithCost *= VTy->getNumElements();

    return ExtractCost + ArithCost;
  }

  InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
                                             Optional<FastMathFlags> FMF,
                                             TTI::TargetCostKind CostKind) {
    if (TTI::requiresOrderedReduction(FMF))
      return getOrderedReductionCost(Opcode, Ty, CostKind);
    return getTreeReductionCost(Opcode, Ty, CostKind);
  }
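  // For illustration: an fadd reduction whose fast-math flags do not allow
  // reassociation must preserve lane order, so it is routed to
  // getOrderedReductionCost above; integer reductions and reassociable FP
  // reductions take the cheaper tree-reduction path instead.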
  /// Try to calculate op costs for min/max reduction operations.
  /// \param CondTy Conditional type for the Select instruction.
  InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
                                         bool IsUnsigned,
                                         TTI::TargetCostKind CostKind) {
    Type *ScalarTy = Ty->getElementType();
    Type *ScalarCondTy = CondTy->getElementType();
    unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
    unsigned NumReduxLevels = Log2_32(NumVecElts);
    unsigned CmpOpcode;
    if (Ty->isFPOrFPVectorTy()) {
      CmpOpcode = Instruction::FCmp;
    } else {
      assert(Ty->isIntOrIntVectorTy() &&
             "expecting floating point or integer type for min/max reduction");
      CmpOpcode = Instruction::ICmp;
    }
    InstructionCost MinMaxCost = 0;
    InstructionCost ShuffleCost = 0;
    std::pair<InstructionCost, MVT> LT =
        thisT()->getTLI()->getTypeLegalizationCost(DL, Ty);
    unsigned LongVectorCount = 0;
    unsigned MVTLen =
        LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
    while (NumVecElts > MVTLen) {
      NumVecElts /= 2;
      auto *SubTy = FixedVectorType::get(ScalarTy, NumVecElts);
      CondTy = FixedVectorType::get(ScalarCondTy, NumVecElts);

      ShuffleCost += thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty,
                                             None, NumVecElts, SubTy);
      MinMaxCost +=
          thisT()->getCmpSelInstrCost(CmpOpcode, SubTy, CondTy,
                                      CmpInst::BAD_ICMP_PREDICATE, CostKind) +
          thisT()->getCmpSelInstrCost(Instruction::Select, SubTy, CondTy,
                                      CmpInst::BAD_ICMP_PREDICATE, CostKind);
      Ty = SubTy;
      ++LongVectorCount;
    }

    NumReduxLevels -= LongVectorCount;

    // The minimal length of the vector is limited by the real length of vector
    // operations performed on the current platform. That's why several final
    // reduction operations are performed on vectors with the same
    // architecture-dependent length.
    ShuffleCost +=
        NumReduxLevels * thisT()->getShuffleCost(TTI::SK_PermuteSingleSrc, Ty,
                                                 None, 0, Ty);
    MinMaxCost +=
        NumReduxLevels *
        (thisT()->getCmpSelInstrCost(CmpOpcode, Ty, CondTy,
                                     CmpInst::BAD_ICMP_PREDICATE, CostKind) +
         thisT()->getCmpSelInstrCost(Instruction::Select, Ty, CondTy,
                                     CmpInst::BAD_ICMP_PREDICATE, CostKind));
    // The last min/max should be in vector registers and we counted it above.
    // So we just need a single extractelement.
    return ShuffleCost + MinMaxCost +
           thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
  }

  InstructionCost getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned,
                                              Type *ResTy, VectorType *Ty,
                                              TTI::TargetCostKind CostKind) {
    // Without any native support, this is equivalent to the cost of
    // vecreduce.add(ext) or, if IsMLA, vecreduce.add(mul(ext, ext)).
    VectorType *ExtTy = VectorType::get(ResTy, Ty);
    InstructionCost RedCost = thisT()->getArithmeticReductionCost(
        Instruction::Add, ExtTy, None, CostKind);
    InstructionCost MulCost = 0;
    InstructionCost ExtCost = thisT()->getCastInstrCost(
        IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty,
        TTI::CastContextHint::None, CostKind);
    if (IsMLA) {
      MulCost =
          thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind);
      ExtCost *= 2;
    }

    return RedCost + MulCost + ExtCost;
  }

  InstructionCost getVectorSplitCost() { return 1; }

  /// @}
};

/// Concrete BasicTTIImpl that can be used if no further customization
/// is needed.
class BasicTTIImpl : public BasicTTIImplBase<BasicTTIImpl> {
  using BaseT = BasicTTIImplBase<BasicTTIImpl>;

  friend class BasicTTIImplBase<BasicTTIImpl>;

  const TargetSubtargetInfo *ST;
  const TargetLoweringBase *TLI;

  const TargetSubtargetInfo *getST() const { return ST; }
  const TargetLoweringBase *getTLI() const { return TLI; }

public:
  explicit BasicTTIImpl(const TargetMachine *TM, const Function &F);
};

} // end namespace llvm

#endif // LLVM_CODEGEN_BASICTTIIMPL_H