1 //===- BasicTTIImpl.h -------------------------------------------*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 /// \file 10 /// This file provides a helper that implements much of the TTI interface in 11 /// terms of the target-independent code generator and TargetLowering 12 /// interfaces. 13 // 14 //===----------------------------------------------------------------------===// 15 16 #ifndef LLVM_CODEGEN_BASICTTIIMPL_H 17 #define LLVM_CODEGEN_BASICTTIIMPL_H 18 19 #include "llvm/ADT/APInt.h" 20 #include "llvm/ADT/ArrayRef.h" 21 #include "llvm/ADT/BitVector.h" 22 #include "llvm/ADT/SmallPtrSet.h" 23 #include "llvm/ADT/SmallVector.h" 24 #include "llvm/Analysis/LoopInfo.h" 25 #include "llvm/Analysis/OptimizationRemarkEmitter.h" 26 #include "llvm/Analysis/TargetTransformInfo.h" 27 #include "llvm/Analysis/TargetTransformInfoImpl.h" 28 #include "llvm/CodeGen/ISDOpcodes.h" 29 #include "llvm/CodeGen/TargetLowering.h" 30 #include "llvm/CodeGen/TargetSubtargetInfo.h" 31 #include "llvm/CodeGen/ValueTypes.h" 32 #include "llvm/IR/BasicBlock.h" 33 #include "llvm/IR/Constant.h" 34 #include "llvm/IR/Constants.h" 35 #include "llvm/IR/DataLayout.h" 36 #include "llvm/IR/DerivedTypes.h" 37 #include "llvm/IR/InstrTypes.h" 38 #include "llvm/IR/Instruction.h" 39 #include "llvm/IR/Instructions.h" 40 #include "llvm/IR/Intrinsics.h" 41 #include "llvm/IR/Operator.h" 42 #include "llvm/IR/Type.h" 43 #include "llvm/IR/Value.h" 44 #include "llvm/Support/Casting.h" 45 #include "llvm/Support/CommandLine.h" 46 #include "llvm/Support/ErrorHandling.h" 47 #include "llvm/Support/MachineValueType.h" 48 #include "llvm/Support/MathExtras.h" 49 #include "llvm/Target/TargetMachine.h" 50 #include <algorithm> 51 #include <cassert> 52 #include <cstdint> 53 #include <limits> 54 #include <utility> 55 56 namespace llvm { 57 58 class Function; 59 class GlobalValue; 60 class LLVMContext; 61 class ScalarEvolution; 62 class SCEV; 63 class TargetMachine; 64 65 extern cl::opt<unsigned> PartialUnrollingThreshold; 66 67 /// Base class which can be used to help build a TTI implementation. 68 /// 69 /// This class provides as much implementation of the TTI interface as is 70 /// possible using the target independent parts of the code generator. 71 /// 72 /// In order to subclass it, your class must implement a getST() method to 73 /// return the subtarget, and a getTLI() method to return the target lowering. 74 /// We need these methods implemented in the derived class so that this class 75 /// doesn't have to duplicate storage for them. 76 template <typename T> 77 class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> { 78 private: 79 using BaseT = TargetTransformInfoImplCRTPBase<T>; 80 using TTI = TargetTransformInfo; 81 82 /// Helper function to access this as a T. 83 T *thisT() { return static_cast<T *>(this); } 84 85 /// Estimate a cost of Broadcast as an extract and sequence of insert 86 /// operations. 87 InstructionCost getBroadcastShuffleOverhead(FixedVectorType *VTy) { 88 InstructionCost Cost = 0; 89 // Broadcast cost is equal to the cost of extracting the zero'th element 90 // plus the cost of inserting it into every element of the result vector. 91 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, 0); 92 93 for (int i = 0, e = VTy->getNumElements(); i < e; ++i) { 94 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, i); 95 } 96 return Cost; 97 } 98 99 /// Estimate a cost of shuffle as a sequence of extract and insert 100 /// operations. 101 InstructionCost getPermuteShuffleOverhead(FixedVectorType *VTy) { 102 InstructionCost Cost = 0; 103 // Shuffle cost is equal to the cost of extracting element from its argument 104 // plus the cost of inserting them onto the result vector. 105 106 // e.g. <4 x float> has a mask of <0,5,2,7> i.e we need to extract from 107 // index 0 of first vector, index 1 of second vector,index 2 of first 108 // vector and finally index 3 of second vector and insert them at index 109 // <0,1,2,3> of result vector. 110 for (int i = 0, e = VTy->getNumElements(); i < e; ++i) { 111 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, i); 112 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, i); 113 } 114 return Cost; 115 } 116 117 /// Estimate a cost of subvector extraction as a sequence of extract and 118 /// insert operations. 119 InstructionCost getExtractSubvectorOverhead(VectorType *VTy, int Index, 120 FixedVectorType *SubVTy) { 121 assert(VTy && SubVTy && 122 "Can only extract subvectors from vectors"); 123 int NumSubElts = SubVTy->getNumElements(); 124 assert((!isa<FixedVectorType>(VTy) || 125 (Index + NumSubElts) <= 126 (int)cast<FixedVectorType>(VTy)->getNumElements()) && 127 "SK_ExtractSubvector index out of range"); 128 129 InstructionCost Cost = 0; 130 // Subvector extraction cost is equal to the cost of extracting element from 131 // the source type plus the cost of inserting them into the result vector 132 // type. 133 for (int i = 0; i != NumSubElts; ++i) { 134 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, VTy, 135 i + Index); 136 Cost += 137 thisT()->getVectorInstrCost(Instruction::InsertElement, SubVTy, i); 138 } 139 return Cost; 140 } 141 142 /// Estimate a cost of subvector insertion as a sequence of extract and 143 /// insert operations. 144 InstructionCost getInsertSubvectorOverhead(VectorType *VTy, int Index, 145 FixedVectorType *SubVTy) { 146 assert(VTy && SubVTy && 147 "Can only insert subvectors into vectors"); 148 int NumSubElts = SubVTy->getNumElements(); 149 assert((!isa<FixedVectorType>(VTy) || 150 (Index + NumSubElts) <= 151 (int)cast<FixedVectorType>(VTy)->getNumElements()) && 152 "SK_InsertSubvector index out of range"); 153 154 InstructionCost Cost = 0; 155 // Subvector insertion cost is equal to the cost of extracting element from 156 // the source type plus the cost of inserting them into the result vector 157 // type. 158 for (int i = 0; i != NumSubElts; ++i) { 159 Cost += 160 thisT()->getVectorInstrCost(Instruction::ExtractElement, SubVTy, i); 161 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, VTy, 162 i + Index); 163 } 164 return Cost; 165 } 166 167 /// Local query method delegates up to T which *must* implement this! 168 const TargetSubtargetInfo *getST() const { 169 return static_cast<const T *>(this)->getST(); 170 } 171 172 /// Local query method delegates up to T which *must* implement this! 173 const TargetLoweringBase *getTLI() const { 174 return static_cast<const T *>(this)->getTLI(); 175 } 176 177 static ISD::MemIndexedMode getISDIndexedMode(TTI::MemIndexedMode M) { 178 switch (M) { 179 case TTI::MIM_Unindexed: 180 return ISD::UNINDEXED; 181 case TTI::MIM_PreInc: 182 return ISD::PRE_INC; 183 case TTI::MIM_PreDec: 184 return ISD::PRE_DEC; 185 case TTI::MIM_PostInc: 186 return ISD::POST_INC; 187 case TTI::MIM_PostDec: 188 return ISD::POST_DEC; 189 } 190 llvm_unreachable("Unexpected MemIndexedMode"); 191 } 192 193 InstructionCost getCommonMaskedMemoryOpCost(unsigned Opcode, Type *DataTy, 194 Align Alignment, 195 bool VariableMask, 196 bool IsGatherScatter, 197 TTI::TargetCostKind CostKind) { 198 // We cannot scalarize scalable vectors, so return Invalid. 199 if (isa<ScalableVectorType>(DataTy)) 200 return InstructionCost::getInvalid(); 201 202 auto *VT = cast<FixedVectorType>(DataTy); 203 // Assume the target does not have support for gather/scatter operations 204 // and provide a rough estimate. 205 // 206 // First, compute the cost of the individual memory operations. 207 InstructionCost AddrExtractCost = 208 IsGatherScatter 209 ? getVectorInstrCost(Instruction::ExtractElement, 210 FixedVectorType::get( 211 PointerType::get(VT->getElementType(), 0), 212 VT->getNumElements()), 213 -1) 214 : 0; 215 InstructionCost LoadCost = 216 VT->getNumElements() * 217 (AddrExtractCost + 218 getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind)); 219 220 // Next, compute the cost of packing the result in a vector. 221 InstructionCost PackingCost = getScalarizationOverhead( 222 VT, Opcode != Instruction::Store, Opcode == Instruction::Store); 223 224 InstructionCost ConditionalCost = 0; 225 if (VariableMask) { 226 // Compute the cost of conditionally executing the memory operations with 227 // variable masks. This includes extracting the individual conditions, a 228 // branches and PHIs to combine the results. 229 // NOTE: Estimating the cost of conditionally executing the memory 230 // operations accurately is quite difficult and the current solution 231 // provides a very rough estimate only. 232 ConditionalCost = 233 VT->getNumElements() * 234 (getVectorInstrCost( 235 Instruction::ExtractElement, 236 FixedVectorType::get(Type::getInt1Ty(DataTy->getContext()), 237 VT->getNumElements()), 238 -1) + 239 getCFInstrCost(Instruction::Br, CostKind) + 240 getCFInstrCost(Instruction::PHI, CostKind)); 241 } 242 243 return LoadCost + PackingCost + ConditionalCost; 244 } 245 246 protected: 247 explicit BasicTTIImplBase(const TargetMachine *TM, const DataLayout &DL) 248 : BaseT(DL) {} 249 virtual ~BasicTTIImplBase() = default; 250 251 using TargetTransformInfoImplBase::DL; 252 253 public: 254 /// \name Scalar TTI Implementations 255 /// @{ 256 bool allowsMisalignedMemoryAccesses(LLVMContext &Context, unsigned BitWidth, 257 unsigned AddressSpace, Align Alignment, 258 bool *Fast) const { 259 EVT E = EVT::getIntegerVT(Context, BitWidth); 260 return getTLI()->allowsMisalignedMemoryAccesses( 261 E, AddressSpace, Alignment, MachineMemOperand::MONone, Fast); 262 } 263 264 bool hasBranchDivergence() { return false; } 265 266 bool useGPUDivergenceAnalysis() { return false; } 267 268 bool isSourceOfDivergence(const Value *V) { return false; } 269 270 bool isAlwaysUniform(const Value *V) { return false; } 271 272 unsigned getFlatAddressSpace() { 273 // Return an invalid address space. 274 return -1; 275 } 276 277 bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes, 278 Intrinsic::ID IID) const { 279 return false; 280 } 281 282 bool isNoopAddrSpaceCast(unsigned FromAS, unsigned ToAS) const { 283 return getTLI()->getTargetMachine().isNoopAddrSpaceCast(FromAS, ToAS); 284 } 285 286 unsigned getAssumedAddrSpace(const Value *V) const { 287 return getTLI()->getTargetMachine().getAssumedAddrSpace(V); 288 } 289 290 std::pair<const Value *, unsigned> 291 getPredicatedAddrSpace(const Value *V) const { 292 return getTLI()->getTargetMachine().getPredicatedAddrSpace(V); 293 } 294 295 Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV, 296 Value *NewV) const { 297 return nullptr; 298 } 299 300 bool isLegalAddImmediate(int64_t imm) { 301 return getTLI()->isLegalAddImmediate(imm); 302 } 303 304 bool isLegalICmpImmediate(int64_t imm) { 305 return getTLI()->isLegalICmpImmediate(imm); 306 } 307 308 bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, 309 bool HasBaseReg, int64_t Scale, 310 unsigned AddrSpace, Instruction *I = nullptr) { 311 TargetLoweringBase::AddrMode AM; 312 AM.BaseGV = BaseGV; 313 AM.BaseOffs = BaseOffset; 314 AM.HasBaseReg = HasBaseReg; 315 AM.Scale = Scale; 316 return getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace, I); 317 } 318 319 unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, 320 Type *ScalarValTy) const { 321 auto &&IsSupportedByTarget = [this, ScalarMemTy, ScalarValTy](unsigned VF) { 322 auto *SrcTy = FixedVectorType::get(ScalarMemTy, VF / 2); 323 EVT VT = getTLI()->getValueType(DL, SrcTy); 324 if (getTLI()->isOperationLegal(ISD::STORE, VT) || 325 getTLI()->isOperationCustom(ISD::STORE, VT)) 326 return true; 327 328 EVT ValVT = 329 getTLI()->getValueType(DL, FixedVectorType::get(ScalarValTy, VF / 2)); 330 EVT LegalizedVT = 331 getTLI()->getTypeToTransformTo(ScalarMemTy->getContext(), VT); 332 return getTLI()->isTruncStoreLegal(LegalizedVT, ValVT); 333 }; 334 while (VF > 2 && IsSupportedByTarget(VF)) 335 VF /= 2; 336 return VF; 337 } 338 339 bool isIndexedLoadLegal(TTI::MemIndexedMode M, Type *Ty, 340 const DataLayout &DL) const { 341 EVT VT = getTLI()->getValueType(DL, Ty); 342 return getTLI()->isIndexedLoadLegal(getISDIndexedMode(M), VT); 343 } 344 345 bool isIndexedStoreLegal(TTI::MemIndexedMode M, Type *Ty, 346 const DataLayout &DL) const { 347 EVT VT = getTLI()->getValueType(DL, Ty); 348 return getTLI()->isIndexedStoreLegal(getISDIndexedMode(M), VT); 349 } 350 351 bool isLSRCostLess(TTI::LSRCost C1, TTI::LSRCost C2) { 352 return TargetTransformInfoImplBase::isLSRCostLess(C1, C2); 353 } 354 355 bool isNumRegsMajorCostOfLSR() { 356 return TargetTransformInfoImplBase::isNumRegsMajorCostOfLSR(); 357 } 358 359 bool isProfitableLSRChainElement(Instruction *I) { 360 return TargetTransformInfoImplBase::isProfitableLSRChainElement(I); 361 } 362 363 InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, 364 int64_t BaseOffset, bool HasBaseReg, 365 int64_t Scale, unsigned AddrSpace) { 366 TargetLoweringBase::AddrMode AM; 367 AM.BaseGV = BaseGV; 368 AM.BaseOffs = BaseOffset; 369 AM.HasBaseReg = HasBaseReg; 370 AM.Scale = Scale; 371 return getTLI()->getScalingFactorCost(DL, AM, Ty, AddrSpace); 372 } 373 374 bool isTruncateFree(Type *Ty1, Type *Ty2) { 375 return getTLI()->isTruncateFree(Ty1, Ty2); 376 } 377 378 bool isProfitableToHoist(Instruction *I) { 379 return getTLI()->isProfitableToHoist(I); 380 } 381 382 bool useAA() const { return getST()->useAA(); } 383 384 bool isTypeLegal(Type *Ty) { 385 EVT VT = getTLI()->getValueType(DL, Ty); 386 return getTLI()->isTypeLegal(VT); 387 } 388 389 unsigned getRegUsageForType(Type *Ty) { 390 EVT ETy = getTLI()->getValueType(DL, Ty); 391 return getTLI()->getNumRegisters(Ty->getContext(), ETy); 392 } 393 394 InstructionCost getGEPCost(Type *PointeeType, const Value *Ptr, 395 ArrayRef<const Value *> Operands, 396 TTI::TargetCostKind CostKind) { 397 return BaseT::getGEPCost(PointeeType, Ptr, Operands, CostKind); 398 } 399 400 unsigned getEstimatedNumberOfCaseClusters(const SwitchInst &SI, 401 unsigned &JumpTableSize, 402 ProfileSummaryInfo *PSI, 403 BlockFrequencyInfo *BFI) { 404 /// Try to find the estimated number of clusters. Note that the number of 405 /// clusters identified in this function could be different from the actual 406 /// numbers found in lowering. This function ignore switches that are 407 /// lowered with a mix of jump table / bit test / BTree. This function was 408 /// initially intended to be used when estimating the cost of switch in 409 /// inline cost heuristic, but it's a generic cost model to be used in other 410 /// places (e.g., in loop unrolling). 411 unsigned N = SI.getNumCases(); 412 const TargetLoweringBase *TLI = getTLI(); 413 const DataLayout &DL = this->getDataLayout(); 414 415 JumpTableSize = 0; 416 bool IsJTAllowed = TLI->areJTsAllowed(SI.getParent()->getParent()); 417 418 // Early exit if both a jump table and bit test are not allowed. 419 if (N < 1 || (!IsJTAllowed && DL.getIndexSizeInBits(0u) < N)) 420 return N; 421 422 APInt MaxCaseVal = SI.case_begin()->getCaseValue()->getValue(); 423 APInt MinCaseVal = MaxCaseVal; 424 for (auto CI : SI.cases()) { 425 const APInt &CaseVal = CI.getCaseValue()->getValue(); 426 if (CaseVal.sgt(MaxCaseVal)) 427 MaxCaseVal = CaseVal; 428 if (CaseVal.slt(MinCaseVal)) 429 MinCaseVal = CaseVal; 430 } 431 432 // Check if suitable for a bit test 433 if (N <= DL.getIndexSizeInBits(0u)) { 434 SmallPtrSet<const BasicBlock *, 4> Dests; 435 for (auto I : SI.cases()) 436 Dests.insert(I.getCaseSuccessor()); 437 438 if (TLI->isSuitableForBitTests(Dests.size(), N, MinCaseVal, MaxCaseVal, 439 DL)) 440 return 1; 441 } 442 443 // Check if suitable for a jump table. 444 if (IsJTAllowed) { 445 if (N < 2 || N < TLI->getMinimumJumpTableEntries()) 446 return N; 447 uint64_t Range = 448 (MaxCaseVal - MinCaseVal) 449 .getLimitedValue(std::numeric_limits<uint64_t>::max() - 1) + 1; 450 // Check whether a range of clusters is dense enough for a jump table 451 if (TLI->isSuitableForJumpTable(&SI, N, Range, PSI, BFI)) { 452 JumpTableSize = Range; 453 return 1; 454 } 455 } 456 return N; 457 } 458 459 bool shouldBuildLookupTables() { 460 const TargetLoweringBase *TLI = getTLI(); 461 return TLI->isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) || 462 TLI->isOperationLegalOrCustom(ISD::BRIND, MVT::Other); 463 } 464 465 bool shouldBuildRelLookupTables() const { 466 const TargetMachine &TM = getTLI()->getTargetMachine(); 467 // If non-PIC mode, do not generate a relative lookup table. 468 if (!TM.isPositionIndependent()) 469 return false; 470 471 /// Relative lookup table entries consist of 32-bit offsets. 472 /// Do not generate relative lookup tables for large code models 473 /// in 64-bit achitectures where 32-bit offsets might not be enough. 474 if (TM.getCodeModel() == CodeModel::Medium || 475 TM.getCodeModel() == CodeModel::Large) 476 return false; 477 478 Triple TargetTriple = TM.getTargetTriple(); 479 if (!TargetTriple.isArch64Bit()) 480 return false; 481 482 // TODO: Triggers issues on aarch64 on darwin, so temporarily disable it 483 // there. 484 if (TargetTriple.getArch() == Triple::aarch64 && TargetTriple.isOSDarwin()) 485 return false; 486 487 return true; 488 } 489 490 bool haveFastSqrt(Type *Ty) { 491 const TargetLoweringBase *TLI = getTLI(); 492 EVT VT = TLI->getValueType(DL, Ty); 493 return TLI->isTypeLegal(VT) && 494 TLI->isOperationLegalOrCustom(ISD::FSQRT, VT); 495 } 496 497 bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) { 498 return true; 499 } 500 501 InstructionCost getFPOpCost(Type *Ty) { 502 // Check whether FADD is available, as a proxy for floating-point in 503 // general. 504 const TargetLoweringBase *TLI = getTLI(); 505 EVT VT = TLI->getValueType(DL, Ty); 506 if (TLI->isOperationLegalOrCustomOrPromote(ISD::FADD, VT)) 507 return TargetTransformInfo::TCC_Basic; 508 return TargetTransformInfo::TCC_Expensive; 509 } 510 511 unsigned getInliningThresholdMultiplier() { return 1; } 512 unsigned adjustInliningThreshold(const CallBase *CB) { return 0; } 513 514 int getInlinerVectorBonusPercent() { return 150; } 515 516 void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, 517 TTI::UnrollingPreferences &UP, 518 OptimizationRemarkEmitter *ORE) { 519 // This unrolling functionality is target independent, but to provide some 520 // motivation for its intended use, for x86: 521 522 // According to the Intel 64 and IA-32 Architectures Optimization Reference 523 // Manual, Intel Core models and later have a loop stream detector (and 524 // associated uop queue) that can benefit from partial unrolling. 525 // The relevant requirements are: 526 // - The loop must have no more than 4 (8 for Nehalem and later) branches 527 // taken, and none of them may be calls. 528 // - The loop can have no more than 18 (28 for Nehalem and later) uops. 529 530 // According to the Software Optimization Guide for AMD Family 15h 531 // Processors, models 30h-4fh (Steamroller and later) have a loop predictor 532 // and loop buffer which can benefit from partial unrolling. 533 // The relevant requirements are: 534 // - The loop must have fewer than 16 branches 535 // - The loop must have less than 40 uops in all executed loop branches 536 537 // The number of taken branches in a loop is hard to estimate here, and 538 // benchmarking has revealed that it is better not to be conservative when 539 // estimating the branch count. As a result, we'll ignore the branch limits 540 // until someone finds a case where it matters in practice. 541 542 unsigned MaxOps; 543 const TargetSubtargetInfo *ST = getST(); 544 if (PartialUnrollingThreshold.getNumOccurrences() > 0) 545 MaxOps = PartialUnrollingThreshold; 546 else if (ST->getSchedModel().LoopMicroOpBufferSize > 0) 547 MaxOps = ST->getSchedModel().LoopMicroOpBufferSize; 548 else 549 return; 550 551 // Scan the loop: don't unroll loops with calls. 552 for (BasicBlock *BB : L->blocks()) { 553 for (Instruction &I : *BB) { 554 if (isa<CallInst>(I) || isa<InvokeInst>(I)) { 555 if (const Function *F = cast<CallBase>(I).getCalledFunction()) { 556 if (!thisT()->isLoweredToCall(F)) 557 continue; 558 } 559 560 if (ORE) { 561 ORE->emit([&]() { 562 return OptimizationRemark("TTI", "DontUnroll", L->getStartLoc(), 563 L->getHeader()) 564 << "advising against unrolling the loop because it " 565 "contains a " 566 << ore::NV("Call", &I); 567 }); 568 } 569 return; 570 } 571 } 572 } 573 574 // Enable runtime and partial unrolling up to the specified size. 575 // Enable using trip count upper bound to unroll loops. 576 UP.Partial = UP.Runtime = UP.UpperBound = true; 577 UP.PartialThreshold = MaxOps; 578 579 // Avoid unrolling when optimizing for size. 580 UP.OptSizeThreshold = 0; 581 UP.PartialOptSizeThreshold = 0; 582 583 // Set number of instructions optimized when "back edge" 584 // becomes "fall through" to default value of 2. 585 UP.BEInsns = 2; 586 } 587 588 void getPeelingPreferences(Loop *L, ScalarEvolution &SE, 589 TTI::PeelingPreferences &PP) { 590 PP.PeelCount = 0; 591 PP.AllowPeeling = true; 592 PP.AllowLoopNestsPeeling = false; 593 PP.PeelProfiledIterations = true; 594 } 595 596 bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, 597 AssumptionCache &AC, 598 TargetLibraryInfo *LibInfo, 599 HardwareLoopInfo &HWLoopInfo) { 600 return BaseT::isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo); 601 } 602 603 bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE, 604 AssumptionCache &AC, TargetLibraryInfo *TLI, 605 DominatorTree *DT, 606 LoopVectorizationLegality *LVL) { 607 return BaseT::preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LVL); 608 } 609 610 PredicationStyle emitGetActiveLaneMask() { 611 return BaseT::emitGetActiveLaneMask(); 612 } 613 614 Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC, 615 IntrinsicInst &II) { 616 return BaseT::instCombineIntrinsic(IC, II); 617 } 618 619 Optional<Value *> simplifyDemandedUseBitsIntrinsic(InstCombiner &IC, 620 IntrinsicInst &II, 621 APInt DemandedMask, 622 KnownBits &Known, 623 bool &KnownBitsComputed) { 624 return BaseT::simplifyDemandedUseBitsIntrinsic(IC, II, DemandedMask, Known, 625 KnownBitsComputed); 626 } 627 628 Optional<Value *> simplifyDemandedVectorEltsIntrinsic( 629 InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, 630 APInt &UndefElts2, APInt &UndefElts3, 631 std::function<void(Instruction *, unsigned, APInt, APInt &)> 632 SimplifyAndSetOp) { 633 return BaseT::simplifyDemandedVectorEltsIntrinsic( 634 IC, II, DemandedElts, UndefElts, UndefElts2, UndefElts3, 635 SimplifyAndSetOp); 636 } 637 638 InstructionCost getInstructionLatency(const Instruction *I) { 639 if (isa<LoadInst>(I)) 640 return getST()->getSchedModel().DefaultLoadLatency; 641 642 return BaseT::getInstructionLatency(I); 643 } 644 645 virtual Optional<unsigned> 646 getCacheSize(TargetTransformInfo::CacheLevel Level) const { 647 return Optional<unsigned>( 648 getST()->getCacheSize(static_cast<unsigned>(Level))); 649 } 650 651 virtual Optional<unsigned> 652 getCacheAssociativity(TargetTransformInfo::CacheLevel Level) const { 653 Optional<unsigned> TargetResult = 654 getST()->getCacheAssociativity(static_cast<unsigned>(Level)); 655 656 if (TargetResult) 657 return TargetResult; 658 659 return BaseT::getCacheAssociativity(Level); 660 } 661 662 virtual unsigned getCacheLineSize() const { 663 return getST()->getCacheLineSize(); 664 } 665 666 virtual unsigned getPrefetchDistance() const { 667 return getST()->getPrefetchDistance(); 668 } 669 670 virtual unsigned getMinPrefetchStride(unsigned NumMemAccesses, 671 unsigned NumStridedMemAccesses, 672 unsigned NumPrefetches, 673 bool HasCall) const { 674 return getST()->getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses, 675 NumPrefetches, HasCall); 676 } 677 678 virtual unsigned getMaxPrefetchIterationsAhead() const { 679 return getST()->getMaxPrefetchIterationsAhead(); 680 } 681 682 virtual bool enableWritePrefetching() const { 683 return getST()->enableWritePrefetching(); 684 } 685 686 /// @} 687 688 /// \name Vector TTI Implementations 689 /// @{ 690 691 TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const { 692 return TypeSize::getFixed(32); 693 } 694 695 Optional<unsigned> getMaxVScale() const { return None; } 696 Optional<unsigned> getVScaleForTuning() const { return None; } 697 698 /// Estimate the overhead of scalarizing an instruction. Insert and Extract 699 /// are set if the demanded result elements need to be inserted and/or 700 /// extracted from vectors. 701 InstructionCost getScalarizationOverhead(VectorType *InTy, 702 const APInt &DemandedElts, 703 bool Insert, bool Extract) { 704 /// FIXME: a bitfield is not a reasonable abstraction for talking about 705 /// which elements are needed from a scalable vector 706 if (isa<ScalableVectorType>(InTy)) 707 return InstructionCost::getInvalid(); 708 auto *Ty = cast<FixedVectorType>(InTy); 709 710 assert(DemandedElts.getBitWidth() == Ty->getNumElements() && 711 "Vector size mismatch"); 712 713 InstructionCost Cost = 0; 714 715 for (int i = 0, e = Ty->getNumElements(); i < e; ++i) { 716 if (!DemandedElts[i]) 717 continue; 718 if (Insert) 719 Cost += thisT()->getVectorInstrCost(Instruction::InsertElement, Ty, i); 720 if (Extract) 721 Cost += thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, i); 722 } 723 724 return Cost; 725 } 726 727 /// Helper wrapper for the DemandedElts variant of getScalarizationOverhead. 728 InstructionCost getScalarizationOverhead(VectorType *InTy, bool Insert, 729 bool Extract) { 730 if (isa<ScalableVectorType>(InTy)) 731 return InstructionCost::getInvalid(); 732 auto *Ty = cast<FixedVectorType>(InTy); 733 734 APInt DemandedElts = APInt::getAllOnes(Ty->getNumElements()); 735 return thisT()->getScalarizationOverhead(Ty, DemandedElts, Insert, Extract); 736 } 737 738 /// Estimate the overhead of scalarizing an instructions unique 739 /// non-constant operands. The (potentially vector) types to use for each of 740 /// argument are passes via Tys. 741 InstructionCost getOperandsScalarizationOverhead(ArrayRef<const Value *> Args, 742 ArrayRef<Type *> Tys) { 743 assert(Args.size() == Tys.size() && "Expected matching Args and Tys"); 744 745 InstructionCost Cost = 0; 746 SmallPtrSet<const Value*, 4> UniqueOperands; 747 for (int I = 0, E = Args.size(); I != E; I++) { 748 // Disregard things like metadata arguments. 749 const Value *A = Args[I]; 750 Type *Ty = Tys[I]; 751 if (!Ty->isIntOrIntVectorTy() && !Ty->isFPOrFPVectorTy() && 752 !Ty->isPtrOrPtrVectorTy()) 753 continue; 754 755 if (!isa<Constant>(A) && UniqueOperands.insert(A).second) { 756 if (auto *VecTy = dyn_cast<VectorType>(Ty)) 757 Cost += getScalarizationOverhead(VecTy, false, true); 758 } 759 } 760 761 return Cost; 762 } 763 764 /// Estimate the overhead of scalarizing the inputs and outputs of an 765 /// instruction, with return type RetTy and arguments Args of type Tys. If 766 /// Args are unknown (empty), then the cost associated with one argument is 767 /// added as a heuristic. 768 InstructionCost getScalarizationOverhead(VectorType *RetTy, 769 ArrayRef<const Value *> Args, 770 ArrayRef<Type *> Tys) { 771 InstructionCost Cost = getScalarizationOverhead(RetTy, true, false); 772 if (!Args.empty()) 773 Cost += getOperandsScalarizationOverhead(Args, Tys); 774 else 775 // When no information on arguments is provided, we add the cost 776 // associated with one argument as a heuristic. 777 Cost += getScalarizationOverhead(RetTy, false, true); 778 779 return Cost; 780 } 781 782 unsigned getMaxInterleaveFactor(unsigned VF) { return 1; } 783 784 InstructionCost getArithmeticInstrCost( 785 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, 786 TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue, 787 TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue, 788 TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None, 789 TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None, 790 ArrayRef<const Value *> Args = ArrayRef<const Value *>(), 791 const Instruction *CxtI = nullptr) { 792 // Check if any of the operands are vector operands. 793 const TargetLoweringBase *TLI = getTLI(); 794 int ISD = TLI->InstructionOpcodeToISD(Opcode); 795 assert(ISD && "Invalid opcode"); 796 797 // TODO: Handle more cost kinds. 798 if (CostKind != TTI::TCK_RecipThroughput) 799 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, 800 Opd1Info, Opd2Info, 801 Opd1PropInfo, Opd2PropInfo, 802 Args, CxtI); 803 804 std::pair<InstructionCost, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); 805 806 bool IsFloat = Ty->isFPOrFPVectorTy(); 807 // Assume that floating point arithmetic operations cost twice as much as 808 // integer operations. 809 InstructionCost OpCost = (IsFloat ? 2 : 1); 810 811 if (TLI->isOperationLegalOrPromote(ISD, LT.second)) { 812 // The operation is legal. Assume it costs 1. 813 // TODO: Once we have extract/insert subvector cost we need to use them. 814 return LT.first * OpCost; 815 } 816 817 if (!TLI->isOperationExpand(ISD, LT.second)) { 818 // If the operation is custom lowered, then assume that the code is twice 819 // as expensive. 820 return LT.first * 2 * OpCost; 821 } 822 823 // An 'Expand' of URem and SRem is special because it may default 824 // to expanding the operation into a sequence of sub-operations 825 // i.e. X % Y -> X-(X/Y)*Y. 826 if (ISD == ISD::UREM || ISD == ISD::SREM) { 827 bool IsSigned = ISD == ISD::SREM; 828 if (TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, 829 LT.second) || 830 TLI->isOperationLegalOrCustom(IsSigned ? ISD::SDIV : ISD::UDIV, 831 LT.second)) { 832 unsigned DivOpc = IsSigned ? Instruction::SDiv : Instruction::UDiv; 833 InstructionCost DivCost = thisT()->getArithmeticInstrCost( 834 DivOpc, Ty, CostKind, Opd1Info, Opd2Info, Opd1PropInfo, 835 Opd2PropInfo); 836 InstructionCost MulCost = 837 thisT()->getArithmeticInstrCost(Instruction::Mul, Ty, CostKind); 838 InstructionCost SubCost = 839 thisT()->getArithmeticInstrCost(Instruction::Sub, Ty, CostKind); 840 return DivCost + MulCost + SubCost; 841 } 842 } 843 844 // We cannot scalarize scalable vectors, so return Invalid. 845 if (isa<ScalableVectorType>(Ty)) 846 return InstructionCost::getInvalid(); 847 848 // Else, assume that we need to scalarize this op. 849 // TODO: If one of the types get legalized by splitting, handle this 850 // similarly to what getCastInstrCost() does. 851 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) { 852 InstructionCost Cost = thisT()->getArithmeticInstrCost( 853 Opcode, VTy->getScalarType(), CostKind, Opd1Info, Opd2Info, 854 Opd1PropInfo, Opd2PropInfo, Args, CxtI); 855 // Return the cost of multiple scalar invocation plus the cost of 856 // inserting and extracting the values. 857 SmallVector<Type *> Tys(Args.size(), Ty); 858 return getScalarizationOverhead(VTy, Args, Tys) + 859 VTy->getNumElements() * Cost; 860 } 861 862 // We don't know anything about this scalar instruction. 863 return OpCost; 864 } 865 866 TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, 867 ArrayRef<int> Mask) const { 868 int Limit = Mask.size() * 2; 869 if (Mask.empty() || 870 // Extra check required by isSingleSourceMaskImpl function (called by 871 // ShuffleVectorInst::isSingleSourceMask). 872 any_of(Mask, [Limit](int I) { return I >= Limit; })) 873 return Kind; 874 switch (Kind) { 875 case TTI::SK_PermuteSingleSrc: 876 if (ShuffleVectorInst::isReverseMask(Mask)) 877 return TTI::SK_Reverse; 878 if (ShuffleVectorInst::isZeroEltSplatMask(Mask)) 879 return TTI::SK_Broadcast; 880 break; 881 case TTI::SK_PermuteTwoSrc: 882 if (ShuffleVectorInst::isSelectMask(Mask)) 883 return TTI::SK_Select; 884 if (ShuffleVectorInst::isTransposeMask(Mask)) 885 return TTI::SK_Transpose; 886 break; 887 case TTI::SK_Select: 888 case TTI::SK_Reverse: 889 case TTI::SK_Broadcast: 890 case TTI::SK_Transpose: 891 case TTI::SK_InsertSubvector: 892 case TTI::SK_ExtractSubvector: 893 case TTI::SK_Splice: 894 break; 895 } 896 return Kind; 897 } 898 899 InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, 900 ArrayRef<int> Mask, int Index, 901 VectorType *SubTp, 902 ArrayRef<const Value *> Args = None) { 903 904 switch (improveShuffleKindFromMask(Kind, Mask)) { 905 case TTI::SK_Broadcast: 906 if (auto *FVT = dyn_cast<FixedVectorType>(Tp)) 907 return getBroadcastShuffleOverhead(FVT); 908 return InstructionCost::getInvalid(); 909 case TTI::SK_Select: 910 case TTI::SK_Splice: 911 case TTI::SK_Reverse: 912 case TTI::SK_Transpose: 913 case TTI::SK_PermuteSingleSrc: 914 case TTI::SK_PermuteTwoSrc: 915 if (auto *FVT = dyn_cast<FixedVectorType>(Tp)) 916 return getPermuteShuffleOverhead(FVT); 917 return InstructionCost::getInvalid(); 918 case TTI::SK_ExtractSubvector: 919 return getExtractSubvectorOverhead(Tp, Index, 920 cast<FixedVectorType>(SubTp)); 921 case TTI::SK_InsertSubvector: 922 return getInsertSubvectorOverhead(Tp, Index, 923 cast<FixedVectorType>(SubTp)); 924 } 925 llvm_unreachable("Unknown TTI::ShuffleKind"); 926 } 927 928 InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, 929 TTI::CastContextHint CCH, 930 TTI::TargetCostKind CostKind, 931 const Instruction *I = nullptr) { 932 if (BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I) == 0) 933 return 0; 934 935 const TargetLoweringBase *TLI = getTLI(); 936 int ISD = TLI->InstructionOpcodeToISD(Opcode); 937 assert(ISD && "Invalid opcode"); 938 std::pair<InstructionCost, MVT> SrcLT = 939 TLI->getTypeLegalizationCost(DL, Src); 940 std::pair<InstructionCost, MVT> DstLT = 941 TLI->getTypeLegalizationCost(DL, Dst); 942 943 TypeSize SrcSize = SrcLT.second.getSizeInBits(); 944 TypeSize DstSize = DstLT.second.getSizeInBits(); 945 bool IntOrPtrSrc = Src->isIntegerTy() || Src->isPointerTy(); 946 bool IntOrPtrDst = Dst->isIntegerTy() || Dst->isPointerTy(); 947 948 switch (Opcode) { 949 default: 950 break; 951 case Instruction::Trunc: 952 // Check for NOOP conversions. 953 if (TLI->isTruncateFree(SrcLT.second, DstLT.second)) 954 return 0; 955 LLVM_FALLTHROUGH; 956 case Instruction::BitCast: 957 // Bitcast between types that are legalized to the same type are free and 958 // assume int to/from ptr of the same size is also free. 959 if (SrcLT.first == DstLT.first && IntOrPtrSrc == IntOrPtrDst && 960 SrcSize == DstSize) 961 return 0; 962 break; 963 case Instruction::FPExt: 964 if (I && getTLI()->isExtFree(I)) 965 return 0; 966 break; 967 case Instruction::ZExt: 968 if (TLI->isZExtFree(SrcLT.second, DstLT.second)) 969 return 0; 970 LLVM_FALLTHROUGH; 971 case Instruction::SExt: 972 if (I && getTLI()->isExtFree(I)) 973 return 0; 974 975 // If this is a zext/sext of a load, return 0 if the corresponding 976 // extending load exists on target and the result type is legal. 977 if (CCH == TTI::CastContextHint::Normal) { 978 EVT ExtVT = EVT::getEVT(Dst); 979 EVT LoadVT = EVT::getEVT(Src); 980 unsigned LType = 981 ((Opcode == Instruction::ZExt) ? ISD::ZEXTLOAD : ISD::SEXTLOAD); 982 if (DstLT.first == SrcLT.first && 983 TLI->isLoadExtLegal(LType, ExtVT, LoadVT)) 984 return 0; 985 } 986 break; 987 case Instruction::AddrSpaceCast: 988 if (TLI->isFreeAddrSpaceCast(Src->getPointerAddressSpace(), 989 Dst->getPointerAddressSpace())) 990 return 0; 991 break; 992 } 993 994 auto *SrcVTy = dyn_cast<VectorType>(Src); 995 auto *DstVTy = dyn_cast<VectorType>(Dst); 996 997 // If the cast is marked as legal (or promote) then assume low cost. 998 if (SrcLT.first == DstLT.first && 999 TLI->isOperationLegalOrPromote(ISD, DstLT.second)) 1000 return SrcLT.first; 1001 1002 // Handle scalar conversions. 1003 if (!SrcVTy && !DstVTy) { 1004 // Just check the op cost. If the operation is legal then assume it costs 1005 // 1. 1006 if (!TLI->isOperationExpand(ISD, DstLT.second)) 1007 return 1; 1008 1009 // Assume that illegal scalar instruction are expensive. 1010 return 4; 1011 } 1012 1013 // Check vector-to-vector casts. 1014 if (DstVTy && SrcVTy) { 1015 // If the cast is between same-sized registers, then the check is simple. 1016 if (SrcLT.first == DstLT.first && SrcSize == DstSize) { 1017 1018 // Assume that Zext is done using AND. 1019 if (Opcode == Instruction::ZExt) 1020 return SrcLT.first; 1021 1022 // Assume that sext is done using SHL and SRA. 1023 if (Opcode == Instruction::SExt) 1024 return SrcLT.first * 2; 1025 1026 // Just check the op cost. If the operation is legal then assume it 1027 // costs 1028 // 1 and multiply by the type-legalization overhead. 1029 if (!TLI->isOperationExpand(ISD, DstLT.second)) 1030 return SrcLT.first * 1; 1031 } 1032 1033 // If we are legalizing by splitting, query the concrete TTI for the cost 1034 // of casting the original vector twice. We also need to factor in the 1035 // cost of the split itself. Count that as 1, to be consistent with 1036 // TLI->getTypeLegalizationCost(). 1037 bool SplitSrc = 1038 TLI->getTypeAction(Src->getContext(), TLI->getValueType(DL, Src)) == 1039 TargetLowering::TypeSplitVector; 1040 bool SplitDst = 1041 TLI->getTypeAction(Dst->getContext(), TLI->getValueType(DL, Dst)) == 1042 TargetLowering::TypeSplitVector; 1043 if ((SplitSrc || SplitDst) && SrcVTy->getElementCount().isVector() && 1044 DstVTy->getElementCount().isVector()) { 1045 Type *SplitDstTy = VectorType::getHalfElementsVectorType(DstVTy); 1046 Type *SplitSrcTy = VectorType::getHalfElementsVectorType(SrcVTy); 1047 T *TTI = static_cast<T *>(this); 1048 // If both types need to be split then the split is free. 1049 InstructionCost SplitCost = 1050 (!SplitSrc || !SplitDst) ? TTI->getVectorSplitCost() : 0; 1051 return SplitCost + 1052 (2 * TTI->getCastInstrCost(Opcode, SplitDstTy, SplitSrcTy, CCH, 1053 CostKind, I)); 1054 } 1055 1056 // Scalarization cost is Invalid, can't assume any num elements. 1057 if (isa<ScalableVectorType>(DstVTy)) 1058 return InstructionCost::getInvalid(); 1059 1060 // In other cases where the source or destination are illegal, assume 1061 // the operation will get scalarized. 1062 unsigned Num = cast<FixedVectorType>(DstVTy)->getNumElements(); 1063 InstructionCost Cost = thisT()->getCastInstrCost( 1064 Opcode, Dst->getScalarType(), Src->getScalarType(), CCH, CostKind, I); 1065 1066 // Return the cost of multiple scalar invocation plus the cost of 1067 // inserting and extracting the values. 1068 return getScalarizationOverhead(DstVTy, true, true) + Num * Cost; 1069 } 1070 1071 // We already handled vector-to-vector and scalar-to-scalar conversions. 1072 // This 1073 // is where we handle bitcast between vectors and scalars. We need to assume 1074 // that the conversion is scalarized in one way or another. 1075 if (Opcode == Instruction::BitCast) { 1076 // Illegal bitcasts are done by storing and loading from a stack slot. 1077 return (SrcVTy ? getScalarizationOverhead(SrcVTy, false, true) : 0) + 1078 (DstVTy ? getScalarizationOverhead(DstVTy, true, false) : 0); 1079 } 1080 1081 llvm_unreachable("Unhandled cast"); 1082 } 1083 1084 InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, 1085 VectorType *VecTy, unsigned Index) { 1086 return thisT()->getVectorInstrCost(Instruction::ExtractElement, VecTy, 1087 Index) + 1088 thisT()->getCastInstrCost(Opcode, Dst, VecTy->getElementType(), 1089 TTI::CastContextHint::None, 1090 TTI::TCK_RecipThroughput); 1091 } 1092 1093 InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, 1094 const Instruction *I = nullptr) { 1095 return BaseT::getCFInstrCost(Opcode, CostKind, I); 1096 } 1097 1098 InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, 1099 CmpInst::Predicate VecPred, 1100 TTI::TargetCostKind CostKind, 1101 const Instruction *I = nullptr) { 1102 const TargetLoweringBase *TLI = getTLI(); 1103 int ISD = TLI->InstructionOpcodeToISD(Opcode); 1104 assert(ISD && "Invalid opcode"); 1105 1106 // TODO: Handle other cost kinds. 1107 if (CostKind != TTI::TCK_RecipThroughput) 1108 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, 1109 I); 1110 1111 // Selects on vectors are actually vector selects. 1112 if (ISD == ISD::SELECT) { 1113 assert(CondTy && "CondTy must exist"); 1114 if (CondTy->isVectorTy()) 1115 ISD = ISD::VSELECT; 1116 } 1117 std::pair<InstructionCost, MVT> LT = 1118 TLI->getTypeLegalizationCost(DL, ValTy); 1119 1120 if (!(ValTy->isVectorTy() && !LT.second.isVector()) && 1121 !TLI->isOperationExpand(ISD, LT.second)) { 1122 // The operation is legal. Assume it costs 1. Multiply 1123 // by the type-legalization overhead. 1124 return LT.first * 1; 1125 } 1126 1127 // Otherwise, assume that the cast is scalarized. 1128 // TODO: If one of the types get legalized by splitting, handle this 1129 // similarly to what getCastInstrCost() does. 1130 if (auto *ValVTy = dyn_cast<VectorType>(ValTy)) { 1131 if (isa<ScalableVectorType>(ValTy)) 1132 return InstructionCost::getInvalid(); 1133 1134 unsigned Num = cast<FixedVectorType>(ValVTy)->getNumElements(); 1135 if (CondTy) 1136 CondTy = CondTy->getScalarType(); 1137 InstructionCost Cost = thisT()->getCmpSelInstrCost( 1138 Opcode, ValVTy->getScalarType(), CondTy, VecPred, CostKind, I); 1139 1140 // Return the cost of multiple scalar invocation plus the cost of 1141 // inserting and extracting the values. 1142 return getScalarizationOverhead(ValVTy, true, false) + Num * Cost; 1143 } 1144 1145 // Unknown scalar opcode. 1146 return 1; 1147 } 1148 1149 InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, 1150 unsigned Index) { 1151 std::pair<InstructionCost, MVT> LT = 1152 getTLI()->getTypeLegalizationCost(DL, Val->getScalarType()); 1153 1154 return LT.first; 1155 } 1156 1157 InstructionCost getReplicationShuffleCost(Type *EltTy, int ReplicationFactor, 1158 int VF, 1159 const APInt &DemandedDstElts, 1160 TTI::TargetCostKind CostKind) { 1161 assert(DemandedDstElts.getBitWidth() == (unsigned)VF * ReplicationFactor && 1162 "Unexpected size of DemandedDstElts."); 1163 1164 InstructionCost Cost; 1165 1166 auto *SrcVT = FixedVectorType::get(EltTy, VF); 1167 auto *ReplicatedVT = FixedVectorType::get(EltTy, VF * ReplicationFactor); 1168 1169 // The Mask shuffling cost is extract all the elements of the Mask 1170 // and insert each of them Factor times into the wide vector: 1171 // 1172 // E.g. an interleaved group with factor 3: 1173 // %mask = icmp ult <8 x i32> %vec1, %vec2 1174 // %interleaved.mask = shufflevector <8 x i1> %mask, <8 x i1> undef, 1175 // <24 x i32> <0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7> 1176 // The cost is estimated as extract all mask elements from the <8xi1> mask 1177 // vector and insert them factor times into the <24xi1> shuffled mask 1178 // vector. 1179 APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedDstElts, VF); 1180 Cost += thisT()->getScalarizationOverhead(SrcVT, DemandedSrcElts, 1181 /*Insert*/ false, 1182 /*Extract*/ true); 1183 Cost += 1184 thisT()->getScalarizationOverhead(ReplicatedVT, DemandedDstElts, 1185 /*Insert*/ true, /*Extract*/ false); 1186 1187 return Cost; 1188 } 1189 1190 InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, 1191 MaybeAlign Alignment, unsigned AddressSpace, 1192 TTI::TargetCostKind CostKind, 1193 const Instruction *I = nullptr) { 1194 assert(!Src->isVoidTy() && "Invalid type"); 1195 // Assume types, such as structs, are expensive. 1196 if (getTLI()->getValueType(DL, Src, true) == MVT::Other) 1197 return 4; 1198 std::pair<InstructionCost, MVT> LT = 1199 getTLI()->getTypeLegalizationCost(DL, Src); 1200 1201 // Assuming that all loads of legal types cost 1. 1202 InstructionCost Cost = LT.first; 1203 if (CostKind != TTI::TCK_RecipThroughput) 1204 return Cost; 1205 1206 const DataLayout &DL = this->getDataLayout(); 1207 if (Src->isVectorTy() && 1208 // In practice it's not currently possible to have a change in lane 1209 // length for extending loads or truncating stores so both types should 1210 // have the same scalable property. 1211 TypeSize::isKnownLT(DL.getTypeStoreSizeInBits(Src), 1212 LT.second.getSizeInBits())) { 1213 // This is a vector load that legalizes to a larger type than the vector 1214 // itself. Unless the corresponding extending load or truncating store is 1215 // legal, then this will scalarize. 1216 TargetLowering::LegalizeAction LA = TargetLowering::Expand; 1217 EVT MemVT = getTLI()->getValueType(DL, Src); 1218 if (Opcode == Instruction::Store) 1219 LA = getTLI()->getTruncStoreAction(LT.second, MemVT); 1220 else 1221 LA = getTLI()->getLoadExtAction(ISD::EXTLOAD, LT.second, MemVT); 1222 1223 if (LA != TargetLowering::Legal && LA != TargetLowering::Custom) { 1224 // This is a vector load/store for some illegal type that is scalarized. 1225 // We must account for the cost of building or decomposing the vector. 1226 Cost += getScalarizationOverhead(cast<VectorType>(Src), 1227 Opcode != Instruction::Store, 1228 Opcode == Instruction::Store); 1229 } 1230 } 1231 1232 return Cost; 1233 } 1234 1235 InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy, 1236 Align Alignment, unsigned AddressSpace, 1237 TTI::TargetCostKind CostKind) { 1238 return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, true, false, 1239 CostKind); 1240 } 1241 1242 InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, 1243 const Value *Ptr, bool VariableMask, 1244 Align Alignment, 1245 TTI::TargetCostKind CostKind, 1246 const Instruction *I = nullptr) { 1247 return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, VariableMask, 1248 true, CostKind); 1249 } 1250 1251 InstructionCost getInterleavedMemoryOpCost( 1252 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, 1253 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, 1254 bool UseMaskForCond = false, bool UseMaskForGaps = false) { 1255 1256 // We cannot scalarize scalable vectors, so return Invalid. 1257 if (isa<ScalableVectorType>(VecTy)) 1258 return InstructionCost::getInvalid(); 1259 1260 auto *VT = cast<FixedVectorType>(VecTy); 1261 1262 unsigned NumElts = VT->getNumElements(); 1263 assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor"); 1264 1265 unsigned NumSubElts = NumElts / Factor; 1266 auto *SubVT = FixedVectorType::get(VT->getElementType(), NumSubElts); 1267 1268 // Firstly, the cost of load/store operation. 1269 InstructionCost Cost; 1270 if (UseMaskForCond || UseMaskForGaps) 1271 Cost = thisT()->getMaskedMemoryOpCost(Opcode, VecTy, Alignment, 1272 AddressSpace, CostKind); 1273 else 1274 Cost = thisT()->getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, 1275 CostKind); 1276 1277 // Legalize the vector type, and get the legalized and unlegalized type 1278 // sizes. 1279 MVT VecTyLT = getTLI()->getTypeLegalizationCost(DL, VecTy).second; 1280 unsigned VecTySize = thisT()->getDataLayout().getTypeStoreSize(VecTy); 1281 unsigned VecTyLTSize = VecTyLT.getStoreSize(); 1282 1283 // Scale the cost of the memory operation by the fraction of legalized 1284 // instructions that will actually be used. We shouldn't account for the 1285 // cost of dead instructions since they will be removed. 1286 // 1287 // E.g., An interleaved load of factor 8: 1288 // %vec = load <16 x i64>, <16 x i64>* %ptr 1289 // %v0 = shufflevector %vec, undef, <0, 8> 1290 // 1291 // If <16 x i64> is legalized to 8 v2i64 loads, only 2 of the loads will be 1292 // used (those corresponding to elements [0:1] and [8:9] of the unlegalized 1293 // type). The other loads are unused. 1294 // 1295 // TODO: Note that legalization can turn masked loads/stores into unmasked 1296 // (legalized) loads/stores. This can be reflected in the cost. 1297 if (Cost.isValid() && VecTySize > VecTyLTSize) { 1298 // The number of loads of a legal type it will take to represent a load 1299 // of the unlegalized vector type. 1300 unsigned NumLegalInsts = divideCeil(VecTySize, VecTyLTSize); 1301 1302 // The number of elements of the unlegalized type that correspond to a 1303 // single legal instruction. 1304 unsigned NumEltsPerLegalInst = divideCeil(NumElts, NumLegalInsts); 1305 1306 // Determine which legal instructions will be used. 1307 BitVector UsedInsts(NumLegalInsts, false); 1308 for (unsigned Index : Indices) 1309 for (unsigned Elt = 0; Elt < NumSubElts; ++Elt) 1310 UsedInsts.set((Index + Elt * Factor) / NumEltsPerLegalInst); 1311 1312 // Scale the cost of the load by the fraction of legal instructions that 1313 // will be used. 1314 Cost = divideCeil(UsedInsts.count() * *Cost.getValue(), NumLegalInsts); 1315 } 1316 1317 // Then plus the cost of interleave operation. 1318 assert(Indices.size() <= Factor && 1319 "Interleaved memory op has too many members"); 1320 1321 const APInt DemandedAllSubElts = APInt::getAllOnes(NumSubElts); 1322 const APInt DemandedAllResultElts = APInt::getAllOnes(NumElts); 1323 1324 APInt DemandedLoadStoreElts = APInt::getZero(NumElts); 1325 for (unsigned Index : Indices) { 1326 assert(Index < Factor && "Invalid index for interleaved memory op"); 1327 for (unsigned Elm = 0; Elm < NumSubElts; Elm++) 1328 DemandedLoadStoreElts.setBit(Index + Elm * Factor); 1329 } 1330 1331 if (Opcode == Instruction::Load) { 1332 // The interleave cost is similar to extract sub vectors' elements 1333 // from the wide vector, and insert them into sub vectors. 1334 // 1335 // E.g. An interleaved load of factor 2 (with one member of index 0): 1336 // %vec = load <8 x i32>, <8 x i32>* %ptr 1337 // %v0 = shuffle %vec, undef, <0, 2, 4, 6> ; Index 0 1338 // The cost is estimated as extract elements at 0, 2, 4, 6 from the 1339 // <8 x i32> vector and insert them into a <4 x i32> vector. 1340 InstructionCost InsSubCost = 1341 thisT()->getScalarizationOverhead(SubVT, DemandedAllSubElts, 1342 /*Insert*/ true, /*Extract*/ false); 1343 Cost += Indices.size() * InsSubCost; 1344 Cost += 1345 thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts, 1346 /*Insert*/ false, /*Extract*/ true); 1347 } else { 1348 // The interleave cost is extract elements from sub vectors, and 1349 // insert them into the wide vector. 1350 // 1351 // E.g. An interleaved store of factor 3 with 2 members at indices 0,1: 1352 // (using VF=4): 1353 // %v0_v1 = shuffle %v0, %v1, <0,4,undef,1,5,undef,2,6,undef,3,7,undef> 1354 // %gaps.mask = <true, true, false, true, true, false, 1355 // true, true, false, true, true, false> 1356 // call llvm.masked.store <12 x i32> %v0_v1, <12 x i32>* %ptr, 1357 // i32 Align, <12 x i1> %gaps.mask 1358 // The cost is estimated as extract all elements (of actual members, 1359 // excluding gaps) from both <4 x i32> vectors and insert into the <12 x 1360 // i32> vector. 1361 InstructionCost ExtSubCost = 1362 thisT()->getScalarizationOverhead(SubVT, DemandedAllSubElts, 1363 /*Insert*/ false, /*Extract*/ true); 1364 Cost += ExtSubCost * Indices.size(); 1365 Cost += thisT()->getScalarizationOverhead(VT, DemandedLoadStoreElts, 1366 /*Insert*/ true, 1367 /*Extract*/ false); 1368 } 1369 1370 if (!UseMaskForCond) 1371 return Cost; 1372 1373 Type *I8Type = Type::getInt8Ty(VT->getContext()); 1374 1375 Cost += thisT()->getReplicationShuffleCost( 1376 I8Type, Factor, NumSubElts, 1377 UseMaskForGaps ? DemandedLoadStoreElts : DemandedAllResultElts, 1378 CostKind); 1379 1380 // The Gaps mask is invariant and created outside the loop, therefore the 1381 // cost of creating it is not accounted for here. However if we have both 1382 // a MaskForGaps and some other mask that guards the execution of the 1383 // memory access, we need to account for the cost of And-ing the two masks 1384 // inside the loop. 1385 if (UseMaskForGaps) { 1386 auto *MaskVT = FixedVectorType::get(I8Type, NumElts); 1387 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::And, MaskVT, 1388 CostKind); 1389 } 1390 1391 return Cost; 1392 } 1393 1394 /// Get intrinsic cost based on arguments. 1395 InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, 1396 TTI::TargetCostKind CostKind) { 1397 // Check for generically free intrinsics. 1398 if (BaseT::getIntrinsicInstrCost(ICA, CostKind) == 0) 1399 return 0; 1400 1401 // Assume that target intrinsics are cheap. 1402 Intrinsic::ID IID = ICA.getID(); 1403 if (Function::isTargetIntrinsic(IID)) 1404 return TargetTransformInfo::TCC_Basic; 1405 1406 if (ICA.isTypeBasedOnly()) 1407 return getTypeBasedIntrinsicInstrCost(ICA, CostKind); 1408 1409 Type *RetTy = ICA.getReturnType(); 1410 1411 ElementCount RetVF = 1412 (RetTy->isVectorTy() ? cast<VectorType>(RetTy)->getElementCount() 1413 : ElementCount::getFixed(1)); 1414 const IntrinsicInst *I = ICA.getInst(); 1415 const SmallVectorImpl<const Value *> &Args = ICA.getArgs(); 1416 FastMathFlags FMF = ICA.getFlags(); 1417 switch (IID) { 1418 default: 1419 break; 1420 1421 case Intrinsic::powi: 1422 if (auto *RHSC = dyn_cast<ConstantInt>(Args[1])) { 1423 bool ShouldOptForSize = I->getParent()->getParent()->hasOptSize(); 1424 if (getTLI()->isBeneficialToExpandPowI(RHSC->getSExtValue(), 1425 ShouldOptForSize)) { 1426 // The cost is modeled on the expansion performed by ExpandPowI in 1427 // SelectionDAGBuilder. 1428 APInt Exponent = RHSC->getValue().abs(); 1429 unsigned ActiveBits = Exponent.getActiveBits(); 1430 unsigned PopCount = Exponent.countPopulation(); 1431 InstructionCost Cost = (ActiveBits + PopCount - 2) * 1432 thisT()->getArithmeticInstrCost( 1433 Instruction::FMul, RetTy, CostKind); 1434 if (RHSC->getSExtValue() < 0) 1435 Cost += thisT()->getArithmeticInstrCost(Instruction::FDiv, RetTy, 1436 CostKind); 1437 return Cost; 1438 } 1439 } 1440 break; 1441 case Intrinsic::cttz: 1442 // FIXME: If necessary, this should go in target-specific overrides. 1443 if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCttz()) 1444 return TargetTransformInfo::TCC_Basic; 1445 break; 1446 1447 case Intrinsic::ctlz: 1448 // FIXME: If necessary, this should go in target-specific overrides. 1449 if (RetVF.isScalar() && getTLI()->isCheapToSpeculateCtlz()) 1450 return TargetTransformInfo::TCC_Basic; 1451 break; 1452 1453 case Intrinsic::memcpy: 1454 return thisT()->getMemcpyCost(ICA.getInst()); 1455 1456 case Intrinsic::masked_scatter: { 1457 const Value *Mask = Args[3]; 1458 bool VarMask = !isa<Constant>(Mask); 1459 Align Alignment = cast<ConstantInt>(Args[2])->getAlignValue(); 1460 return thisT()->getGatherScatterOpCost(Instruction::Store, 1461 ICA.getArgTypes()[0], Args[1], 1462 VarMask, Alignment, CostKind, I); 1463 } 1464 case Intrinsic::masked_gather: { 1465 const Value *Mask = Args[2]; 1466 bool VarMask = !isa<Constant>(Mask); 1467 Align Alignment = cast<ConstantInt>(Args[1])->getAlignValue(); 1468 return thisT()->getGatherScatterOpCost(Instruction::Load, RetTy, Args[0], 1469 VarMask, Alignment, CostKind, I); 1470 } 1471 case Intrinsic::experimental_stepvector: { 1472 if (isa<ScalableVectorType>(RetTy)) 1473 return BaseT::getIntrinsicInstrCost(ICA, CostKind); 1474 // The cost of materialising a constant integer vector. 1475 return TargetTransformInfo::TCC_Basic; 1476 } 1477 case Intrinsic::vector_extract: { 1478 // FIXME: Handle case where a scalable vector is extracted from a scalable 1479 // vector 1480 if (isa<ScalableVectorType>(RetTy)) 1481 return BaseT::getIntrinsicInstrCost(ICA, CostKind); 1482 unsigned Index = cast<ConstantInt>(Args[1])->getZExtValue(); 1483 return thisT()->getShuffleCost(TTI::SK_ExtractSubvector, 1484 cast<VectorType>(Args[0]->getType()), None, 1485 Index, cast<VectorType>(RetTy)); 1486 } 1487 case Intrinsic::vector_insert: { 1488 // FIXME: Handle case where a scalable vector is inserted into a scalable 1489 // vector 1490 if (isa<ScalableVectorType>(Args[1]->getType())) 1491 return BaseT::getIntrinsicInstrCost(ICA, CostKind); 1492 unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue(); 1493 return thisT()->getShuffleCost( 1494 TTI::SK_InsertSubvector, cast<VectorType>(Args[0]->getType()), None, 1495 Index, cast<VectorType>(Args[1]->getType())); 1496 } 1497 case Intrinsic::experimental_vector_reverse: { 1498 return thisT()->getShuffleCost(TTI::SK_Reverse, 1499 cast<VectorType>(Args[0]->getType()), None, 1500 0, cast<VectorType>(RetTy)); 1501 } 1502 case Intrinsic::experimental_vector_splice: { 1503 unsigned Index = cast<ConstantInt>(Args[2])->getZExtValue(); 1504 return thisT()->getShuffleCost(TTI::SK_Splice, 1505 cast<VectorType>(Args[0]->getType()), None, 1506 Index, cast<VectorType>(RetTy)); 1507 } 1508 case Intrinsic::vector_reduce_add: 1509 case Intrinsic::vector_reduce_mul: 1510 case Intrinsic::vector_reduce_and: 1511 case Intrinsic::vector_reduce_or: 1512 case Intrinsic::vector_reduce_xor: 1513 case Intrinsic::vector_reduce_smax: 1514 case Intrinsic::vector_reduce_smin: 1515 case Intrinsic::vector_reduce_fmax: 1516 case Intrinsic::vector_reduce_fmin: 1517 case Intrinsic::vector_reduce_umax: 1518 case Intrinsic::vector_reduce_umin: { 1519 IntrinsicCostAttributes Attrs(IID, RetTy, Args[0]->getType(), FMF, I, 1); 1520 return getTypeBasedIntrinsicInstrCost(Attrs, CostKind); 1521 } 1522 case Intrinsic::vector_reduce_fadd: 1523 case Intrinsic::vector_reduce_fmul: { 1524 IntrinsicCostAttributes Attrs( 1525 IID, RetTy, {Args[0]->getType(), Args[1]->getType()}, FMF, I, 1); 1526 return getTypeBasedIntrinsicInstrCost(Attrs, CostKind); 1527 } 1528 case Intrinsic::fshl: 1529 case Intrinsic::fshr: { 1530 const Value *X = Args[0]; 1531 const Value *Y = Args[1]; 1532 const Value *Z = Args[2]; 1533 TTI::OperandValueProperties OpPropsX, OpPropsY, OpPropsZ, OpPropsBW; 1534 TTI::OperandValueKind OpKindX = TTI::getOperandInfo(X, OpPropsX); 1535 TTI::OperandValueKind OpKindY = TTI::getOperandInfo(Y, OpPropsY); 1536 TTI::OperandValueKind OpKindZ = TTI::getOperandInfo(Z, OpPropsZ); 1537 TTI::OperandValueKind OpKindBW = TTI::OK_UniformConstantValue; 1538 OpPropsBW = isPowerOf2_32(RetTy->getScalarSizeInBits()) ? TTI::OP_PowerOf2 1539 : TTI::OP_None; 1540 // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW))) 1541 // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW)) 1542 InstructionCost Cost = 0; 1543 Cost += 1544 thisT()->getArithmeticInstrCost(BinaryOperator::Or, RetTy, CostKind); 1545 Cost += 1546 thisT()->getArithmeticInstrCost(BinaryOperator::Sub, RetTy, CostKind); 1547 Cost += thisT()->getArithmeticInstrCost( 1548 BinaryOperator::Shl, RetTy, CostKind, OpKindX, OpKindZ, OpPropsX); 1549 Cost += thisT()->getArithmeticInstrCost( 1550 BinaryOperator::LShr, RetTy, CostKind, OpKindY, OpKindZ, OpPropsY); 1551 // Non-constant shift amounts requires a modulo. 1552 if (OpKindZ != TTI::OK_UniformConstantValue && 1553 OpKindZ != TTI::OK_NonUniformConstantValue) 1554 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::URem, RetTy, 1555 CostKind, OpKindZ, OpKindBW, 1556 OpPropsZ, OpPropsBW); 1557 // For non-rotates (X != Y) we must add shift-by-zero handling costs. 1558 if (X != Y) { 1559 Type *CondTy = RetTy->getWithNewBitWidth(1); 1560 Cost += 1561 thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, 1562 CmpInst::ICMP_EQ, CostKind); 1563 Cost += 1564 thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, 1565 CmpInst::ICMP_EQ, CostKind); 1566 } 1567 return Cost; 1568 } 1569 case Intrinsic::get_active_lane_mask: { 1570 EVT ResVT = getTLI()->getValueType(DL, RetTy, true); 1571 EVT ArgType = getTLI()->getValueType(DL, ICA.getArgTypes()[0], true); 1572 1573 // If we're not expanding the intrinsic then we assume this is cheap 1574 // to implement. 1575 if (!getTLI()->shouldExpandGetActiveLaneMask(ResVT, ArgType)) { 1576 std::pair<InstructionCost, MVT> LT = 1577 getTLI()->getTypeLegalizationCost(DL, RetTy); 1578 return LT.first; 1579 } 1580 1581 // Create the expanded types that will be used to calculate the uadd_sat 1582 // operation. 1583 Type *ExpRetTy = VectorType::get( 1584 ICA.getArgTypes()[0], cast<VectorType>(RetTy)->getElementCount()); 1585 IntrinsicCostAttributes Attrs(Intrinsic::uadd_sat, ExpRetTy, {}, FMF); 1586 InstructionCost Cost = 1587 thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind); 1588 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, ExpRetTy, RetTy, 1589 CmpInst::ICMP_ULT, CostKind); 1590 return Cost; 1591 } 1592 } 1593 1594 // Assume that we need to scalarize this intrinsic. 1595 // Compute the scalarization overhead based on Args for a vector 1596 // intrinsic. 1597 InstructionCost ScalarizationCost = InstructionCost::getInvalid(); 1598 if (RetVF.isVector() && !RetVF.isScalable()) { 1599 ScalarizationCost = 0; 1600 if (!RetTy->isVoidTy()) 1601 ScalarizationCost += 1602 getScalarizationOverhead(cast<VectorType>(RetTy), true, false); 1603 ScalarizationCost += 1604 getOperandsScalarizationOverhead(Args, ICA.getArgTypes()); 1605 } 1606 1607 IntrinsicCostAttributes Attrs(IID, RetTy, ICA.getArgTypes(), FMF, I, 1608 ScalarizationCost); 1609 return thisT()->getTypeBasedIntrinsicInstrCost(Attrs, CostKind); 1610 } 1611 1612 /// Get intrinsic cost based on argument types. 1613 /// If ScalarizationCostPassed is std::numeric_limits<unsigned>::max(), the 1614 /// cost of scalarizing the arguments and the return value will be computed 1615 /// based on types. 1616 InstructionCost 1617 getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, 1618 TTI::TargetCostKind CostKind) { 1619 Intrinsic::ID IID = ICA.getID(); 1620 Type *RetTy = ICA.getReturnType(); 1621 const SmallVectorImpl<Type *> &Tys = ICA.getArgTypes(); 1622 FastMathFlags FMF = ICA.getFlags(); 1623 InstructionCost ScalarizationCostPassed = ICA.getScalarizationCost(); 1624 bool SkipScalarizationCost = ICA.skipScalarizationCost(); 1625 1626 VectorType *VecOpTy = nullptr; 1627 if (!Tys.empty()) { 1628 // The vector reduction operand is operand 0 except for fadd/fmul. 1629 // Their operand 0 is a scalar start value, so the vector op is operand 1. 1630 unsigned VecTyIndex = 0; 1631 if (IID == Intrinsic::vector_reduce_fadd || 1632 IID == Intrinsic::vector_reduce_fmul) 1633 VecTyIndex = 1; 1634 assert(Tys.size() > VecTyIndex && "Unexpected IntrinsicCostAttributes"); 1635 VecOpTy = dyn_cast<VectorType>(Tys[VecTyIndex]); 1636 } 1637 1638 // Library call cost - other than size, make it expensive. 1639 unsigned SingleCallCost = CostKind == TTI::TCK_CodeSize ? 1 : 10; 1640 unsigned ISD = 0; 1641 switch (IID) { 1642 default: { 1643 // Scalable vectors cannot be scalarized, so return Invalid. 1644 if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) { 1645 return isa<ScalableVectorType>(Ty); 1646 })) 1647 return InstructionCost::getInvalid(); 1648 1649 // Assume that we need to scalarize this intrinsic. 1650 InstructionCost ScalarizationCost = 1651 SkipScalarizationCost ? ScalarizationCostPassed : 0; 1652 unsigned ScalarCalls = 1; 1653 Type *ScalarRetTy = RetTy; 1654 if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) { 1655 if (!SkipScalarizationCost) 1656 ScalarizationCost = getScalarizationOverhead(RetVTy, true, false); 1657 ScalarCalls = std::max(ScalarCalls, 1658 cast<FixedVectorType>(RetVTy)->getNumElements()); 1659 ScalarRetTy = RetTy->getScalarType(); 1660 } 1661 SmallVector<Type *, 4> ScalarTys; 1662 for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) { 1663 Type *Ty = Tys[i]; 1664 if (auto *VTy = dyn_cast<VectorType>(Ty)) { 1665 if (!SkipScalarizationCost) 1666 ScalarizationCost += getScalarizationOverhead(VTy, false, true); 1667 ScalarCalls = std::max(ScalarCalls, 1668 cast<FixedVectorType>(VTy)->getNumElements()); 1669 Ty = Ty->getScalarType(); 1670 } 1671 ScalarTys.push_back(Ty); 1672 } 1673 if (ScalarCalls == 1) 1674 return 1; // Return cost of a scalar intrinsic. Assume it to be cheap. 1675 1676 IntrinsicCostAttributes ScalarAttrs(IID, ScalarRetTy, ScalarTys, FMF); 1677 InstructionCost ScalarCost = 1678 thisT()->getIntrinsicInstrCost(ScalarAttrs, CostKind); 1679 1680 return ScalarCalls * ScalarCost + ScalarizationCost; 1681 } 1682 // Look for intrinsics that can be lowered directly or turned into a scalar 1683 // intrinsic call. 1684 case Intrinsic::sqrt: 1685 ISD = ISD::FSQRT; 1686 break; 1687 case Intrinsic::sin: 1688 ISD = ISD::FSIN; 1689 break; 1690 case Intrinsic::cos: 1691 ISD = ISD::FCOS; 1692 break; 1693 case Intrinsic::exp: 1694 ISD = ISD::FEXP; 1695 break; 1696 case Intrinsic::exp2: 1697 ISD = ISD::FEXP2; 1698 break; 1699 case Intrinsic::log: 1700 ISD = ISD::FLOG; 1701 break; 1702 case Intrinsic::log10: 1703 ISD = ISD::FLOG10; 1704 break; 1705 case Intrinsic::log2: 1706 ISD = ISD::FLOG2; 1707 break; 1708 case Intrinsic::fabs: 1709 ISD = ISD::FABS; 1710 break; 1711 case Intrinsic::canonicalize: 1712 ISD = ISD::FCANONICALIZE; 1713 break; 1714 case Intrinsic::minnum: 1715 ISD = ISD::FMINNUM; 1716 break; 1717 case Intrinsic::maxnum: 1718 ISD = ISD::FMAXNUM; 1719 break; 1720 case Intrinsic::minimum: 1721 ISD = ISD::FMINIMUM; 1722 break; 1723 case Intrinsic::maximum: 1724 ISD = ISD::FMAXIMUM; 1725 break; 1726 case Intrinsic::copysign: 1727 ISD = ISD::FCOPYSIGN; 1728 break; 1729 case Intrinsic::floor: 1730 ISD = ISD::FFLOOR; 1731 break; 1732 case Intrinsic::ceil: 1733 ISD = ISD::FCEIL; 1734 break; 1735 case Intrinsic::trunc: 1736 ISD = ISD::FTRUNC; 1737 break; 1738 case Intrinsic::nearbyint: 1739 ISD = ISD::FNEARBYINT; 1740 break; 1741 case Intrinsic::rint: 1742 ISD = ISD::FRINT; 1743 break; 1744 case Intrinsic::round: 1745 ISD = ISD::FROUND; 1746 break; 1747 case Intrinsic::roundeven: 1748 ISD = ISD::FROUNDEVEN; 1749 break; 1750 case Intrinsic::pow: 1751 ISD = ISD::FPOW; 1752 break; 1753 case Intrinsic::fma: 1754 ISD = ISD::FMA; 1755 break; 1756 case Intrinsic::fmuladd: 1757 ISD = ISD::FMA; 1758 break; 1759 case Intrinsic::experimental_constrained_fmuladd: 1760 ISD = ISD::STRICT_FMA; 1761 break; 1762 // FIXME: We should return 0 whenever getIntrinsicCost == TCC_Free. 1763 case Intrinsic::lifetime_start: 1764 case Intrinsic::lifetime_end: 1765 case Intrinsic::sideeffect: 1766 case Intrinsic::pseudoprobe: 1767 case Intrinsic::arithmetic_fence: 1768 return 0; 1769 case Intrinsic::masked_store: { 1770 Type *Ty = Tys[0]; 1771 Align TyAlign = thisT()->DL.getABITypeAlign(Ty); 1772 return thisT()->getMaskedMemoryOpCost(Instruction::Store, Ty, TyAlign, 0, 1773 CostKind); 1774 } 1775 case Intrinsic::masked_load: { 1776 Type *Ty = RetTy; 1777 Align TyAlign = thisT()->DL.getABITypeAlign(Ty); 1778 return thisT()->getMaskedMemoryOpCost(Instruction::Load, Ty, TyAlign, 0, 1779 CostKind); 1780 } 1781 case Intrinsic::vector_reduce_add: 1782 return thisT()->getArithmeticReductionCost(Instruction::Add, VecOpTy, 1783 None, CostKind); 1784 case Intrinsic::vector_reduce_mul: 1785 return thisT()->getArithmeticReductionCost(Instruction::Mul, VecOpTy, 1786 None, CostKind); 1787 case Intrinsic::vector_reduce_and: 1788 return thisT()->getArithmeticReductionCost(Instruction::And, VecOpTy, 1789 None, CostKind); 1790 case Intrinsic::vector_reduce_or: 1791 return thisT()->getArithmeticReductionCost(Instruction::Or, VecOpTy, None, 1792 CostKind); 1793 case Intrinsic::vector_reduce_xor: 1794 return thisT()->getArithmeticReductionCost(Instruction::Xor, VecOpTy, 1795 None, CostKind); 1796 case Intrinsic::vector_reduce_fadd: 1797 return thisT()->getArithmeticReductionCost(Instruction::FAdd, VecOpTy, 1798 FMF, CostKind); 1799 case Intrinsic::vector_reduce_fmul: 1800 return thisT()->getArithmeticReductionCost(Instruction::FMul, VecOpTy, 1801 FMF, CostKind); 1802 case Intrinsic::vector_reduce_smax: 1803 case Intrinsic::vector_reduce_smin: 1804 case Intrinsic::vector_reduce_fmax: 1805 case Intrinsic::vector_reduce_fmin: 1806 return thisT()->getMinMaxReductionCost( 1807 VecOpTy, cast<VectorType>(CmpInst::makeCmpResultType(VecOpTy)), 1808 /*IsUnsigned=*/false, CostKind); 1809 case Intrinsic::vector_reduce_umax: 1810 case Intrinsic::vector_reduce_umin: 1811 return thisT()->getMinMaxReductionCost( 1812 VecOpTy, cast<VectorType>(CmpInst::makeCmpResultType(VecOpTy)), 1813 /*IsUnsigned=*/true, CostKind); 1814 case Intrinsic::abs: { 1815 // abs(X) = select(icmp(X,0),X,sub(0,X)) 1816 Type *CondTy = RetTy->getWithNewBitWidth(1); 1817 CmpInst::Predicate Pred = CmpInst::ICMP_SGT; 1818 InstructionCost Cost = 0; 1819 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, 1820 Pred, CostKind); 1821 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, 1822 Pred, CostKind); 1823 // TODO: Should we add an OperandValueProperties::OP_Zero property? 1824 Cost += thisT()->getArithmeticInstrCost( 1825 BinaryOperator::Sub, RetTy, CostKind, TTI::OK_UniformConstantValue); 1826 return Cost; 1827 } 1828 case Intrinsic::smax: 1829 case Intrinsic::smin: 1830 case Intrinsic::umax: 1831 case Intrinsic::umin: { 1832 // minmax(X,Y) = select(icmp(X,Y),X,Y) 1833 Type *CondTy = RetTy->getWithNewBitWidth(1); 1834 bool IsUnsigned = IID == Intrinsic::umax || IID == Intrinsic::umin; 1835 CmpInst::Predicate Pred = 1836 IsUnsigned ? CmpInst::ICMP_UGT : CmpInst::ICMP_SGT; 1837 InstructionCost Cost = 0; 1838 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, 1839 Pred, CostKind); 1840 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, 1841 Pred, CostKind); 1842 return Cost; 1843 } 1844 case Intrinsic::sadd_sat: 1845 case Intrinsic::ssub_sat: { 1846 Type *CondTy = RetTy->getWithNewBitWidth(1); 1847 1848 Type *OpTy = StructType::create({RetTy, CondTy}); 1849 Intrinsic::ID OverflowOp = IID == Intrinsic::sadd_sat 1850 ? Intrinsic::sadd_with_overflow 1851 : Intrinsic::ssub_with_overflow; 1852 CmpInst::Predicate Pred = CmpInst::ICMP_SGT; 1853 1854 // SatMax -> Overflow && SumDiff < 0 1855 // SatMin -> Overflow && SumDiff >= 0 1856 InstructionCost Cost = 0; 1857 IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF, 1858 nullptr, ScalarizationCostPassed); 1859 Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind); 1860 Cost += thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, 1861 Pred, CostKind); 1862 Cost += 2 * thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, 1863 CondTy, Pred, CostKind); 1864 return Cost; 1865 } 1866 case Intrinsic::uadd_sat: 1867 case Intrinsic::usub_sat: { 1868 Type *CondTy = RetTy->getWithNewBitWidth(1); 1869 1870 Type *OpTy = StructType::create({RetTy, CondTy}); 1871 Intrinsic::ID OverflowOp = IID == Intrinsic::uadd_sat 1872 ? Intrinsic::uadd_with_overflow 1873 : Intrinsic::usub_with_overflow; 1874 1875 InstructionCost Cost = 0; 1876 IntrinsicCostAttributes Attrs(OverflowOp, OpTy, {RetTy, RetTy}, FMF, 1877 nullptr, ScalarizationCostPassed); 1878 Cost += thisT()->getIntrinsicInstrCost(Attrs, CostKind); 1879 Cost += 1880 thisT()->getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, 1881 CmpInst::BAD_ICMP_PREDICATE, CostKind); 1882 return Cost; 1883 } 1884 case Intrinsic::smul_fix: 1885 case Intrinsic::umul_fix: { 1886 unsigned ExtSize = RetTy->getScalarSizeInBits() * 2; 1887 Type *ExtTy = RetTy->getWithNewBitWidth(ExtSize); 1888 1889 unsigned ExtOp = 1890 IID == Intrinsic::smul_fix ? Instruction::SExt : Instruction::ZExt; 1891 TTI::CastContextHint CCH = TTI::CastContextHint::None; 1892 1893 InstructionCost Cost = 0; 1894 Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, RetTy, CCH, CostKind); 1895 Cost += 1896 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind); 1897 Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, RetTy, ExtTy, 1898 CCH, CostKind); 1899 Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, RetTy, 1900 CostKind, TTI::OK_AnyValue, 1901 TTI::OK_UniformConstantValue); 1902 Cost += thisT()->getArithmeticInstrCost(Instruction::Shl, RetTy, CostKind, 1903 TTI::OK_AnyValue, 1904 TTI::OK_UniformConstantValue); 1905 Cost += thisT()->getArithmeticInstrCost(Instruction::Or, RetTy, CostKind); 1906 return Cost; 1907 } 1908 case Intrinsic::sadd_with_overflow: 1909 case Intrinsic::ssub_with_overflow: { 1910 Type *SumTy = RetTy->getContainedType(0); 1911 Type *OverflowTy = RetTy->getContainedType(1); 1912 unsigned Opcode = IID == Intrinsic::sadd_with_overflow 1913 ? BinaryOperator::Add 1914 : BinaryOperator::Sub; 1915 1916 // Add: 1917 // Overflow -> (Result < LHS) ^ (RHS < 0) 1918 // Sub: 1919 // Overflow -> (Result < LHS) ^ (RHS > 0) 1920 InstructionCost Cost = 0; 1921 Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind); 1922 Cost += 2 * thisT()->getCmpSelInstrCost( 1923 Instruction::ICmp, SumTy, OverflowTy, 1924 CmpInst::ICMP_SGT, CostKind); 1925 Cost += thisT()->getArithmeticInstrCost(BinaryOperator::Xor, OverflowTy, 1926 CostKind); 1927 return Cost; 1928 } 1929 case Intrinsic::uadd_with_overflow: 1930 case Intrinsic::usub_with_overflow: { 1931 Type *SumTy = RetTy->getContainedType(0); 1932 Type *OverflowTy = RetTy->getContainedType(1); 1933 unsigned Opcode = IID == Intrinsic::uadd_with_overflow 1934 ? BinaryOperator::Add 1935 : BinaryOperator::Sub; 1936 CmpInst::Predicate Pred = IID == Intrinsic::uadd_with_overflow 1937 ? CmpInst::ICMP_ULT 1938 : CmpInst::ICMP_UGT; 1939 1940 InstructionCost Cost = 0; 1941 Cost += thisT()->getArithmeticInstrCost(Opcode, SumTy, CostKind); 1942 Cost += 1943 thisT()->getCmpSelInstrCost(BinaryOperator::ICmp, SumTy, OverflowTy, 1944 Pred, CostKind); 1945 return Cost; 1946 } 1947 case Intrinsic::smul_with_overflow: 1948 case Intrinsic::umul_with_overflow: { 1949 Type *MulTy = RetTy->getContainedType(0); 1950 Type *OverflowTy = RetTy->getContainedType(1); 1951 unsigned ExtSize = MulTy->getScalarSizeInBits() * 2; 1952 Type *ExtTy = MulTy->getWithNewBitWidth(ExtSize); 1953 bool IsSigned = IID == Intrinsic::smul_with_overflow; 1954 1955 unsigned ExtOp = IsSigned ? Instruction::SExt : Instruction::ZExt; 1956 TTI::CastContextHint CCH = TTI::CastContextHint::None; 1957 1958 InstructionCost Cost = 0; 1959 Cost += 2 * thisT()->getCastInstrCost(ExtOp, ExtTy, MulTy, CCH, CostKind); 1960 Cost += 1961 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind); 1962 Cost += 2 * thisT()->getCastInstrCost(Instruction::Trunc, MulTy, ExtTy, 1963 CCH, CostKind); 1964 Cost += thisT()->getArithmeticInstrCost(Instruction::LShr, ExtTy, 1965 CostKind, TTI::OK_AnyValue, 1966 TTI::OK_UniformConstantValue); 1967 1968 if (IsSigned) 1969 Cost += thisT()->getArithmeticInstrCost(Instruction::AShr, MulTy, 1970 CostKind, TTI::OK_AnyValue, 1971 TTI::OK_UniformConstantValue); 1972 1973 Cost += thisT()->getCmpSelInstrCost( 1974 BinaryOperator::ICmp, MulTy, OverflowTy, CmpInst::ICMP_NE, CostKind); 1975 return Cost; 1976 } 1977 case Intrinsic::fptosi_sat: 1978 case Intrinsic::fptoui_sat: { 1979 if (Tys.empty()) 1980 break; 1981 Type *FromTy = Tys[0]; 1982 bool IsSigned = IID == Intrinsic::fptosi_sat; 1983 1984 InstructionCost Cost = 0; 1985 IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FromTy, 1986 {FromTy, FromTy}); 1987 Cost += thisT()->getIntrinsicInstrCost(Attrs1, CostKind); 1988 IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FromTy, 1989 {FromTy, FromTy}); 1990 Cost += thisT()->getIntrinsicInstrCost(Attrs2, CostKind); 1991 Cost += thisT()->getCastInstrCost( 1992 IsSigned ? Instruction::FPToSI : Instruction::FPToUI, RetTy, FromTy, 1993 TTI::CastContextHint::None, CostKind); 1994 if (IsSigned) { 1995 Type *CondTy = RetTy->getWithNewBitWidth(1); 1996 Cost += thisT()->getCmpSelInstrCost( 1997 BinaryOperator::FCmp, FromTy, CondTy, CmpInst::FCMP_UNO, CostKind); 1998 Cost += thisT()->getCmpSelInstrCost( 1999 BinaryOperator::Select, RetTy, CondTy, CmpInst::FCMP_UNO, CostKind); 2000 } 2001 return Cost; 2002 } 2003 case Intrinsic::ctpop: 2004 ISD = ISD::CTPOP; 2005 // In case of legalization use TCC_Expensive. This is cheaper than a 2006 // library call but still not a cheap instruction. 2007 SingleCallCost = TargetTransformInfo::TCC_Expensive; 2008 break; 2009 case Intrinsic::ctlz: 2010 ISD = ISD::CTLZ; 2011 break; 2012 case Intrinsic::cttz: 2013 ISD = ISD::CTTZ; 2014 break; 2015 case Intrinsic::bswap: 2016 ISD = ISD::BSWAP; 2017 break; 2018 case Intrinsic::bitreverse: 2019 ISD = ISD::BITREVERSE; 2020 break; 2021 } 2022 2023 const TargetLoweringBase *TLI = getTLI(); 2024 std::pair<InstructionCost, MVT> LT = 2025 TLI->getTypeLegalizationCost(DL, RetTy); 2026 2027 if (TLI->isOperationLegalOrPromote(ISD, LT.second)) { 2028 if (IID == Intrinsic::fabs && LT.second.isFloatingPoint() && 2029 TLI->isFAbsFree(LT.second)) { 2030 return 0; 2031 } 2032 2033 // The operation is legal. Assume it costs 1. 2034 // If the type is split to multiple registers, assume that there is some 2035 // overhead to this. 2036 // TODO: Once we have extract/insert subvector cost we need to use them. 2037 if (LT.first > 1) 2038 return (LT.first * 2); 2039 else 2040 return (LT.first * 1); 2041 } else if (!TLI->isOperationExpand(ISD, LT.second)) { 2042 // If the operation is custom lowered then assume 2043 // that the code is twice as expensive. 2044 return (LT.first * 2); 2045 } 2046 2047 // If we can't lower fmuladd into an FMA estimate the cost as a floating 2048 // point mul followed by an add. 2049 if (IID == Intrinsic::fmuladd) 2050 return thisT()->getArithmeticInstrCost(BinaryOperator::FMul, RetTy, 2051 CostKind) + 2052 thisT()->getArithmeticInstrCost(BinaryOperator::FAdd, RetTy, 2053 CostKind); 2054 if (IID == Intrinsic::experimental_constrained_fmuladd) { 2055 IntrinsicCostAttributes FMulAttrs( 2056 Intrinsic::experimental_constrained_fmul, RetTy, Tys); 2057 IntrinsicCostAttributes FAddAttrs( 2058 Intrinsic::experimental_constrained_fadd, RetTy, Tys); 2059 return thisT()->getIntrinsicInstrCost(FMulAttrs, CostKind) + 2060 thisT()->getIntrinsicInstrCost(FAddAttrs, CostKind); 2061 } 2062 2063 // Else, assume that we need to scalarize this intrinsic. For math builtins 2064 // this will emit a costly libcall, adding call overhead and spills. Make it 2065 // very expensive. 2066 if (auto *RetVTy = dyn_cast<VectorType>(RetTy)) { 2067 // Scalable vectors cannot be scalarized, so return Invalid. 2068 if (isa<ScalableVectorType>(RetTy) || any_of(Tys, [](const Type *Ty) { 2069 return isa<ScalableVectorType>(Ty); 2070 })) 2071 return InstructionCost::getInvalid(); 2072 2073 InstructionCost ScalarizationCost = 2074 SkipScalarizationCost ? ScalarizationCostPassed 2075 : getScalarizationOverhead(RetVTy, true, false); 2076 2077 unsigned ScalarCalls = cast<FixedVectorType>(RetVTy)->getNumElements(); 2078 SmallVector<Type *, 4> ScalarTys; 2079 for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) { 2080 Type *Ty = Tys[i]; 2081 if (Ty->isVectorTy()) 2082 Ty = Ty->getScalarType(); 2083 ScalarTys.push_back(Ty); 2084 } 2085 IntrinsicCostAttributes Attrs(IID, RetTy->getScalarType(), ScalarTys, FMF); 2086 InstructionCost ScalarCost = 2087 thisT()->getIntrinsicInstrCost(Attrs, CostKind); 2088 for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) { 2089 if (auto *VTy = dyn_cast<VectorType>(Tys[i])) { 2090 if (!ICA.skipScalarizationCost()) 2091 ScalarizationCost += getScalarizationOverhead(VTy, false, true); 2092 ScalarCalls = std::max(ScalarCalls, 2093 cast<FixedVectorType>(VTy)->getNumElements()); 2094 } 2095 } 2096 return ScalarCalls * ScalarCost + ScalarizationCost; 2097 } 2098 2099 // This is going to be turned into a library call, make it expensive. 2100 return SingleCallCost; 2101 } 2102 2103 /// Compute a cost of the given call instruction. 2104 /// 2105 /// Compute the cost of calling function F with return type RetTy and 2106 /// argument types Tys. F might be nullptr, in this case the cost of an 2107 /// arbitrary call with the specified signature will be returned. 2108 /// This is used, for instance, when we estimate call of a vector 2109 /// counterpart of the given function. 2110 /// \param F Called function, might be nullptr. 2111 /// \param RetTy Return value types. 2112 /// \param Tys Argument types. 2113 /// \returns The cost of Call instruction. 2114 InstructionCost getCallInstrCost(Function *F, Type *RetTy, 2115 ArrayRef<Type *> Tys, 2116 TTI::TargetCostKind CostKind) { 2117 return 10; 2118 } 2119 2120 unsigned getNumberOfParts(Type *Tp) { 2121 std::pair<InstructionCost, MVT> LT = 2122 getTLI()->getTypeLegalizationCost(DL, Tp); 2123 return LT.first.isValid() ? *LT.first.getValue() : 0; 2124 } 2125 2126 InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *, 2127 const SCEV *) { 2128 return 0; 2129 } 2130 2131 /// Try to calculate arithmetic and shuffle op costs for reduction intrinsics. 2132 /// We're assuming that reduction operation are performing the following way: 2133 /// 2134 /// %val1 = shufflevector<n x t> %val, <n x t> %undef, 2135 /// <n x i32> <i32 n/2, i32 n/2 + 1, ..., i32 n, i32 undef, ..., i32 undef> 2136 /// \----------------v-------------/ \----------v------------/ 2137 /// n/2 elements n/2 elements 2138 /// %red1 = op <n x t> %val, <n x t> val1 2139 /// After this operation we have a vector %red1 where only the first n/2 2140 /// elements are meaningful, the second n/2 elements are undefined and can be 2141 /// dropped. All other operations are actually working with the vector of 2142 /// length n/2, not n, though the real vector length is still n. 2143 /// %val2 = shufflevector<n x t> %red1, <n x t> %undef, 2144 /// <n x i32> <i32 n/4, i32 n/4 + 1, ..., i32 n/2, i32 undef, ..., i32 undef> 2145 /// \----------------v-------------/ \----------v------------/ 2146 /// n/4 elements 3*n/4 elements 2147 /// %red2 = op <n x t> %red1, <n x t> val2 - working with the vector of 2148 /// length n/2, the resulting vector has length n/4 etc. 2149 /// 2150 /// The cost model should take into account that the actual length of the 2151 /// vector is reduced on each iteration. 2152 InstructionCost getTreeReductionCost(unsigned Opcode, VectorType *Ty, 2153 TTI::TargetCostKind CostKind) { 2154 // Targets must implement a default value for the scalable case, since 2155 // we don't know how many lanes the vector has. 2156 if (isa<ScalableVectorType>(Ty)) 2157 return InstructionCost::getInvalid(); 2158 2159 Type *ScalarTy = Ty->getElementType(); 2160 unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements(); 2161 if ((Opcode == Instruction::Or || Opcode == Instruction::And) && 2162 ScalarTy == IntegerType::getInt1Ty(Ty->getContext()) && 2163 NumVecElts >= 2) { 2164 // Or reduction for i1 is represented as: 2165 // %val = bitcast <ReduxWidth x i1> to iReduxWidth 2166 // %res = cmp ne iReduxWidth %val, 0 2167 // And reduction for i1 is represented as: 2168 // %val = bitcast <ReduxWidth x i1> to iReduxWidth 2169 // %res = cmp eq iReduxWidth %val, 11111 2170 Type *ValTy = IntegerType::get(Ty->getContext(), NumVecElts); 2171 return thisT()->getCastInstrCost(Instruction::BitCast, ValTy, Ty, 2172 TTI::CastContextHint::None, CostKind) + 2173 thisT()->getCmpSelInstrCost(Instruction::ICmp, ValTy, 2174 CmpInst::makeCmpResultType(ValTy), 2175 CmpInst::BAD_ICMP_PREDICATE, CostKind); 2176 } 2177 unsigned NumReduxLevels = Log2_32(NumVecElts); 2178 InstructionCost ArithCost = 0; 2179 InstructionCost ShuffleCost = 0; 2180 std::pair<InstructionCost, MVT> LT = 2181 thisT()->getTLI()->getTypeLegalizationCost(DL, Ty); 2182 unsigned LongVectorCount = 0; 2183 unsigned MVTLen = 2184 LT.second.isVector() ? LT.second.getVectorNumElements() : 1; 2185 while (NumVecElts > MVTLen) { 2186 NumVecElts /= 2; 2187 VectorType *SubTy = FixedVectorType::get(ScalarTy, NumVecElts); 2188 ShuffleCost += thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, None, 2189 NumVecElts, SubTy); 2190 ArithCost += thisT()->getArithmeticInstrCost(Opcode, SubTy, CostKind); 2191 Ty = SubTy; 2192 ++LongVectorCount; 2193 } 2194 2195 NumReduxLevels -= LongVectorCount; 2196 2197 // The minimal length of the vector is limited by the real length of vector 2198 // operations performed on the current platform. That's why several final 2199 // reduction operations are performed on the vectors with the same 2200 // architecture-dependent length. 2201 2202 // By default reductions need one shuffle per reduction level. 2203 ShuffleCost += NumReduxLevels * thisT()->getShuffleCost( 2204 TTI::SK_PermuteSingleSrc, Ty, None, 0, Ty); 2205 ArithCost += 2206 NumReduxLevels * thisT()->getArithmeticInstrCost(Opcode, Ty, CostKind); 2207 return ShuffleCost + ArithCost + 2208 thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 0); 2209 } 2210 2211 /// Try to calculate the cost of performing strict (in-order) reductions, 2212 /// which involves doing a sequence of floating point additions in lane 2213 /// order, starting with an initial value. For example, consider a scalar 2214 /// initial value 'InitVal' of type float and a vector of type <4 x float>: 2215 /// 2216 /// Vector = <float %v0, float %v1, float %v2, float %v3> 2217 /// 2218 /// %add1 = %InitVal + %v0 2219 /// %add2 = %add1 + %v1 2220 /// %add3 = %add2 + %v2 2221 /// %add4 = %add3 + %v3 2222 /// 2223 /// As a simple estimate we can say the cost of such a reduction is 4 times 2224 /// the cost of a scalar FP addition. We can only estimate the costs for 2225 /// fixed-width vectors here because for scalable vectors we do not know the 2226 /// runtime number of operations. 2227 InstructionCost getOrderedReductionCost(unsigned Opcode, VectorType *Ty, 2228 TTI::TargetCostKind CostKind) { 2229 // Targets must implement a default value for the scalable case, since 2230 // we don't know how many lanes the vector has. 2231 if (isa<ScalableVectorType>(Ty)) 2232 return InstructionCost::getInvalid(); 2233 2234 auto *VTy = cast<FixedVectorType>(Ty); 2235 InstructionCost ExtractCost = 2236 getScalarizationOverhead(VTy, /*Insert=*/false, /*Extract=*/true); 2237 InstructionCost ArithCost = thisT()->getArithmeticInstrCost( 2238 Opcode, VTy->getElementType(), CostKind); 2239 ArithCost *= VTy->getNumElements(); 2240 2241 return ExtractCost + ArithCost; 2242 } 2243 2244 InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, 2245 Optional<FastMathFlags> FMF, 2246 TTI::TargetCostKind CostKind) { 2247 if (TTI::requiresOrderedReduction(FMF)) 2248 return getOrderedReductionCost(Opcode, Ty, CostKind); 2249 return getTreeReductionCost(Opcode, Ty, CostKind); 2250 } 2251 2252 /// Try to calculate op costs for min/max reduction operations. 2253 /// \param CondTy Conditional type for the Select instruction. 2254 InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, 2255 bool IsUnsigned, 2256 TTI::TargetCostKind CostKind) { 2257 // Targets must implement a default value for the scalable case, since 2258 // we don't know how many lanes the vector has. 2259 if (isa<ScalableVectorType>(Ty)) 2260 return InstructionCost::getInvalid(); 2261 2262 Type *ScalarTy = Ty->getElementType(); 2263 Type *ScalarCondTy = CondTy->getElementType(); 2264 unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements(); 2265 unsigned NumReduxLevels = Log2_32(NumVecElts); 2266 unsigned CmpOpcode; 2267 if (Ty->isFPOrFPVectorTy()) { 2268 CmpOpcode = Instruction::FCmp; 2269 } else { 2270 assert(Ty->isIntOrIntVectorTy() && 2271 "expecting floating point or integer type for min/max reduction"); 2272 CmpOpcode = Instruction::ICmp; 2273 } 2274 InstructionCost MinMaxCost = 0; 2275 InstructionCost ShuffleCost = 0; 2276 std::pair<InstructionCost, MVT> LT = 2277 thisT()->getTLI()->getTypeLegalizationCost(DL, Ty); 2278 unsigned LongVectorCount = 0; 2279 unsigned MVTLen = 2280 LT.second.isVector() ? LT.second.getVectorNumElements() : 1; 2281 while (NumVecElts > MVTLen) { 2282 NumVecElts /= 2; 2283 auto *SubTy = FixedVectorType::get(ScalarTy, NumVecElts); 2284 CondTy = FixedVectorType::get(ScalarCondTy, NumVecElts); 2285 2286 ShuffleCost += thisT()->getShuffleCost(TTI::SK_ExtractSubvector, Ty, None, 2287 NumVecElts, SubTy); 2288 MinMaxCost += 2289 thisT()->getCmpSelInstrCost(CmpOpcode, SubTy, CondTy, 2290 CmpInst::BAD_ICMP_PREDICATE, CostKind) + 2291 thisT()->getCmpSelInstrCost(Instruction::Select, SubTy, CondTy, 2292 CmpInst::BAD_ICMP_PREDICATE, CostKind); 2293 Ty = SubTy; 2294 ++LongVectorCount; 2295 } 2296 2297 NumReduxLevels -= LongVectorCount; 2298 2299 // The minimal length of the vector is limited by the real length of vector 2300 // operations performed on the current platform. That's why several final 2301 // reduction opertions are perfomed on the vectors with the same 2302 // architecture-dependent length. 2303 ShuffleCost += NumReduxLevels * thisT()->getShuffleCost( 2304 TTI::SK_PermuteSingleSrc, Ty, None, 0, Ty); 2305 MinMaxCost += 2306 NumReduxLevels * 2307 (thisT()->getCmpSelInstrCost(CmpOpcode, Ty, CondTy, 2308 CmpInst::BAD_ICMP_PREDICATE, CostKind) + 2309 thisT()->getCmpSelInstrCost(Instruction::Select, Ty, CondTy, 2310 CmpInst::BAD_ICMP_PREDICATE, CostKind)); 2311 // The last min/max should be in vector registers and we counted it above. 2312 // So just need a single extractelement. 2313 return ShuffleCost + MinMaxCost + 2314 thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 0); 2315 } 2316 2317 InstructionCost getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned, 2318 Type *ResTy, VectorType *Ty, 2319 TTI::TargetCostKind CostKind) { 2320 // Without any native support, this is equivalent to the cost of 2321 // vecreduce.add(ext) or if IsMLA vecreduce.add(mul(ext, ext)) 2322 VectorType *ExtTy = VectorType::get(ResTy, Ty); 2323 InstructionCost RedCost = thisT()->getArithmeticReductionCost( 2324 Instruction::Add, ExtTy, None, CostKind); 2325 InstructionCost MulCost = 0; 2326 InstructionCost ExtCost = thisT()->getCastInstrCost( 2327 IsUnsigned ? Instruction::ZExt : Instruction::SExt, ExtTy, Ty, 2328 TTI::CastContextHint::None, CostKind); 2329 if (IsMLA) { 2330 MulCost = 2331 thisT()->getArithmeticInstrCost(Instruction::Mul, ExtTy, CostKind); 2332 ExtCost *= 2; 2333 } 2334 2335 return RedCost + MulCost + ExtCost; 2336 } 2337 2338 InstructionCost getVectorSplitCost() { return 1; } 2339 2340 /// @} 2341 }; 2342 2343 /// Concrete BasicTTIImpl that can be used if no further customization 2344 /// is needed. 2345 class BasicTTIImpl : public BasicTTIImplBase<BasicTTIImpl> { 2346 using BaseT = BasicTTIImplBase<BasicTTIImpl>; 2347 2348 friend class BasicTTIImplBase<BasicTTIImpl>; 2349 2350 const TargetSubtargetInfo *ST; 2351 const TargetLoweringBase *TLI; 2352 2353 const TargetSubtargetInfo *getST() const { return ST; } 2354 const TargetLoweringBase *getTLI() const { return TLI; } 2355 2356 public: 2357 explicit BasicTTIImpl(const TargetMachine *TM, const Function &F); 2358 }; 2359 2360 } // end namespace llvm 2361 2362 #endif // LLVM_CODEGEN_BASICTTIIMPL_H 2363