1 //=====-- GCNSubtarget.h - Define GCN Subtarget for AMDGPU ------*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //==-----------------------------------------------------------------------===// 8 // 9 /// \file 10 /// AMD GCN specific subclass of TargetSubtarget. 11 // 12 //===----------------------------------------------------------------------===// 13 14 #ifndef LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H 15 #define LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H 16 17 #include "AMDGPUCallLowering.h" 18 #include "AMDGPUSubtarget.h" 19 #include "SIFrameLowering.h" 20 #include "SIISelLowering.h" 21 #include "SIInstrInfo.h" 22 #include "llvm/CodeGen/SelectionDAGTargetInfo.h" 23 24 #define GET_SUBTARGETINFO_HEADER 25 #include "AMDGPUGenSubtargetInfo.inc" 26 27 namespace llvm { 28 29 class GCNTargetMachine; 30 31 class GCNSubtarget final : public AMDGPUGenSubtargetInfo, 32 public AMDGPUSubtarget { 33 public: 34 using AMDGPUSubtarget::getMaxWavesPerEU; 35 36 // Following 2 enums are documented at: 37 // - https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi 38 enum class TrapHandlerAbi { 39 NONE = 0x00, 40 AMDHSA = 0x01, 41 }; 42 43 enum class TrapID { 44 LLVMAMDHSATrap = 0x02, 45 LLVMAMDHSADebugTrap = 0x03, 46 }; 47 48 private: 49 /// GlobalISel related APIs. 50 std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo; 51 std::unique_ptr<InlineAsmLowering> InlineAsmLoweringInfo; 52 std::unique_ptr<InstructionSelector> InstSelector; 53 std::unique_ptr<LegalizerInfo> Legalizer; 54 std::unique_ptr<RegisterBankInfo> RegBankInfo; 55 56 protected: 57 // Basic subtarget description. 
58 Triple TargetTriple; 59 AMDGPU::IsaInfo::AMDGPUTargetID TargetID; 60 unsigned Gen = INVALID; 61 InstrItineraryData InstrItins; 62 int LDSBankCount = 0; 63 unsigned MaxPrivateElementSize = 0; 64 65 // Possibly statically set by tablegen, but may want to be overridden. 66 bool FastFMAF32 = false; 67 bool FastDenormalF32 = false; 68 bool HalfRate64Ops = false; 69 bool FullRate64Ops = false; 70 71 // Dynamically set bits that enable features. 72 bool FlatForGlobal = false; 73 bool AutoWaitcntBeforeBarrier = false; 74 bool BackOffBarrier = false; 75 bool UnalignedScratchAccess = false; 76 bool UnalignedAccessMode = false; 77 bool HasApertureRegs = false; 78 bool SupportsXNACK = false; 79 80 // This should not be used directly. 'TargetID' tracks the dynamic settings 81 // for XNACK. 82 bool EnableXNACK = false; 83 84 bool EnableTgSplit = false; 85 bool EnableCuMode = false; 86 bool TrapHandler = false; 87 88 // Used as options. 89 bool EnableLoadStoreOpt = false; 90 bool EnableUnsafeDSOffsetFolding = false; 91 bool EnableSIScheduler = false; 92 bool EnableDS128 = false; 93 bool EnablePRTStrictNull = false; 94 bool DumpCode = false; 95 96 // Subtarget statically properties set by tablegen 97 bool FP64 = false; 98 bool FMA = false; 99 bool MIMG_R128 = false; 100 bool CIInsts = false; 101 bool GFX8Insts = false; 102 bool GFX9Insts = false; 103 bool GFX90AInsts = false; 104 bool GFX940Insts = false; 105 bool GFX10Insts = false; 106 bool GFX11Insts = false; 107 bool GFX10_3Insts = false; 108 bool GFX7GFX8GFX9Insts = false; 109 bool SGPRInitBug = false; 110 bool UserSGPRInit16Bug = false; 111 bool NegativeScratchOffsetBug = false; 112 bool NegativeUnalignedScratchOffsetBug = false; 113 bool HasSMemRealTime = false; 114 bool HasIntClamp = false; 115 bool HasFmaMixInsts = false; 116 bool HasMovrel = false; 117 bool HasVGPRIndexMode = false; 118 bool HasScalarStores = false; 119 bool HasScalarAtomics = false; 120 bool HasSDWAOmod = false; 121 bool HasSDWAScalar = false; 122 
// SDWA / DPP capabilities.
bool HasSDWASdst = false;
bool HasSDWAMac = false;
bool HasSDWAOutModsVOPC = false;
bool HasDPP = false;
bool HasDPP8 = false;
bool Has64BitDPP = false;
bool HasPackedFP32Ops = false;

// Image instruction capabilities.
bool HasImageInsts = false;
bool HasExtendedImageInsts = false;
bool HasR128A16 = false;
bool HasA16 = false;
bool HasG16 = false;
bool HasNSAEncoding = false;
unsigned NSAMaxSize = 0;
bool GFX10_AEncoding = false;
bool GFX10_BEncoding = false;

// Instruction-set extensions.
bool HasDLInsts = false;
bool HasFmacF64Inst = false;
bool HasDot1Insts = false;
bool HasDot2Insts = false;
bool HasDot3Insts = false;
bool HasDot4Insts = false;
bool HasDot5Insts = false;
bool HasDot6Insts = false;
bool HasDot7Insts = false;
bool HasDot8Insts = false;
bool HasDot9Insts = false;
bool HasMAIInsts = false;
bool HasFP8Insts = false;
bool HasPkFmacF16Inst = false;
bool HasAtomicFaddRtnInsts = false;
bool HasAtomicFaddNoRtnInsts = false;
bool HasAtomicPkFaddNoRtnInsts = false;
bool HasFlatAtomicFaddF32Inst = false;
bool SupportsSRAMECC = false;

// This should not be used directly. 'TargetID' tracks the dynamic settings
// for SRAMECC.
bool EnableSRAMECC = false;

bool HasNoSdstCMPX = false;
bool HasVscnt = false;
bool HasGetWaveIdInst = false;
bool HasSMemTimeInst = false;
bool HasShaderCyclesRegister = false;
bool HasVOP3Literal = false;
bool HasNoDataDepHazard = false;

// Flat / scratch addressing.
bool FlatAddressSpace = false;
bool FlatInstOffsets = false;
bool FlatGlobalInsts = false;
bool FlatScratchInsts = false;
bool ScalarFlatScratchInsts = false;
bool HasArchitectedFlatScratch = false;
bool EnableFlatScratch = false;

bool AddNoCarryInsts = false;
bool HasUnpackedD16VMem = false;
bool LDSMisalignedBug = false;
bool HasMFMAInlineLiteralBug = false;
bool UnalignedBufferAccess = false;
bool UnalignedDSAccess = false;
bool HasPackedTID = false;
bool ScalarizeGlobal = false;

// Hazards and hardware bugs.
bool HasVcmpxPermlaneHazard = false;
bool HasVMEMtoScalarWriteHazard = false;
bool HasSMEMtoVectorWriteHazard = false;
bool HasInstFwdPrefetchBug = false;
bool HasVcmpxExecWARHazard = false;
bool HasLdsBranchVmemWARHazard = false;
bool HasNSAtoVMEMBug = false;
bool HasNSAClauseBug = false;
bool HasOffset3fBug = false;
bool HasFlatSegmentOffsetBug = false;
bool HasImageStoreD16Bug = false;
bool HasImageGather4D16Bug = false;
bool HasGFX11FullVGPRs = false;
bool HasMADIntraFwdBug = false;
bool HasVOPDInsts = false;
bool HasVALUTransUseHazard = false;

// Dummy feature to use for assembler in tablegen.
203 bool FeatureDisable = false; 204 205 SelectionDAGTargetInfo TSInfo; 206 private: 207 SIInstrInfo InstrInfo; 208 SITargetLowering TLInfo; 209 SIFrameLowering FrameLowering; 210 211 public: 212 GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, 213 const GCNTargetMachine &TM); 214 ~GCNSubtarget() override; 215 216 GCNSubtarget &initializeSubtargetDependencies(const Triple &TT, 217 StringRef GPU, StringRef FS); 218 getInstrInfo()219 const SIInstrInfo *getInstrInfo() const override { 220 return &InstrInfo; 221 } 222 getFrameLowering()223 const SIFrameLowering *getFrameLowering() const override { 224 return &FrameLowering; 225 } 226 getTargetLowering()227 const SITargetLowering *getTargetLowering() const override { 228 return &TLInfo; 229 } 230 getRegisterInfo()231 const SIRegisterInfo *getRegisterInfo() const override { 232 return &InstrInfo.getRegisterInfo(); 233 } 234 getCallLowering()235 const CallLowering *getCallLowering() const override { 236 return CallLoweringInfo.get(); 237 } 238 getInlineAsmLowering()239 const InlineAsmLowering *getInlineAsmLowering() const override { 240 return InlineAsmLoweringInfo.get(); 241 } 242 getInstructionSelector()243 InstructionSelector *getInstructionSelector() const override { 244 return InstSelector.get(); 245 } 246 getLegalizerInfo()247 const LegalizerInfo *getLegalizerInfo() const override { 248 return Legalizer.get(); 249 } 250 getRegBankInfo()251 const RegisterBankInfo *getRegBankInfo() const override { 252 return RegBankInfo.get(); 253 } 254 getTargetID()255 const AMDGPU::IsaInfo::AMDGPUTargetID &getTargetID() const { 256 return TargetID; 257 } 258 259 // Nothing implemented, just prevent crashes on use. 
getSelectionDAGInfo()260 const SelectionDAGTargetInfo *getSelectionDAGInfo() const override { 261 return &TSInfo; 262 } 263 getInstrItineraryData()264 const InstrItineraryData *getInstrItineraryData() const override { 265 return &InstrItins; 266 } 267 268 void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS); 269 getGeneration()270 Generation getGeneration() const { 271 return (Generation)Gen; 272 } 273 getMaxWaveScratchSize()274 unsigned getMaxWaveScratchSize() const { 275 // See COMPUTE_TMPRING_SIZE.WAVESIZE. 276 if (getGeneration() < GFX11) { 277 // 13-bit field in units of 256-dword. 278 return (256 * 4) * ((1 << 13) - 1); 279 } 280 // 15-bit field in units of 64-dword. 281 return (64 * 4) * ((1 << 15) - 1); 282 } 283 284 /// Return the number of high bits known to be zero for a frame index. getKnownHighZeroBitsForFrameIndex()285 unsigned getKnownHighZeroBitsForFrameIndex() const { 286 return countLeadingZeros(getMaxWaveScratchSize()) + getWavefrontSizeLog2(); 287 } 288 getLDSBankCount()289 int getLDSBankCount() const { 290 return LDSBankCount; 291 } 292 293 unsigned getMaxPrivateElementSize(bool ForBufferRSrc = false) const { 294 return (ForBufferRSrc || !enableFlatScratch()) ? MaxPrivateElementSize : 16; 295 } 296 297 unsigned getConstantBusLimit(unsigned Opcode) const; 298 299 /// Returns if the result of this instruction with a 16-bit result returned in 300 /// a 32-bit register implicitly zeroes the high 16-bits, rather than preserve 301 /// the original value. 
302 bool zeroesHigh16BitsOfDest(unsigned Opcode) const; 303 supportsWGP()304 bool supportsWGP() const { return getGeneration() >= GFX10; } 305 hasIntClamp()306 bool hasIntClamp() const { 307 return HasIntClamp; 308 } 309 hasFP64()310 bool hasFP64() const { 311 return FP64; 312 } 313 hasMIMG_R128()314 bool hasMIMG_R128() const { 315 return MIMG_R128; 316 } 317 hasHWFP64()318 bool hasHWFP64() const { 319 return FP64; 320 } 321 hasFastFMAF32()322 bool hasFastFMAF32() const { 323 return FastFMAF32; 324 } 325 hasHalfRate64Ops()326 bool hasHalfRate64Ops() const { 327 return HalfRate64Ops; 328 } 329 hasFullRate64Ops()330 bool hasFullRate64Ops() const { 331 return FullRate64Ops; 332 } 333 hasAddr64()334 bool hasAddr64() const { 335 return (getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS); 336 } 337 hasFlat()338 bool hasFlat() const { 339 return (getGeneration() > AMDGPUSubtarget::SOUTHERN_ISLANDS); 340 } 341 342 // Return true if the target only has the reverse operand versions of VALU 343 // shift instructions (e.g. v_lshrrev_b32, and no v_lshr_b32). 
hasOnlyRevVALUShifts()344 bool hasOnlyRevVALUShifts() const { 345 return getGeneration() >= VOLCANIC_ISLANDS; 346 } 347 hasFractBug()348 bool hasFractBug() const { 349 return getGeneration() == SOUTHERN_ISLANDS; 350 } 351 hasBFE()352 bool hasBFE() const { 353 return true; 354 } 355 hasBFI()356 bool hasBFI() const { 357 return true; 358 } 359 hasBFM()360 bool hasBFM() const { 361 return hasBFE(); 362 } 363 hasBCNT(unsigned Size)364 bool hasBCNT(unsigned Size) const { 365 return true; 366 } 367 hasFFBL()368 bool hasFFBL() const { 369 return true; 370 } 371 hasFFBH()372 bool hasFFBH() const { 373 return true; 374 } 375 hasMed3_16()376 bool hasMed3_16() const { 377 return getGeneration() >= AMDGPUSubtarget::GFX9; 378 } 379 hasMin3Max3_16()380 bool hasMin3Max3_16() const { 381 return getGeneration() >= AMDGPUSubtarget::GFX9; 382 } 383 hasFmaMixInsts()384 bool hasFmaMixInsts() const { 385 return HasFmaMixInsts; 386 } 387 hasCARRY()388 bool hasCARRY() const { 389 return true; 390 } 391 hasFMA()392 bool hasFMA() const { 393 return FMA; 394 } 395 hasSwap()396 bool hasSwap() const { 397 return GFX9Insts; 398 } 399 hasScalarPackInsts()400 bool hasScalarPackInsts() const { 401 return GFX9Insts; 402 } 403 hasScalarMulHiInsts()404 bool hasScalarMulHiInsts() const { 405 return GFX9Insts; 406 } 407 getTrapHandlerAbi()408 TrapHandlerAbi getTrapHandlerAbi() const { 409 return isAmdHsaOS() ? TrapHandlerAbi::AMDHSA : TrapHandlerAbi::NONE; 410 } 411 supportsGetDoorbellID()412 bool supportsGetDoorbellID() const { 413 // The S_GETREG DOORBELL_ID is supported by all GFX9 onward targets. 414 return getGeneration() >= GFX9; 415 } 416 417 /// True if the offset field of DS instructions works as expected. On SI, the 418 /// offset uses a 16-bit adder and does not always wrap properly. 
hasUsableDSOffset()419 bool hasUsableDSOffset() const { 420 return getGeneration() >= SEA_ISLANDS; 421 } 422 unsafeDSOffsetFoldingEnabled()423 bool unsafeDSOffsetFoldingEnabled() const { 424 return EnableUnsafeDSOffsetFolding; 425 } 426 427 /// Condition output from div_scale is usable. hasUsableDivScaleConditionOutput()428 bool hasUsableDivScaleConditionOutput() const { 429 return getGeneration() != SOUTHERN_ISLANDS; 430 } 431 432 /// Extra wait hazard is needed in some cases before 433 /// s_cbranch_vccnz/s_cbranch_vccz. hasReadVCCZBug()434 bool hasReadVCCZBug() const { 435 return getGeneration() <= SEA_ISLANDS; 436 } 437 438 /// Writes to VCC_LO/VCC_HI update the VCCZ flag. partialVCCWritesUpdateVCCZ()439 bool partialVCCWritesUpdateVCCZ() const { 440 return getGeneration() >= GFX10; 441 } 442 443 /// A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR 444 /// was written by a VALU instruction. hasSMRDReadVALUDefHazard()445 bool hasSMRDReadVALUDefHazard() const { 446 return getGeneration() == SOUTHERN_ISLANDS; 447 } 448 449 /// A read of an SGPR by a VMEM instruction requires 5 wait states when the 450 /// SGPR was written by a VALU Instruction. hasVMEMReadSGPRVALUDefHazard()451 bool hasVMEMReadSGPRVALUDefHazard() const { 452 return getGeneration() >= VOLCANIC_ISLANDS; 453 } 454 hasRFEHazards()455 bool hasRFEHazards() const { 456 return getGeneration() >= VOLCANIC_ISLANDS; 457 } 458 459 /// Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32. getSetRegWaitStates()460 unsigned getSetRegWaitStates() const { 461 return getGeneration() <= SEA_ISLANDS ? 1 : 2; 462 } 463 dumpCode()464 bool dumpCode() const { 465 return DumpCode; 466 } 467 468 /// Return the amount of LDS that can be used that will not restrict the 469 /// occupancy lower than WaveCount. 
470 unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount, 471 const Function &) const; 472 supportsMinMaxDenormModes()473 bool supportsMinMaxDenormModes() const { 474 return getGeneration() >= AMDGPUSubtarget::GFX9; 475 } 476 477 /// \returns If target supports S_DENORM_MODE. hasDenormModeInst()478 bool hasDenormModeInst() const { 479 return getGeneration() >= AMDGPUSubtarget::GFX10; 480 } 481 useFlatForGlobal()482 bool useFlatForGlobal() const { 483 return FlatForGlobal; 484 } 485 486 /// \returns If target supports ds_read/write_b128 and user enables generation 487 /// of ds_read/write_b128. useDS128()488 bool useDS128() const { 489 return CIInsts && EnableDS128; 490 } 491 492 /// \return If target supports ds_read/write_b96/128. hasDS96AndDS128()493 bool hasDS96AndDS128() const { 494 return CIInsts; 495 } 496 497 /// Have v_trunc_f64, v_ceil_f64, v_rndne_f64 haveRoundOpsF64()498 bool haveRoundOpsF64() const { 499 return CIInsts; 500 } 501 502 /// \returns If MUBUF instructions always perform range checking, even for 503 /// buffer resources used for private memory access. privateMemoryResourceIsRangeChecked()504 bool privateMemoryResourceIsRangeChecked() const { 505 return getGeneration() < AMDGPUSubtarget::GFX9; 506 } 507 508 /// \returns If target requires PRT Struct NULL support (zero result registers 509 /// for sparse texture support). usePRTStrictNull()510 bool usePRTStrictNull() const { 511 return EnablePRTStrictNull; 512 } 513 hasAutoWaitcntBeforeBarrier()514 bool hasAutoWaitcntBeforeBarrier() const { 515 return AutoWaitcntBeforeBarrier; 516 } 517 518 /// \returns true if the target supports backing off of s_barrier instructions 519 /// when an exception is raised. 
supportsBackOffBarrier()520 bool supportsBackOffBarrier() const { 521 return BackOffBarrier; 522 } 523 hasUnalignedBufferAccess()524 bool hasUnalignedBufferAccess() const { 525 return UnalignedBufferAccess; 526 } 527 hasUnalignedBufferAccessEnabled()528 bool hasUnalignedBufferAccessEnabled() const { 529 return UnalignedBufferAccess && UnalignedAccessMode; 530 } 531 hasUnalignedDSAccess()532 bool hasUnalignedDSAccess() const { 533 return UnalignedDSAccess; 534 } 535 hasUnalignedDSAccessEnabled()536 bool hasUnalignedDSAccessEnabled() const { 537 return UnalignedDSAccess && UnalignedAccessMode; 538 } 539 hasUnalignedScratchAccess()540 bool hasUnalignedScratchAccess() const { 541 return UnalignedScratchAccess; 542 } 543 hasUnalignedAccessMode()544 bool hasUnalignedAccessMode() const { 545 return UnalignedAccessMode; 546 } 547 hasApertureRegs()548 bool hasApertureRegs() const { 549 return HasApertureRegs; 550 } 551 isTrapHandlerEnabled()552 bool isTrapHandlerEnabled() const { 553 return TrapHandler; 554 } 555 isXNACKEnabled()556 bool isXNACKEnabled() const { 557 return TargetID.isXnackOnOrAny(); 558 } 559 isTgSplitEnabled()560 bool isTgSplitEnabled() const { 561 return EnableTgSplit; 562 } 563 isCuModeEnabled()564 bool isCuModeEnabled() const { 565 return EnableCuMode; 566 } 567 hasFlatAddressSpace()568 bool hasFlatAddressSpace() const { 569 return FlatAddressSpace; 570 } 571 hasFlatScrRegister()572 bool hasFlatScrRegister() const { 573 return hasFlatAddressSpace(); 574 } 575 hasFlatInstOffsets()576 bool hasFlatInstOffsets() const { 577 return FlatInstOffsets; 578 } 579 hasFlatGlobalInsts()580 bool hasFlatGlobalInsts() const { 581 return FlatGlobalInsts; 582 } 583 hasFlatScratchInsts()584 bool hasFlatScratchInsts() const { 585 return FlatScratchInsts; 586 } 587 588 // Check if target supports ST addressing mode with FLAT scratch instructions. 
589 // The ST addressing mode means no registers are used, either VGPR or SGPR, 590 // but only immediate offset is swizzled and added to the FLAT scratch base. hasFlatScratchSTMode()591 bool hasFlatScratchSTMode() const { 592 return hasFlatScratchInsts() && (hasGFX10_3Insts() || hasGFX940Insts()); 593 } 594 hasFlatScratchSVSMode()595 bool hasFlatScratchSVSMode() const { return GFX940Insts || GFX11Insts; } 596 hasScalarFlatScratchInsts()597 bool hasScalarFlatScratchInsts() const { 598 return ScalarFlatScratchInsts; 599 } 600 enableFlatScratch()601 bool enableFlatScratch() const { 602 return flatScratchIsArchitected() || 603 (EnableFlatScratch && hasFlatScratchInsts()); 604 } 605 hasGlobalAddTidInsts()606 bool hasGlobalAddTidInsts() const { 607 return GFX10_BEncoding; 608 } 609 hasAtomicCSub()610 bool hasAtomicCSub() const { 611 return GFX10_BEncoding; 612 } 613 hasMultiDwordFlatScratchAddressing()614 bool hasMultiDwordFlatScratchAddressing() const { 615 return getGeneration() >= GFX9; 616 } 617 hasFlatSegmentOffsetBug()618 bool hasFlatSegmentOffsetBug() const { 619 return HasFlatSegmentOffsetBug; 620 } 621 hasFlatLgkmVMemCountInOrder()622 bool hasFlatLgkmVMemCountInOrder() const { 623 return getGeneration() > GFX9; 624 } 625 hasD16LoadStore()626 bool hasD16LoadStore() const { 627 return getGeneration() >= GFX9; 628 } 629 d16PreservesUnusedBits()630 bool d16PreservesUnusedBits() const { 631 return hasD16LoadStore() && !TargetID.isSramEccOnOrAny(); 632 } 633 hasD16Images()634 bool hasD16Images() const { 635 return getGeneration() >= VOLCANIC_ISLANDS; 636 } 637 638 /// Return if most LDS instructions have an m0 use that require m0 to be 639 /// initialized. ldsRequiresM0Init()640 bool ldsRequiresM0Init() const { 641 return getGeneration() < GFX9; 642 } 643 644 // True if the hardware rewinds and replays GWS operations if a wave is 645 // preempted. 
646 // 647 // If this is false, a GWS operation requires testing if a nack set the 648 // MEM_VIOL bit, and repeating if so. hasGWSAutoReplay()649 bool hasGWSAutoReplay() const { 650 return getGeneration() >= GFX9; 651 } 652 653 /// \returns if target has ds_gws_sema_release_all instruction. hasGWSSemaReleaseAll()654 bool hasGWSSemaReleaseAll() const { 655 return CIInsts; 656 } 657 658 /// \returns true if the target has integer add/sub instructions that do not 659 /// produce a carry-out. This includes v_add_[iu]32, v_sub_[iu]32, 660 /// v_add_[iu]16, and v_sub_[iu]16, all of which support the clamp modifier 661 /// for saturation. hasAddNoCarry()662 bool hasAddNoCarry() const { 663 return AddNoCarryInsts; 664 } 665 hasUnpackedD16VMem()666 bool hasUnpackedD16VMem() const { 667 return HasUnpackedD16VMem; 668 } 669 670 // Covers VS/PS/CS graphics shaders isMesaGfxShader(const Function & F)671 bool isMesaGfxShader(const Function &F) const { 672 return isMesa3DOS() && AMDGPU::isShader(F.getCallingConv()); 673 } 674 hasMad64_32()675 bool hasMad64_32() const { 676 return getGeneration() >= SEA_ISLANDS; 677 } 678 hasSDWAOmod()679 bool hasSDWAOmod() const { 680 return HasSDWAOmod; 681 } 682 hasSDWAScalar()683 bool hasSDWAScalar() const { 684 return HasSDWAScalar; 685 } 686 hasSDWASdst()687 bool hasSDWASdst() const { 688 return HasSDWASdst; 689 } 690 hasSDWAMac()691 bool hasSDWAMac() const { 692 return HasSDWAMac; 693 } 694 hasSDWAOutModsVOPC()695 bool hasSDWAOutModsVOPC() const { 696 return HasSDWAOutModsVOPC; 697 } 698 hasDLInsts()699 bool hasDLInsts() const { 700 return HasDLInsts; 701 } 702 hasFmacF64Inst()703 bool hasFmacF64Inst() const { return HasFmacF64Inst; } 704 hasDot1Insts()705 bool hasDot1Insts() const { 706 return HasDot1Insts; 707 } 708 hasDot2Insts()709 bool hasDot2Insts() const { 710 return HasDot2Insts; 711 } 712 hasDot3Insts()713 bool hasDot3Insts() const { 714 return HasDot3Insts; 715 } 716 hasDot4Insts()717 bool hasDot4Insts() const { 718 return 
HasDot4Insts; 719 } 720 hasDot5Insts()721 bool hasDot5Insts() const { 722 return HasDot5Insts; 723 } 724 hasDot6Insts()725 bool hasDot6Insts() const { 726 return HasDot6Insts; 727 } 728 hasDot7Insts()729 bool hasDot7Insts() const { 730 return HasDot7Insts; 731 } 732 hasDot8Insts()733 bool hasDot8Insts() const { 734 return HasDot8Insts; 735 } 736 hasDot9Insts()737 bool hasDot9Insts() const { 738 return HasDot9Insts; 739 } 740 hasMAIInsts()741 bool hasMAIInsts() const { 742 return HasMAIInsts; 743 } 744 hasFP8Insts()745 bool hasFP8Insts() const { 746 return HasFP8Insts; 747 } 748 hasPkFmacF16Inst()749 bool hasPkFmacF16Inst() const { 750 return HasPkFmacF16Inst; 751 } 752 hasAtomicFaddInsts()753 bool hasAtomicFaddInsts() const { 754 return HasAtomicFaddRtnInsts || HasAtomicFaddNoRtnInsts; 755 } 756 hasAtomicFaddRtnInsts()757 bool hasAtomicFaddRtnInsts() const { return HasAtomicFaddRtnInsts; } 758 hasAtomicFaddNoRtnInsts()759 bool hasAtomicFaddNoRtnInsts() const { return HasAtomicFaddNoRtnInsts; } 760 hasAtomicPkFaddNoRtnInsts()761 bool hasAtomicPkFaddNoRtnInsts() const { return HasAtomicPkFaddNoRtnInsts; } 762 hasFlatAtomicFaddF32Inst()763 bool hasFlatAtomicFaddF32Inst() const { return HasFlatAtomicFaddF32Inst; } 764 hasNoSdstCMPX()765 bool hasNoSdstCMPX() const { 766 return HasNoSdstCMPX; 767 } 768 hasVscnt()769 bool hasVscnt() const { 770 return HasVscnt; 771 } 772 hasGetWaveIdInst()773 bool hasGetWaveIdInst() const { 774 return HasGetWaveIdInst; 775 } 776 hasSMemTimeInst()777 bool hasSMemTimeInst() const { 778 return HasSMemTimeInst; 779 } 780 hasShaderCyclesRegister()781 bool hasShaderCyclesRegister() const { 782 return HasShaderCyclesRegister; 783 } 784 hasVOP3Literal()785 bool hasVOP3Literal() const { 786 return HasVOP3Literal; 787 } 788 hasNoDataDepHazard()789 bool hasNoDataDepHazard() const { 790 return HasNoDataDepHazard; 791 } 792 vmemWriteNeedsExpWaitcnt()793 bool vmemWriteNeedsExpWaitcnt() const { 794 return getGeneration() < SEA_ISLANDS; 795 } 796 
hasInstPrefetch()797 bool hasInstPrefetch() const { return getGeneration() >= GFX10; } 798 799 // Scratch is allocated in 256 dword per wave blocks for the entire 800 // wavefront. When viewed from the perspective of an arbitrary workitem, this 801 // is 4-byte aligned. 802 // 803 // Only 4-byte alignment is really needed to access anything. Transformations 804 // on the pointer value itself may rely on the alignment / known low bits of 805 // the pointer. Set this to something above the minimum to avoid needing 806 // dynamic realignment in common cases. getStackAlignment()807 Align getStackAlignment() const { return Align(16); } 808 enableMachineScheduler()809 bool enableMachineScheduler() const override { 810 return true; 811 } 812 813 bool useAA() const override; 814 enableSubRegLiveness()815 bool enableSubRegLiveness() const override { 816 return true; 817 } 818 setScalarizeGlobalBehavior(bool b)819 void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b; } getScalarizeGlobalBehavior()820 bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal; } 821 822 // static wrappers 823 static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI); 824 825 // XXX - Why is this here if it isn't in the default pass set? 
enableEarlyIfConversion()826 bool enableEarlyIfConversion() const override { 827 return true; 828 } 829 830 void overrideSchedPolicy(MachineSchedPolicy &Policy, 831 unsigned NumRegionInstrs) const override; 832 getMaxNumUserSGPRs()833 unsigned getMaxNumUserSGPRs() const { 834 return 16; 835 } 836 hasSMemRealTime()837 bool hasSMemRealTime() const { 838 return HasSMemRealTime; 839 } 840 hasMovrel()841 bool hasMovrel() const { 842 return HasMovrel; 843 } 844 hasVGPRIndexMode()845 bool hasVGPRIndexMode() const { 846 return HasVGPRIndexMode; 847 } 848 849 bool useVGPRIndexMode() const; 850 hasScalarCompareEq64()851 bool hasScalarCompareEq64() const { 852 return getGeneration() >= VOLCANIC_ISLANDS; 853 } 854 hasScalarStores()855 bool hasScalarStores() const { 856 return HasScalarStores; 857 } 858 hasScalarAtomics()859 bool hasScalarAtomics() const { 860 return HasScalarAtomics; 861 } 862 hasLDSFPAtomicAdd()863 bool hasLDSFPAtomicAdd() const { return GFX8Insts; } 864 865 /// \returns true if the subtarget has the v_permlanex16_b32 instruction. hasPermLaneX16()866 bool hasPermLaneX16() const { return getGeneration() >= GFX10; } 867 868 /// \returns true if the subtarget has the v_permlane64_b32 instruction. 
hasPermLane64()869 bool hasPermLane64() const { return getGeneration() >= GFX11; } 870 hasDPP()871 bool hasDPP() const { 872 return HasDPP; 873 } 874 hasDPPBroadcasts()875 bool hasDPPBroadcasts() const { 876 return HasDPP && getGeneration() < GFX10; 877 } 878 hasDPPWavefrontShifts()879 bool hasDPPWavefrontShifts() const { 880 return HasDPP && getGeneration() < GFX10; 881 } 882 hasDPP8()883 bool hasDPP8() const { 884 return HasDPP8; 885 } 886 has64BitDPP()887 bool has64BitDPP() const { 888 return Has64BitDPP; 889 } 890 hasPackedFP32Ops()891 bool hasPackedFP32Ops() const { 892 return HasPackedFP32Ops; 893 } 894 hasFmaakFmamkF32Insts()895 bool hasFmaakFmamkF32Insts() const { 896 return getGeneration() >= GFX10 || hasGFX940Insts(); 897 } 898 hasImageInsts()899 bool hasImageInsts() const { 900 return HasImageInsts; 901 } 902 hasExtendedImageInsts()903 bool hasExtendedImageInsts() const { 904 return HasExtendedImageInsts; 905 } 906 hasR128A16()907 bool hasR128A16() const { 908 return HasR128A16; 909 } 910 hasA16()911 bool hasA16() const { return HasA16; } 912 hasG16()913 bool hasG16() const { return HasG16; } 914 hasOffset3fBug()915 bool hasOffset3fBug() const { 916 return HasOffset3fBug; 917 } 918 hasImageStoreD16Bug()919 bool hasImageStoreD16Bug() const { return HasImageStoreD16Bug; } 920 hasImageGather4D16Bug()921 bool hasImageGather4D16Bug() const { return HasImageGather4D16Bug; } 922 hasMADIntraFwdBug()923 bool hasMADIntraFwdBug() const { return HasMADIntraFwdBug; } 924 hasNSAEncoding()925 bool hasNSAEncoding() const { return HasNSAEncoding; } 926 getNSAMaxSize()927 unsigned getNSAMaxSize() const { return NSAMaxSize; } 928 hasGFX10_AEncoding()929 bool hasGFX10_AEncoding() const { 930 return GFX10_AEncoding; 931 } 932 hasGFX10_BEncoding()933 bool hasGFX10_BEncoding() const { 934 return GFX10_BEncoding; 935 } 936 hasGFX10_3Insts()937 bool hasGFX10_3Insts() const { 938 return GFX10_3Insts; 939 } 940 941 bool hasMadF16() const; 942 hasMovB64()943 bool hasMovB64() const 
{ return GFX940Insts; } 944 hasLshlAddB64()945 bool hasLshlAddB64() const { return GFX940Insts; } 946 enableSIScheduler()947 bool enableSIScheduler() const { 948 return EnableSIScheduler; 949 } 950 loadStoreOptEnabled()951 bool loadStoreOptEnabled() const { 952 return EnableLoadStoreOpt; 953 } 954 hasSGPRInitBug()955 bool hasSGPRInitBug() const { 956 return SGPRInitBug; 957 } 958 hasUserSGPRInit16Bug()959 bool hasUserSGPRInit16Bug() const { 960 return UserSGPRInit16Bug && isWave32(); 961 } 962 hasNegativeScratchOffsetBug()963 bool hasNegativeScratchOffsetBug() const { return NegativeScratchOffsetBug; } 964 hasNegativeUnalignedScratchOffsetBug()965 bool hasNegativeUnalignedScratchOffsetBug() const { 966 return NegativeUnalignedScratchOffsetBug; 967 } 968 hasMFMAInlineLiteralBug()969 bool hasMFMAInlineLiteralBug() const { 970 return HasMFMAInlineLiteralBug; 971 } 972 has12DWordStoreHazard()973 bool has12DWordStoreHazard() const { 974 return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS; 975 } 976 977 // \returns true if the subtarget supports DWORDX3 load/store instructions. 
hasDwordx3LoadStores()978 bool hasDwordx3LoadStores() const { 979 return CIInsts; 980 } 981 hasReadM0MovRelInterpHazard()982 bool hasReadM0MovRelInterpHazard() const { 983 return getGeneration() == AMDGPUSubtarget::GFX9; 984 } 985 hasReadM0SendMsgHazard()986 bool hasReadM0SendMsgHazard() const { 987 return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS && 988 getGeneration() <= AMDGPUSubtarget::GFX9; 989 } 990 hasReadM0LdsDmaHazard()991 bool hasReadM0LdsDmaHazard() const { 992 return getGeneration() == AMDGPUSubtarget::GFX9; 993 } 994 hasReadM0LdsDirectHazard()995 bool hasReadM0LdsDirectHazard() const { 996 return getGeneration() == AMDGPUSubtarget::GFX9; 997 } 998 hasVcmpxPermlaneHazard()999 bool hasVcmpxPermlaneHazard() const { 1000 return HasVcmpxPermlaneHazard; 1001 } 1002 hasVMEMtoScalarWriteHazard()1003 bool hasVMEMtoScalarWriteHazard() const { 1004 return HasVMEMtoScalarWriteHazard; 1005 } 1006 hasSMEMtoVectorWriteHazard()1007 bool hasSMEMtoVectorWriteHazard() const { 1008 return HasSMEMtoVectorWriteHazard; 1009 } 1010 hasLDSMisalignedBug()1011 bool hasLDSMisalignedBug() const { 1012 return LDSMisalignedBug && !EnableCuMode; 1013 } 1014 hasInstFwdPrefetchBug()1015 bool hasInstFwdPrefetchBug() const { 1016 return HasInstFwdPrefetchBug; 1017 } 1018 hasVcmpxExecWARHazard()1019 bool hasVcmpxExecWARHazard() const { 1020 return HasVcmpxExecWARHazard; 1021 } 1022 hasLdsBranchVmemWARHazard()1023 bool hasLdsBranchVmemWARHazard() const { 1024 return HasLdsBranchVmemWARHazard; 1025 } 1026 1027 // Shift amount of a 64 bit shift cannot be a highest allocated register 1028 // if also at the end of the allocation block. hasShift64HighRegBug()1029 bool hasShift64HighRegBug() const { 1030 return GFX90AInsts && !GFX940Insts; 1031 } 1032 1033 // Has one cycle hazard on transcendental instruction feeding a 1034 // non transcendental VALU. 
hasTransForwardingHazard()1035 bool hasTransForwardingHazard() const { return GFX940Insts; } 1036 1037 // Has one cycle hazard on a VALU instruction partially writing dst with 1038 // a shift of result bits feeding another VALU instruction. hasDstSelForwardingHazard()1039 bool hasDstSelForwardingHazard() const { return GFX940Insts; } 1040 1041 // Cannot use op_sel with v_dot instructions. hasDOTOpSelHazard()1042 bool hasDOTOpSelHazard() const { return GFX940Insts; } 1043 1044 // Does not have HW interlocs for VALU writing and then reading SGPRs. hasVDecCoExecHazard()1045 bool hasVDecCoExecHazard() const { 1046 return GFX940Insts; 1047 } 1048 hasNSAtoVMEMBug()1049 bool hasNSAtoVMEMBug() const { 1050 return HasNSAtoVMEMBug; 1051 } 1052 hasNSAClauseBug()1053 bool hasNSAClauseBug() const { return HasNSAClauseBug; } 1054 hasHardClauses()1055 bool hasHardClauses() const { return getGeneration() >= GFX10; } 1056 hasGFX90AInsts()1057 bool hasGFX90AInsts() const { return GFX90AInsts; } 1058 hasFPAtomicToDenormModeHazard()1059 bool hasFPAtomicToDenormModeHazard() const { 1060 return getGeneration() == GFX10; 1061 } 1062 hasVOP3DPP()1063 bool hasVOP3DPP() const { return getGeneration() >= GFX11; } 1064 hasLdsDirect()1065 bool hasLdsDirect() const { return getGeneration() >= GFX11; } 1066 hasVALUPartialForwardingHazard()1067 bool hasVALUPartialForwardingHazard() const { 1068 return getGeneration() >= GFX11; 1069 } 1070 hasVALUTransUseHazard()1071 bool hasVALUTransUseHazard() const { return HasVALUTransUseHazard; } 1072 hasVALUMaskWriteHazard()1073 bool hasVALUMaskWriteHazard() const { return getGeneration() >= GFX11; } 1074 1075 /// Return if operations acting on VGPR tuples require even alignment. needsAlignedVGPRs()1076 bool needsAlignedVGPRs() const { return GFX90AInsts; } 1077 1078 /// Return true if the target has the S_PACK_HL_B32_B16 instruction. 
hasSPackHL()1079 bool hasSPackHL() const { return GFX11Insts; } 1080 1081 /// Return true if the target's EXP instruction has the COMPR flag, which 1082 /// affects the meaning of the EN (enable) bits. hasCompressedExport()1083 bool hasCompressedExport() const { return !GFX11Insts; } 1084 1085 /// Return true if the target's EXP instruction supports the NULL export 1086 /// target. hasNullExportTarget()1087 bool hasNullExportTarget() const { return !GFX11Insts; } 1088 hasGFX11FullVGPRs()1089 bool hasGFX11FullVGPRs() const { return HasGFX11FullVGPRs; } 1090 hasVOPDInsts()1091 bool hasVOPDInsts() const { return HasVOPDInsts; } 1092 hasFlatScratchSVSSwizzleBug()1093 bool hasFlatScratchSVSSwizzleBug() const { return getGeneration() == GFX11; } 1094 1095 /// Return true if the target has the S_DELAY_ALU instruction. hasDelayAlu()1096 bool hasDelayAlu() const { return GFX11Insts; } 1097 hasPackedTID()1098 bool hasPackedTID() const { return HasPackedTID; } 1099 1100 // GFX940 is a derivation to GFX90A. hasGFX940Insts() being true implies that 1101 // hasGFX90AInsts is also true. hasGFX940Insts()1102 bool hasGFX940Insts() const { return GFX940Insts; } 1103 1104 /// Return the maximum number of waves per SIMD for kernels using \p SGPRs 1105 /// SGPRs 1106 unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const; 1107 1108 /// Return the maximum number of waves per SIMD for kernels using \p VGPRs 1109 /// VGPRs 1110 unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const; 1111 1112 /// Return occupancy for the given function. Used LDS and a number of 1113 /// registers if provided. 1114 /// Note, occupancy can be affected by the scratch allocation as well, but 1115 /// we do not have enough information to compute it. 
1116 unsigned computeOccupancy(const Function &F, unsigned LDSSize = 0, 1117 unsigned NumSGPRs = 0, unsigned NumVGPRs = 0) const; 1118 1119 /// \returns true if the flat_scratch register should be initialized with the 1120 /// pointer to the wave's scratch memory rather than a size and offset. flatScratchIsPointer()1121 bool flatScratchIsPointer() const { 1122 return getGeneration() >= AMDGPUSubtarget::GFX9; 1123 } 1124 1125 /// \returns true if the flat_scratch register is initialized by the HW. 1126 /// In this case it is readonly. flatScratchIsArchitected()1127 bool flatScratchIsArchitected() const { return HasArchitectedFlatScratch; } 1128 1129 /// \returns true if the machine has merged shaders in which s0-s7 are 1130 /// reserved by the hardware and user SGPRs start at s8 hasMergedShaders()1131 bool hasMergedShaders() const { 1132 return getGeneration() >= GFX9; 1133 } 1134 1135 // \returns true if the target supports the pre-NGG legacy geometry path. hasLegacyGeometry()1136 bool hasLegacyGeometry() const { return getGeneration() < GFX11; } 1137 1138 /// \returns SGPR allocation granularity supported by the subtarget. getSGPRAllocGranule()1139 unsigned getSGPRAllocGranule() const { 1140 return AMDGPU::IsaInfo::getSGPRAllocGranule(this); 1141 } 1142 1143 /// \returns SGPR encoding granularity supported by the subtarget. getSGPREncodingGranule()1144 unsigned getSGPREncodingGranule() const { 1145 return AMDGPU::IsaInfo::getSGPREncodingGranule(this); 1146 } 1147 1148 /// \returns Total number of SGPRs supported by the subtarget. getTotalNumSGPRs()1149 unsigned getTotalNumSGPRs() const { 1150 return AMDGPU::IsaInfo::getTotalNumSGPRs(this); 1151 } 1152 1153 /// \returns Addressable number of SGPRs supported by the subtarget. 
getAddressableNumSGPRs()1154 unsigned getAddressableNumSGPRs() const { 1155 return AMDGPU::IsaInfo::getAddressableNumSGPRs(this); 1156 } 1157 1158 /// \returns Minimum number of SGPRs that meets the given number of waves per 1159 /// execution unit requirement supported by the subtarget. getMinNumSGPRs(unsigned WavesPerEU)1160 unsigned getMinNumSGPRs(unsigned WavesPerEU) const { 1161 return AMDGPU::IsaInfo::getMinNumSGPRs(this, WavesPerEU); 1162 } 1163 1164 /// \returns Maximum number of SGPRs that meets the given number of waves per 1165 /// execution unit requirement supported by the subtarget. getMaxNumSGPRs(unsigned WavesPerEU,bool Addressable)1166 unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const { 1167 return AMDGPU::IsaInfo::getMaxNumSGPRs(this, WavesPerEU, Addressable); 1168 } 1169 1170 /// \returns Reserved number of SGPRs. This is common 1171 /// utility function called by MachineFunction and 1172 /// Function variants of getReservedNumSGPRs. 1173 unsigned getBaseReservedNumSGPRs(const bool HasFlatScratch) const; 1174 /// \returns Reserved number of SGPRs for given machine function \p MF. 1175 unsigned getReservedNumSGPRs(const MachineFunction &MF) const; 1176 1177 /// \returns Reserved number of SGPRs for given function \p F. 1178 unsigned getReservedNumSGPRs(const Function &F) const; 1179 1180 /// \returns max num SGPRs. This is the common utility 1181 /// function called by MachineFunction and Function 1182 /// variants of getMaxNumSGPRs. 1183 unsigned getBaseMaxNumSGPRs(const Function &F, 1184 std::pair<unsigned, unsigned> WavesPerEU, 1185 unsigned PreloadedSGPRs, 1186 unsigned ReservedNumSGPRs) const; 1187 1188 /// \returns Maximum number of SGPRs that meets number of waves per execution 1189 /// unit requirement for function \p MF, or number of SGPRs explicitly 1190 /// requested using "amdgpu-num-sgpr" attribute attached to function \p MF. 
1191 /// 1192 /// \returns Value that meets number of waves per execution unit requirement 1193 /// if explicitly requested value cannot be converted to integer, violates 1194 /// subtarget's specifications, or does not meet number of waves per execution 1195 /// unit requirement. 1196 unsigned getMaxNumSGPRs(const MachineFunction &MF) const; 1197 1198 /// \returns Maximum number of SGPRs that meets number of waves per execution 1199 /// unit requirement for function \p F, or number of SGPRs explicitly 1200 /// requested using "amdgpu-num-sgpr" attribute attached to function \p F. 1201 /// 1202 /// \returns Value that meets number of waves per execution unit requirement 1203 /// if explicitly requested value cannot be converted to integer, violates 1204 /// subtarget's specifications, or does not meet number of waves per execution 1205 /// unit requirement. 1206 unsigned getMaxNumSGPRs(const Function &F) const; 1207 1208 /// \returns VGPR allocation granularity supported by the subtarget. getVGPRAllocGranule()1209 unsigned getVGPRAllocGranule() const { 1210 return AMDGPU::IsaInfo::getVGPRAllocGranule(this); 1211 } 1212 1213 /// \returns VGPR encoding granularity supported by the subtarget. getVGPREncodingGranule()1214 unsigned getVGPREncodingGranule() const { 1215 return AMDGPU::IsaInfo::getVGPREncodingGranule(this); 1216 } 1217 1218 /// \returns Total number of VGPRs supported by the subtarget. getTotalNumVGPRs()1219 unsigned getTotalNumVGPRs() const { 1220 return AMDGPU::IsaInfo::getTotalNumVGPRs(this); 1221 } 1222 1223 /// \returns Addressable number of VGPRs supported by the subtarget. getAddressableNumVGPRs()1224 unsigned getAddressableNumVGPRs() const { 1225 return AMDGPU::IsaInfo::getAddressableNumVGPRs(this); 1226 } 1227 1228 /// \returns the minimum number of VGPRs that will prevent achieving more than 1229 /// the specified number of waves \p WavesPerEU. 
getMinNumVGPRs(unsigned WavesPerEU)1230 unsigned getMinNumVGPRs(unsigned WavesPerEU) const { 1231 return AMDGPU::IsaInfo::getMinNumVGPRs(this, WavesPerEU); 1232 } 1233 1234 /// \returns the maximum number of VGPRs that can be used and still achieved 1235 /// at least the specified number of waves \p WavesPerEU. getMaxNumVGPRs(unsigned WavesPerEU)1236 unsigned getMaxNumVGPRs(unsigned WavesPerEU) const { 1237 return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU); 1238 } 1239 1240 /// \returns max num VGPRs. This is the common utility function 1241 /// called by MachineFunction and Function variants of getMaxNumVGPRs. 1242 unsigned getBaseMaxNumVGPRs(const Function &F, 1243 std::pair<unsigned, unsigned> WavesPerEU) const; 1244 /// \returns Maximum number of VGPRs that meets number of waves per execution 1245 /// unit requirement for function \p F, or number of VGPRs explicitly 1246 /// requested using "amdgpu-num-vgpr" attribute attached to function \p F. 1247 /// 1248 /// \returns Value that meets number of waves per execution unit requirement 1249 /// if explicitly requested value cannot be converted to integer, violates 1250 /// subtarget's specifications, or does not meet number of waves per execution 1251 /// unit requirement. 1252 unsigned getMaxNumVGPRs(const Function &F) const; 1253 getMaxNumAGPRs(const Function & F)1254 unsigned getMaxNumAGPRs(const Function &F) const { 1255 return getMaxNumVGPRs(F); 1256 } 1257 1258 /// \returns Maximum number of VGPRs that meets number of waves per execution 1259 /// unit requirement for function \p MF, or number of VGPRs explicitly 1260 /// requested using "amdgpu-num-vgpr" attribute attached to function \p MF. 1261 /// 1262 /// \returns Value that meets number of waves per execution unit requirement 1263 /// if explicitly requested value cannot be converted to integer, violates 1264 /// subtarget's specifications, or does not meet number of waves per execution 1265 /// unit requirement. 
1266 unsigned getMaxNumVGPRs(const MachineFunction &MF) const; 1267 1268 void getPostRAMutations( 1269 std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) 1270 const override; 1271 1272 std::unique_ptr<ScheduleDAGMutation> 1273 createFillMFMAShadowMutation(const TargetInstrInfo *TII) const; 1274 isWave32()1275 bool isWave32() const { 1276 return getWavefrontSize() == 32; 1277 } 1278 isWave64()1279 bool isWave64() const { 1280 return getWavefrontSize() == 64; 1281 } 1282 getBoolRC()1283 const TargetRegisterClass *getBoolRC() const { 1284 return getRegisterInfo()->getBoolRC(); 1285 } 1286 1287 /// \returns Maximum number of work groups per compute unit supported by the 1288 /// subtarget and limited by given \p FlatWorkGroupSize. getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize)1289 unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override { 1290 return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(this, FlatWorkGroupSize); 1291 } 1292 1293 /// \returns Minimum flat work group size supported by the subtarget. getMinFlatWorkGroupSize()1294 unsigned getMinFlatWorkGroupSize() const override { 1295 return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(this); 1296 } 1297 1298 /// \returns Maximum flat work group size supported by the subtarget. getMaxFlatWorkGroupSize()1299 unsigned getMaxFlatWorkGroupSize() const override { 1300 return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this); 1301 } 1302 1303 /// \returns Number of waves per execution unit required to support the given 1304 /// \p FlatWorkGroupSize. 1305 unsigned getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize)1306 getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const override { 1307 return AMDGPU::IsaInfo::getWavesPerEUForWorkGroup(this, FlatWorkGroupSize); 1308 } 1309 1310 /// \returns Minimum number of waves per execution unit supported by the 1311 /// subtarget. 
getMinWavesPerEU()1312 unsigned getMinWavesPerEU() const override { 1313 return AMDGPU::IsaInfo::getMinWavesPerEU(this); 1314 } 1315 1316 void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, 1317 SDep &Dep) const override; 1318 1319 // \returns true if it's beneficial on this subtarget for the scheduler to 1320 // cluster stores as well as loads. shouldClusterStores()1321 bool shouldClusterStores() const { return getGeneration() >= GFX11; } 1322 1323 // \returns the number of address arguments from which to enable MIMG NSA 1324 // on supported architectures. 1325 unsigned getNSAThreshold(const MachineFunction &MF) const; 1326 }; 1327 1328 } // end namespace llvm 1329 1330 #endif // LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H 1331