//=====-- GCNSubtarget.h - Define GCN Subtarget for AMDGPU ------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//==-----------------------------------------------------------------------===//
//
/// \file
/// AMD GCN specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
#define LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H

#include "AMDGPUCallLowering.h"
#include "AMDGPUSubtarget.h"
#include "SIFrameLowering.h"
#include "SIISelLowering.h"
#include "SIInstrInfo.h"
#include "llvm/CodeGen/SelectionDAGTargetInfo.h"

namespace llvm {

class MCInst;
class MCInstrInfo;

} // namespace llvm

// Pull in the declaration of AMDGPUGenSubtargetInfo from the generated include.
#define GET_SUBTARGETINFO_HEADER
#include "AMDGPUGenSubtargetInfo.inc"

namespace llvm {

class GCNTargetMachine;

/// Subtarget description for AMD GCN targets.
///
/// Derives from both AMDGPUGenSubtargetInfo and AMDGPUSubtarget. It stores the
/// feature flags declared below and holds the SI instruction info, target
/// lowering and frame lowering objects together with the GlobalISel helpers.
class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
                           public AMDGPUSubtarget {

// This using-declaration sits in the implicit private section of the class, so
// the inherited getMaxWavesPerEU name is private when accessed through
// GCNSubtarget.
using AMDGPUSubtarget::getMaxWavesPerEU;

public:
// The following two enums are documented at:
//   - https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
enum class TrapHandlerAbi {
  NONE = 0x00,
  AMDHSA = 0x01,
};

enum class TrapID {
  LLVMAMDHSATrap = 0x02,
  LLVMAMDHSADebugTrap = 0x03,
};

private:
/// GlobalISel related APIs. Each object is exposed through the matching
/// get*() accessor, which returns the raw pointer held here.
std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
std::unique_ptr<InlineAsmLowering> InlineAsmLoweringInfo;
std::unique_ptr<InstructionSelector> InstSelector;
std::unique_ptr<LegalizerInfo> Legalizer;
std::unique_ptr<RegisterBankInfo> RegBankInfo;

protected:
// Basic subtarget description.
Triple TargetTriple;
// Queried by isXNACKEnabled() and d16PreservesUnusedBits(); per the notes on
// EnableXNACK and EnableSRAMECC it tracks the dynamic XNACK/SRAMECC settings.
AMDGPU::IsaInfo::AMDGPUTargetID TargetID;
// Raw generation value; getGeneration() converts it to the Generation enum.
unsigned Gen;
InstrItineraryData InstrItins;
int LDSBankCount;
// Returned by getMaxPrivateElementSize() when ForBufferRSrc is set or
// enableFlatScratch() is false; otherwise that accessor returns 16.
unsigned MaxPrivateElementSize;

// Possibly statically set by tablegen, but may want to be overridden.
bool FastFMAF32;
bool FastDenormalF32;
bool HalfRate64Ops;
bool FullRate64Ops;

// Dynamically set bits that enable features.
bool FlatForGlobal;
bool AutoWaitcntBeforeBarrier;
bool UnalignedScratchAccess;
bool UnalignedAccessMode;
bool HasApertureRegs;
bool SupportsXNACK;

// This should not be used directly. 'TargetID' tracks the dynamic settings
// for XNACK.
bool EnableXNACK;

bool EnableTgSplit;
bool EnableCuMode;
bool TrapHandler;

// Used as options.
bool EnableLoadStoreOpt;
bool EnableUnsafeDSOffsetFolding;
bool EnableSIScheduler;
bool EnableDS128;
bool EnablePRTStrictNull;
bool DumpCode;

// Static subtarget properties set by tablegen.
bool FP64;
bool FMA;
bool MIMG_R128;
bool IsGCN;
bool CIInsts;
bool GFX8Insts;
bool GFX9Insts;
bool GFX90AInsts;
bool GFX10Insts;
bool GFX10_3Insts;
bool GFX7GFX8GFX9Insts;
bool SGPRInitBug;
bool NegativeScratchOffsetBug;
bool NegativeUnalignedScratchOffsetBug;
bool HasSMemRealTime;
bool HasIntClamp;
bool HasFmaMixInsts;
bool HasMovrel;
bool HasVGPRIndexMode;
bool HasScalarStores;
bool HasScalarAtomics;
bool HasSDWAOmod;
bool HasSDWAScalar;
bool HasSDWASdst;
bool HasSDWAMac;
bool HasSDWAOutModsVOPC;
bool HasDPP;
bool HasDPP8;
bool Has64BitDPP;
bool HasPackedFP32Ops;
bool HasExtendedImageInsts;
bool HasR128A16;
bool HasGFX10A16;
bool HasG16;
bool HasNSAEncoding;
bool GFX10_BEncoding;
bool HasDLInsts;
bool HasDot1Insts;
bool HasDot2Insts;
bool HasDot3Insts;
bool HasDot4Insts;
bool HasDot5Insts;
bool HasDot6Insts;
bool HasDot7Insts;
bool HasMAIInsts;
bool HasPkFmacF16Inst;
bool HasAtomicFaddInsts;
bool SupportsSRAMECC;

// This should not be used directly. 'TargetID' tracks the dynamic settings
// for SRAMECC.
bool EnableSRAMECC;

bool HasNoSdstCMPX;
bool HasVscnt;
bool HasGetWaveIdInst;
bool HasSMemTimeInst;
bool HasShaderCyclesRegister;
bool HasRegisterBanking;
bool HasVOP3Literal;
bool HasNoDataDepHazard;
bool FlatAddressSpace;
bool FlatInstOffsets;
bool FlatGlobalInsts;
bool FlatScratchInsts;
bool ScalarFlatScratchInsts;
bool HasArchitectedFlatScratch;
bool AddNoCarryInsts;
bool HasUnpackedD16VMem;
bool R600ALUInst;
bool CaymanISA;
bool CFALUBug;
// Reported by hasLDSMisalignedBug() only while EnableCuMode is false.
bool LDSMisalignedBug;
bool HasMFMAInlineLiteralBug;
bool HasVertexCache;
short TexVTXClauseSize;
bool UnalignedBufferAccess;
bool UnalignedDSAccess;
bool HasPackedTID;
// Read and written through get/setScalarizeGlobalBehavior().
bool ScalarizeGlobal;

// Hazard and hardware-bug flags; each one is exposed by the has*() accessor
// of the same name.
bool HasVcmpxPermlaneHazard;
bool HasVMEMtoScalarWriteHazard;
bool HasSMEMtoVectorWriteHazard;
bool HasInstFwdPrefetchBug;
bool HasVcmpxExecWARHazard;
bool HasLdsBranchVmemWARHazard;
bool HasNSAtoVMEMBug;
bool HasNSAClauseBug;
bool HasOffset3fBug;
bool HasFlatSegmentOffsetBug;
bool HasImageStoreD16Bug;
bool HasImageGather4D16Bug;

// Dummy feature to use for assembler in tablegen.
bool FeatureDisable;

// Returned by getSelectionDAGInfo().
SelectionDAGTargetInfo TSInfo;
private:
// Non-static members are initialized in declaration order, so InstrInfo is
// constructed before TLInfo and FrameLowering. Keep this order unless the
// constructor is reviewed as well.
SIInstrInfo InstrInfo;
SITargetLowering TLInfo;
SIFrameLowering FrameLowering;

public:
// See COMPUTE_TMPRING_SIZE.WAVESIZE, 13-bit field in units of 256-dword.
// 256 dwords of 4 bytes each, multiplied by the largest 13-bit value (8191).
static const unsigned MaxWaveScratchSize = (256 * 4) * ((1 << 13) - 1);

// Constructor and destructor are declared here and defined out of line.
GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
             const GCNTargetMachine &TM);
~GCNSubtarget() override;

// Declared here, defined out of line; returns a reference to a GCNSubtarget.
GCNSubtarget &initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS);

/// \returns the address of the SIInstrInfo member of this subtarget.
const SIInstrInfo *getInstrInfo() const override { return &InstrInfo; }

/// \returns the address of the SIFrameLowering member of this subtarget.
const SIFrameLowering *getFrameLowering() const override {
  return &FrameLowering;
}

/// \returns the address of the SITargetLowering member of this subtarget.
const SITargetLowering *getTargetLowering() const override { return &TLInfo; }

/// \returns the register info object obtained from the instruction info.
const SIRegisterInfo *getRegisterInfo() const override {
  return &InstrInfo.getRegisterInfo();
}

// The GlobalISel accessors below return the raw pointer held by the matching
// std::unique_ptr member; ownership stays with the subtarget.
const CallLowering *getCallLowering() const override {
  return CallLoweringInfo.get();
}

const InlineAsmLowering *getInlineAsmLowering() const override {
  return InlineAsmLoweringInfo.get();
}

InstructionSelector *getInstructionSelector() const override {
  return InstSelector.get();
}

const LegalizerInfo *getLegalizerInfo() const override {
  return Legalizer.get();
}

const RegisterBankInfo *getRegBankInfo() const override {
  return RegBankInfo.get();
}

/// \returns a reference to the target ID object stored in this subtarget.
const AMDGPU::IsaInfo::AMDGPUTargetID &getTargetID() const {
  return TargetID;
}

// Nothing implemented, just prevent crashes on use.
getSelectionDAGInfo()259 const SelectionDAGTargetInfo *getSelectionDAGInfo() const override { 260 return &TSInfo; 261 } 262 getInstrItineraryData()263 const InstrItineraryData *getInstrItineraryData() const override { 264 return &InstrItins; 265 } 266 267 void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS); 268 getGeneration()269 Generation getGeneration() const { 270 return (Generation)Gen; 271 } 272 273 /// Return the number of high bits known to be zero fror a frame index. getKnownHighZeroBitsForFrameIndex()274 unsigned getKnownHighZeroBitsForFrameIndex() const { 275 return countLeadingZeros(MaxWaveScratchSize) + getWavefrontSizeLog2(); 276 } 277 getLDSBankCount()278 int getLDSBankCount() const { 279 return LDSBankCount; 280 } 281 282 unsigned getMaxPrivateElementSize(bool ForBufferRSrc = false) const { 283 return (ForBufferRSrc || !enableFlatScratch()) ? MaxPrivateElementSize : 16; 284 } 285 286 unsigned getConstantBusLimit(unsigned Opcode) const; 287 hasIntClamp()288 bool hasIntClamp() const { 289 return HasIntClamp; 290 } 291 hasFP64()292 bool hasFP64() const { 293 return FP64; 294 } 295 hasMIMG_R128()296 bool hasMIMG_R128() const { 297 return MIMG_R128; 298 } 299 hasHWFP64()300 bool hasHWFP64() const { 301 return FP64; 302 } 303 hasFastFMAF32()304 bool hasFastFMAF32() const { 305 return FastFMAF32; 306 } 307 hasHalfRate64Ops()308 bool hasHalfRate64Ops() const { 309 return HalfRate64Ops; 310 } 311 hasFullRate64Ops()312 bool hasFullRate64Ops() const { 313 return FullRate64Ops; 314 } 315 hasAddr64()316 bool hasAddr64() const { 317 return (getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS); 318 } 319 hasFlat()320 bool hasFlat() const { 321 return (getGeneration() > AMDGPUSubtarget::SOUTHERN_ISLANDS); 322 } 323 324 // Return true if the target only has the reverse operand versions of VALU 325 // shift instructions (e.g. v_lshrrev_b32, and no v_lshr_b32). 
hasOnlyRevVALUShifts()326 bool hasOnlyRevVALUShifts() const { 327 return getGeneration() >= VOLCANIC_ISLANDS; 328 } 329 hasFractBug()330 bool hasFractBug() const { 331 return getGeneration() == SOUTHERN_ISLANDS; 332 } 333 hasBFE()334 bool hasBFE() const { 335 return true; 336 } 337 hasBFI()338 bool hasBFI() const { 339 return true; 340 } 341 hasBFM()342 bool hasBFM() const { 343 return hasBFE(); 344 } 345 hasBCNT(unsigned Size)346 bool hasBCNT(unsigned Size) const { 347 return true; 348 } 349 hasFFBL()350 bool hasFFBL() const { 351 return true; 352 } 353 hasFFBH()354 bool hasFFBH() const { 355 return true; 356 } 357 hasMed3_16()358 bool hasMed3_16() const { 359 return getGeneration() >= AMDGPUSubtarget::GFX9; 360 } 361 hasMin3Max3_16()362 bool hasMin3Max3_16() const { 363 return getGeneration() >= AMDGPUSubtarget::GFX9; 364 } 365 hasFmaMixInsts()366 bool hasFmaMixInsts() const { 367 return HasFmaMixInsts; 368 } 369 hasCARRY()370 bool hasCARRY() const { 371 return true; 372 } 373 hasFMA()374 bool hasFMA() const { 375 return FMA; 376 } 377 hasSwap()378 bool hasSwap() const { 379 return GFX9Insts; 380 } 381 hasScalarPackInsts()382 bool hasScalarPackInsts() const { 383 return GFX9Insts; 384 } 385 hasScalarMulHiInsts()386 bool hasScalarMulHiInsts() const { 387 return GFX9Insts; 388 } 389 getTrapHandlerAbi()390 TrapHandlerAbi getTrapHandlerAbi() const { 391 return isAmdHsaOS() ? TrapHandlerAbi::AMDHSA : TrapHandlerAbi::NONE; 392 } 393 supportsGetDoorbellID()394 bool supportsGetDoorbellID() const { 395 // The S_GETREG DOORBELL_ID is supported by all GFX9 onward targets. 396 return getGeneration() >= GFX9; 397 } 398 399 /// True if the offset field of DS instructions works as expected. On SI, the 400 /// offset uses a 16-bit adder and does not always wrap properly. 
hasUsableDSOffset()401 bool hasUsableDSOffset() const { 402 return getGeneration() >= SEA_ISLANDS; 403 } 404 unsafeDSOffsetFoldingEnabled()405 bool unsafeDSOffsetFoldingEnabled() const { 406 return EnableUnsafeDSOffsetFolding; 407 } 408 409 /// Condition output from div_scale is usable. hasUsableDivScaleConditionOutput()410 bool hasUsableDivScaleConditionOutput() const { 411 return getGeneration() != SOUTHERN_ISLANDS; 412 } 413 414 /// Extra wait hazard is needed in some cases before 415 /// s_cbranch_vccnz/s_cbranch_vccz. hasReadVCCZBug()416 bool hasReadVCCZBug() const { 417 return getGeneration() <= SEA_ISLANDS; 418 } 419 420 /// Writes to VCC_LO/VCC_HI update the VCCZ flag. partialVCCWritesUpdateVCCZ()421 bool partialVCCWritesUpdateVCCZ() const { 422 return getGeneration() >= GFX10; 423 } 424 425 /// A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR 426 /// was written by a VALU instruction. hasSMRDReadVALUDefHazard()427 bool hasSMRDReadVALUDefHazard() const { 428 return getGeneration() == SOUTHERN_ISLANDS; 429 } 430 431 /// A read of an SGPR by a VMEM instruction requires 5 wait states when the 432 /// SGPR was written by a VALU Instruction. hasVMEMReadSGPRVALUDefHazard()433 bool hasVMEMReadSGPRVALUDefHazard() const { 434 return getGeneration() >= VOLCANIC_ISLANDS; 435 } 436 hasRFEHazards()437 bool hasRFEHazards() const { 438 return getGeneration() >= VOLCANIC_ISLANDS; 439 } 440 441 /// Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32. getSetRegWaitStates()442 unsigned getSetRegWaitStates() const { 443 return getGeneration() <= SEA_ISLANDS ? 1 : 2; 444 } 445 dumpCode()446 bool dumpCode() const { 447 return DumpCode; 448 } 449 450 /// Return the amount of LDS that can be used that will not restrict the 451 /// occupancy lower than WaveCount. 
452 unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount, 453 const Function &) const; 454 supportsMinMaxDenormModes()455 bool supportsMinMaxDenormModes() const { 456 return getGeneration() >= AMDGPUSubtarget::GFX9; 457 } 458 459 /// \returns If target supports S_DENORM_MODE. hasDenormModeInst()460 bool hasDenormModeInst() const { 461 return getGeneration() >= AMDGPUSubtarget::GFX10; 462 } 463 useFlatForGlobal()464 bool useFlatForGlobal() const { 465 return FlatForGlobal; 466 } 467 468 /// \returns If target supports ds_read/write_b128 and user enables generation 469 /// of ds_read/write_b128. useDS128()470 bool useDS128() const { 471 return CIInsts && EnableDS128; 472 } 473 474 /// \return If target supports ds_read/write_b96/128. hasDS96AndDS128()475 bool hasDS96AndDS128() const { 476 return CIInsts; 477 } 478 479 /// Have v_trunc_f64, v_ceil_f64, v_rndne_f64 haveRoundOpsF64()480 bool haveRoundOpsF64() const { 481 return CIInsts; 482 } 483 484 /// \returns If MUBUF instructions always perform range checking, even for 485 /// buffer resources used for private memory access. privateMemoryResourceIsRangeChecked()486 bool privateMemoryResourceIsRangeChecked() const { 487 return getGeneration() < AMDGPUSubtarget::GFX9; 488 } 489 490 /// \returns If target requires PRT Struct NULL support (zero result registers 491 /// for sparse texture support). 
bool usePRTStrictNull() const { return EnablePRTStrictNull; }

// The accessors below return the stored flag of the same name unless noted.
bool hasAutoWaitcntBeforeBarrier() const { return AutoWaitcntBeforeBarrier; }

bool hasUnalignedBufferAccess() const { return UnalignedBufferAccess; }

/// \returns true only if both UnalignedBufferAccess and UnalignedAccessMode
/// are set.
bool hasUnalignedBufferAccessEnabled() const {
  return UnalignedBufferAccess && UnalignedAccessMode;
}

bool hasUnalignedDSAccess() const { return UnalignedDSAccess; }

/// \returns true only if both UnalignedDSAccess and UnalignedAccessMode are
/// set.
bool hasUnalignedDSAccessEnabled() const {
  return UnalignedDSAccess && UnalignedAccessMode;
}

bool hasUnalignedScratchAccess() const { return UnalignedScratchAccess; }

bool hasUnalignedAccessMode() const { return UnalignedAccessMode; }

bool hasApertureRegs() const { return HasApertureRegs; }

bool isTrapHandlerEnabled() const { return TrapHandler; }

// Answered by TargetID rather than by the EnableXNACK member.
bool isXNACKEnabled() const { return TargetID.isXnackOnOrAny(); }

bool isTgSplitEnabled() const { return EnableTgSplit; }

bool isCuModeEnabled() const { return EnableCuMode; }

bool hasFlatAddressSpace() const { return FlatAddressSpace; }

// Mirrors hasFlatAddressSpace().
bool hasFlatScrRegister() const { return hasFlatAddressSpace(); }

bool hasFlatInstOffsets() const { return FlatInstOffsets; }

bool hasFlatGlobalInsts() const { return FlatGlobalInsts; }

bool hasFlatScratchInsts() const { return FlatScratchInsts; }

// Check if target supports ST addressing mode with FLAT scratch instructions.
// The ST addressing mode means no registers are used, either VGPR or SGPR,
// but only immediate offset is swizzled and added to the FLAT scratch base.
bool hasFlatScratchSTMode() const {
  return hasFlatScratchInsts() && hasGFX10_3Insts();
}

bool hasScalarFlatScratchInsts() const { return ScalarFlatScratchInsts; }

// The next two queries both report the GFX10_BEncoding flag.
bool hasGlobalAddTidInsts() const { return GFX10_BEncoding; }

bool hasAtomicCSub() const { return GFX10_BEncoding; }

/// \returns true from generation GFX9 onwards.
bool hasMultiDwordFlatScratchAddressing() const {
  return getGeneration() >= GFX9;
}

bool hasFlatSegmentOffsetBug() const { return HasFlatSegmentOffsetBug; }

/// \returns true for generations after GFX9.
bool hasFlatLgkmVMemCountInOrder() const { return getGeneration() > GFX9; }

/// \returns true from generation GFX9 onwards.
bool hasD16LoadStore() const { return getGeneration() >= GFX9; }

/// \returns true if D16 loads/stores are available and TargetID does not
/// report SRAMECC as "on" or "any".
bool d16PreservesUnusedBits() const {
  return hasD16LoadStore() && !TargetID.isSramEccOnOrAny();
}

/// \returns true from generation VOLCANIC_ISLANDS onwards.
bool hasD16Images() const { return getGeneration() >= VOLCANIC_ISLANDS; }

/// Return if most LDS instructions have an m0 use that require m0 to be
/// initialized.
bool ldsRequiresM0Init() const { return getGeneration() < GFX9; }

// True if the hardware rewinds and replays GWS operations if a wave is
// preempted.
//
// If this is false, a GWS operation requires testing if a nack set the
// MEM_VIOL bit, and repeating if so.
bool hasGWSAutoReplay() const { return getGeneration() >= GFX9; }

/// \returns if target has ds_gws_sema_release_all instruction.
bool hasGWSSemaReleaseAll() const { return CIInsts; }

/// \returns true if the target has integer add/sub instructions that do not
/// produce a carry-out. This includes v_add_[iu]32, v_sub_[iu]32,
/// v_add_[iu]16, and v_sub_[iu]16, all of which support the clamp modifier
/// for saturation.
bool hasAddNoCarry() const { return AddNoCarryInsts; }

bool hasUnpackedD16VMem() const { return HasUnpackedD16VMem; }

// Covers VS/PS/CS graphics shaders
bool isMesaGfxShader(const Function &F) const {
  return isMesa3DOS() && AMDGPU::isShader(F.getCallingConv());
}

/// \returns true from generation SEA_ISLANDS onwards.
bool hasMad64_32() const { return getGeneration() >= SEA_ISLANDS; }

// Each accessor below returns the stored feature flag of the same name.
bool hasSDWAOmod() const { return HasSDWAOmod; }

bool hasSDWAScalar() const { return HasSDWAScalar; }

bool hasSDWASdst() const { return HasSDWASdst; }

bool hasSDWAMac() const { return HasSDWAMac; }

bool hasSDWAOutModsVOPC() const { return HasSDWAOutModsVOPC; }

bool hasDLInsts() const { return HasDLInsts; }

bool hasDot1Insts() const { return HasDot1Insts; }

bool hasDot2Insts() const { return HasDot2Insts; }

bool hasDot3Insts() const { return HasDot3Insts; }

bool hasDot4Insts() const { return HasDot4Insts; }

bool hasDot5Insts() const { return HasDot5Insts; }

bool hasDot6Insts() const { return HasDot6Insts; }

bool hasDot7Insts() const { return HasDot7Insts; }

bool hasMAIInsts() const { return HasMAIInsts; }

bool hasPkFmacF16Inst() const { return HasPkFmacF16Inst; }

bool hasAtomicFaddInsts() const { return HasAtomicFaddInsts; }

bool hasNoSdstCMPX() const { return HasNoSdstCMPX; }

bool hasVscnt() const { return HasVscnt; }

bool hasGetWaveIdInst() const { return HasGetWaveIdInst; }

bool hasSMemTimeInst() const { return HasSMemTimeInst; }

bool hasShaderCyclesRegister() const { return HasShaderCyclesRegister; }

bool hasRegisterBanking() const { return HasRegisterBanking; }

bool hasVOP3Literal() const { return HasVOP3Literal; }

bool hasNoDataDepHazard() const { return HasNoDataDepHazard; }

/// \returns true for generations before SEA_ISLANDS.
bool vmemWriteNeedsExpWaitcnt() const {
  return getGeneration() < SEA_ISLANDS;
}

// Scratch is allocated in 256 dword per wave blocks for the entire
// wavefront. When viewed from the perspective of an arbitrary workitem, this
// is 4-byte aligned.
//
// Only 4-byte alignment is really needed to access anything. Transformations
// on the pointer value itself may rely on the alignment / known low bits of
// the pointer. Set this to something above the minimum to avoid needing
// dynamic realignment in common cases.
Align getStackAlignment() const { return Align(16); }

// Always enabled for this subtarget.
bool enableMachineScheduler() const override { return true; }

bool useAA() const override;

// Always enabled for this subtarget.
bool enableSubRegLiveness() const override { return true; }

// Setter/getter pair for the ScalarizeGlobal flag.
void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b; }
bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal; }

// static wrappers
static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI);

// XXX - Why is this here if it isn't in the default pass set?
bool enableEarlyIfConversion() const override { return true; }

// Declared here; the definition is not part of this header.
bool enableFlatScratch() const;

void overrideSchedPolicy(MachineSchedPolicy &Policy,
                         unsigned NumRegionInstrs) const override;

/// \returns the constant 16.
unsigned getMaxNumUserSGPRs() const { return 16; }

bool hasSMemRealTime() const { return HasSMemRealTime; }

bool hasMovrel() const { return HasMovrel; }

bool hasVGPRIndexMode() const { return HasVGPRIndexMode; }

bool useVGPRIndexMode() const;

/// \returns true from generation VOLCANIC_ISLANDS onwards.
bool hasScalarCompareEq64() const {
  return getGeneration() >= VOLCANIC_ISLANDS;
}

bool hasScalarStores() const { return HasScalarStores; }

bool hasScalarAtomics() const { return HasScalarAtomics; }

// Reports the GFX8Insts flag.
bool hasLDSFPAtomics() const { return GFX8Insts; }

/// \returns true if the subtarget has the v_permlanex16_b32 instruction.
bool hasPermLaneX16() const { return getGeneration() >= GFX10; }

bool hasDPP() const { return HasDPP; }

/// \returns true if HasDPP is set and the generation is before GFX10.
bool hasDPPBroadcasts() const { return HasDPP && getGeneration() < GFX10; }

/// \returns true if HasDPP is set and the generation is before GFX10.
bool hasDPPWavefrontShifts() const {
  return HasDPP && getGeneration() < GFX10;
}

bool hasDPP8() const { return HasDPP8; }

bool has64BitDPP() const { return Has64BitDPP; }

bool hasPackedFP32Ops() const { return HasPackedFP32Ops; }

/// \returns true from generation GFX10 onwards.
bool hasFmaakFmamkF32Insts() const { return getGeneration() >= GFX10; }

bool hasExtendedImageInsts() const { return HasExtendedImageInsts; }

bool hasR128A16() const { return HasR128A16; }

bool hasGFX10A16() const { return HasGFX10A16; }

/// \returns true if either hasR128A16() or hasGFX10A16() holds.
bool hasA16() const { return hasR128A16() || hasGFX10A16(); }

bool hasG16() const { return HasG16; }

bool hasOffset3fBug() const { return HasOffset3fBug; }

bool hasImageStoreD16Bug() const { return HasImageStoreD16Bug; }

bool hasImageGather4D16Bug() const { return HasImageGather4D16Bug; }

bool hasNSAEncoding() const { return HasNSAEncoding; }

bool hasGFX10_BEncoding() const { return GFX10_BEncoding; }

bool hasGFX10_3Insts() const { return GFX10_3Insts; }

// Declared here; the definition is not part of this header.
bool hasMadF16() const;

bool enableSIScheduler() const { return EnableSIScheduler; }

bool loadStoreOptEnabled() const { return EnableLoadStoreOpt; }

bool hasSGPRInitBug() const { return SGPRInitBug; }
hasNegativeScratchOffsetBug()897 bool hasNegativeScratchOffsetBug() const { return NegativeScratchOffsetBug; } 898 hasNegativeUnalignedScratchOffsetBug()899 bool hasNegativeUnalignedScratchOffsetBug() const { 900 return NegativeUnalignedScratchOffsetBug; 901 } 902 hasMFMAInlineLiteralBug()903 bool hasMFMAInlineLiteralBug() const { 904 return HasMFMAInlineLiteralBug; 905 } 906 has12DWordStoreHazard()907 bool has12DWordStoreHazard() const { 908 return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS; 909 } 910 911 // \returns true if the subtarget supports DWORDX3 load/store instructions. hasDwordx3LoadStores()912 bool hasDwordx3LoadStores() const { 913 return CIInsts; 914 } 915 hasReadM0MovRelInterpHazard()916 bool hasReadM0MovRelInterpHazard() const { 917 return getGeneration() == AMDGPUSubtarget::GFX9; 918 } 919 hasReadM0SendMsgHazard()920 bool hasReadM0SendMsgHazard() const { 921 return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS && 922 getGeneration() <= AMDGPUSubtarget::GFX9; 923 } 924 hasVcmpxPermlaneHazard()925 bool hasVcmpxPermlaneHazard() const { 926 return HasVcmpxPermlaneHazard; 927 } 928 hasVMEMtoScalarWriteHazard()929 bool hasVMEMtoScalarWriteHazard() const { 930 return HasVMEMtoScalarWriteHazard; 931 } 932 hasSMEMtoVectorWriteHazard()933 bool hasSMEMtoVectorWriteHazard() const { 934 return HasSMEMtoVectorWriteHazard; 935 } 936 hasLDSMisalignedBug()937 bool hasLDSMisalignedBug() const { 938 return LDSMisalignedBug && !EnableCuMode; 939 } 940 hasInstFwdPrefetchBug()941 bool hasInstFwdPrefetchBug() const { 942 return HasInstFwdPrefetchBug; 943 } 944 hasVcmpxExecWARHazard()945 bool hasVcmpxExecWARHazard() const { 946 return HasVcmpxExecWARHazard; 947 } 948 hasLdsBranchVmemWARHazard()949 bool hasLdsBranchVmemWARHazard() const { 950 return HasLdsBranchVmemWARHazard; 951 } 952 hasNSAtoVMEMBug()953 bool hasNSAtoVMEMBug() const { 954 return HasNSAtoVMEMBug; 955 } 956 hasNSAClauseBug()957 bool hasNSAClauseBug() const { return HasNSAClauseBug; } 958 
hasHardClauses()959 bool hasHardClauses() const { return getGeneration() >= GFX10; } 960 hasGFX90AInsts()961 bool hasGFX90AInsts() const { return GFX90AInsts; } 962 963 /// Return if operations acting on VGPR tuples require even alignment. needsAlignedVGPRs()964 bool needsAlignedVGPRs() const { return GFX90AInsts; } 965 hasPackedTID()966 bool hasPackedTID() const { return HasPackedTID; } 967 968 /// Return the maximum number of waves per SIMD for kernels using \p SGPRs 969 /// SGPRs 970 unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const; 971 972 /// Return the maximum number of waves per SIMD for kernels using \p VGPRs 973 /// VGPRs 974 unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const; 975 976 /// Return occupancy for the given function. Used LDS and a number of 977 /// registers if provided. 978 /// Note, occupancy can be affected by the scratch allocation as well, but 979 /// we do not have enough information to compute it. 980 unsigned computeOccupancy(const Function &F, unsigned LDSSize = 0, 981 unsigned NumSGPRs = 0, unsigned NumVGPRs = 0) const; 982 983 /// \returns true if the flat_scratch register should be initialized with the 984 /// pointer to the wave's scratch memory rather than a size and offset. flatScratchIsPointer()985 bool flatScratchIsPointer() const { 986 return getGeneration() >= AMDGPUSubtarget::GFX9; 987 } 988 989 /// \returns true if the flat_scratch register is initialized by the HW. 990 /// In this case it is readonly. flatScratchIsArchitected()991 bool flatScratchIsArchitected() const { return HasArchitectedFlatScratch; } 992 993 /// \returns true if the machine has merged shaders in which s0-s7 are 994 /// reserved by the hardware and user SGPRs start at s8 hasMergedShaders()995 bool hasMergedShaders() const { 996 return getGeneration() >= GFX9; 997 } 998 999 /// \returns SGPR allocation granularity supported by the subtarget. 
// The SGPR queries defined inline below forward to the AMDGPU::IsaInfo helper
// of the same name, passing this subtarget as the first argument.
unsigned getSGPRAllocGranule() const {
  return AMDGPU::IsaInfo::getSGPRAllocGranule(this);
}

/// \returns SGPR encoding granularity supported by the subtarget.
unsigned getSGPREncodingGranule() const {
  return AMDGPU::IsaInfo::getSGPREncodingGranule(this);
}

/// \returns Total number of SGPRs supported by the subtarget.
unsigned getTotalNumSGPRs() const {
  return AMDGPU::IsaInfo::getTotalNumSGPRs(this);
}

/// \returns Addressable number of SGPRs supported by the subtarget.
unsigned getAddressableNumSGPRs() const {
  return AMDGPU::IsaInfo::getAddressableNumSGPRs(this);
}

/// \returns Minimum number of SGPRs that meets the given number of waves per
/// execution unit requirement supported by the subtarget.
unsigned getMinNumSGPRs(unsigned WavesPerEU) const {
  return AMDGPU::IsaInfo::getMinNumSGPRs(this, WavesPerEU);
}

/// \returns Maximum number of SGPRs that meets the given number of waves per
/// execution unit requirement supported by the subtarget.
unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const {
  return AMDGPU::IsaInfo::getMaxNumSGPRs(this, WavesPerEU, Addressable);
}

/// \returns Reserved number of SGPRs for given function \p MF.
unsigned getReservedNumSGPRs(const MachineFunction &MF) const;

/// \returns Maximum number of SGPRs that meets number of waves per execution
/// unit requirement for function \p MF, or number of SGPRs explicitly
/// requested using "amdgpu-num-sgpr" attribute attached to function \p MF.
///
/// \returns Value that meets number of waves per execution unit requirement
/// if explicitly requested value cannot be converted to integer, violates
/// subtarget's specifications, or does not meet number of waves per execution
/// unit requirement.
unsigned getMaxNumSGPRs(const MachineFunction &MF) const;

// The VGPR queries defined inline below forward to the AMDGPU::IsaInfo helper
// of the same name, passing this subtarget as the first argument.

/// \returns VGPR allocation granularity supported by the subtarget.
unsigned getVGPRAllocGranule() const {
  return AMDGPU::IsaInfo::getVGPRAllocGranule(this);
}

/// \returns VGPR encoding granularity supported by the subtarget.
unsigned getVGPREncodingGranule() const {
  return AMDGPU::IsaInfo::getVGPREncodingGranule(this);
}

/// \returns Total number of VGPRs supported by the subtarget.
unsigned getTotalNumVGPRs() const {
  return AMDGPU::IsaInfo::getTotalNumVGPRs(this);
}

/// \returns Addressable number of VGPRs supported by the subtarget.
unsigned getAddressableNumVGPRs() const {
  return AMDGPU::IsaInfo::getAddressableNumVGPRs(this);
}

/// \returns Minimum number of VGPRs that meets given number of waves per
/// execution unit requirement supported by the subtarget.
unsigned getMinNumVGPRs(unsigned WavesPerEU) const {
  return AMDGPU::IsaInfo::getMinNumVGPRs(this, WavesPerEU);
}

/// \returns Maximum number of VGPRs that meets given number of waves per
/// execution unit requirement supported by the subtarget.
unsigned getMaxNumVGPRs(unsigned WavesPerEU) const {
  return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU);
}

/// \returns Maximum number of VGPRs that meets number of waves per execution
/// unit requirement for function \p MF, or number of VGPRs explicitly
/// requested using "amdgpu-num-vgpr" attribute attached to function \p MF.
///
/// \returns Value that meets number of waves per execution unit requirement
/// if explicitly requested value cannot be converted to integer, violates
/// subtarget's specifications, or does not meet number of waves per execution
/// unit requirement.
unsigned getMaxNumVGPRs(const MachineFunction &MF) const;

// Declared here; the definition is not part of this header.
void getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations)
    const override;

/// \returns true if getWavefrontSize() is exactly 32.
bool isWave32() const { return getWavefrontSize() == 32; }

/// \returns true if getWavefrontSize() is exactly 64.
bool isWave64() const { return getWavefrontSize() == 64; }

/// \returns the register class reported by the register info's getBoolRC().
const TargetRegisterClass *getBoolRC() const {
  return getRegisterInfo()->getBoolRC();
}

/// \returns Maximum number of work groups per compute unit supported by the
/// subtarget and limited by given \p FlatWorkGroupSize.
unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override {
  return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(this, FlatWorkGroupSize);
}

/// \returns Minimum flat work group size supported by the subtarget.
unsigned getMinFlatWorkGroupSize() const override {
  return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(this);
}

/// \returns Maximum flat work group size supported by the subtarget.
// Like the other overrides in this group, forwards to the AMDGPU::IsaInfo
// helper of the same name with this subtarget as the first argument.
unsigned getMaxFlatWorkGroupSize() const override {
  return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this);
}

/// \returns Number of waves per execution unit required to support the given
/// \p FlatWorkGroupSize.
unsigned
getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const override {
  return AMDGPU::IsaInfo::getWavesPerEUForWorkGroup(this, FlatWorkGroupSize);
}

/// \returns Minimum number of waves per execution unit supported by the
/// subtarget.
unsigned getMinWavesPerEU() const override {
  return AMDGPU::IsaInfo::getMinWavesPerEU(this);
}

// Declared here; the definition is not part of this header.
void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx,
                           SDep &Dep) const override;
};

} // end namespace llvm

#endif // LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H