1 //=====-- AMDGPUSubtarget.h - Define Subtarget for AMDGPU ------*- C++ -*-====// 2 // 3 // The LLVM Compiler Infrastructure 4 // 5 // This file is distributed under the University of Illinois Open Source 6 // License. See LICENSE.TXT for details. 7 // 8 //==-----------------------------------------------------------------------===// 9 // 10 /// \file 11 /// AMDGPU specific subclass of TargetSubtarget. 12 // 13 //===----------------------------------------------------------------------===// 14 15 #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H 16 #define LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H 17 18 #include "AMDGPU.h" 19 #include "AMDGPUCallLowering.h" 20 #include "R600FrameLowering.h" 21 #include "R600ISelLowering.h" 22 #include "R600InstrInfo.h" 23 #include "SIFrameLowering.h" 24 #include "SIISelLowering.h" 25 #include "SIInstrInfo.h" 26 #include "Utils/AMDGPUBaseInfo.h" 27 #include "llvm/ADT/Triple.h" 28 #include "llvm/CodeGen/GlobalISel/InstructionSelector.h" 29 #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" 30 #include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" 31 #include "llvm/CodeGen/MachineFunction.h" 32 #include "llvm/CodeGen/SelectionDAGTargetInfo.h" 33 #include "llvm/MC/MCInstrItineraries.h" 34 #include "llvm/Support/MathExtras.h" 35 #include <cassert> 36 #include <cstdint> 37 #include <memory> 38 #include <utility> 39 40 #define GET_SUBTARGETINFO_HEADER 41 #include "AMDGPUGenSubtargetInfo.inc" 42 #define GET_SUBTARGETINFO_HEADER 43 #include "R600GenSubtargetInfo.inc" 44 45 namespace llvm { 46 47 class StringRef; 48 49 class AMDGPUSubtarget { 50 public: 51 enum Generation { 52 R600 = 0, 53 R700 = 1, 54 EVERGREEN = 2, 55 NORTHERN_ISLANDS = 3, 56 SOUTHERN_ISLANDS = 4, 57 SEA_ISLANDS = 5, 58 VOLCANIC_ISLANDS = 6, 59 GFX9 = 7 60 }; 61 62 private: 63 Triple TargetTriple; 64 65 protected: 66 bool Has16BitInsts; 67 bool HasMadMixInsts; 68 bool FP32Denormals; 69 bool FPExceptions; 70 bool HasSDWA; 71 bool HasVOP3PInsts; 72 bool HasMulI24; 73 bool HasMulU24; 74 bool HasInv2PiInlineImm; 75 bool HasFminFmaxLegacy; 76 bool EnablePromoteAlloca; 77 bool HasTrigReducedRange; 78 int LocalMemorySize; 79 unsigned WavefrontSize; 80 81 public: 82 AMDGPUSubtarget(const Triple &TT); 83 84 static const AMDGPUSubtarget &get(const MachineFunction &MF); 85 static const AMDGPUSubtarget &get(const TargetMachine &TM, 86 const Function &F); 87 88 /// \returns Default range flat work group size for a calling convention. 89 std::pair<unsigned, unsigned> getDefaultFlatWorkGroupSize(CallingConv::ID CC) const; 90 91 /// \returns Subtarget's default pair of minimum/maximum flat work group sizes 92 /// for function \p F, or minimum/maximum flat work group sizes explicitly 93 /// requested using "amdgpu-flat-work-group-size" attribute attached to 94 /// function \p F. 95 /// 96 /// \returns Subtarget's default values if explicitly requested values cannot 97 /// be converted to integer, or violate subtarget's specifications. 98 std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) const; 99 100 /// \returns Subtarget's default pair of minimum/maximum number of waves per 101 /// execution unit for function \p F, or minimum/maximum number of waves per 102 /// execution unit explicitly requested using "amdgpu-waves-per-eu" attribute 103 /// attached to function \p F. 104 /// 105 /// \returns Subtarget's default values if explicitly requested values cannot 106 /// be converted to integer, violate subtarget's specifications, or are not 107 /// compatible with minimum/maximum number of waves limited by flat work group 108 /// size, register usage, and/or lds usage. 109 std::pair<unsigned, unsigned> getWavesPerEU(const Function &F) const; 110 111 /// Return the amount of LDS that can be used that will not restrict the 112 /// occupancy lower than WaveCount. 113 unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount, 114 const Function &) const; 115 116 /// Inverse of getMaxLocalMemWithWaveCount. Return the maximum wavecount if 117 /// the given LDS memory size is the only constraint. 118 unsigned getOccupancyWithLocalMemSize(uint32_t Bytes, const Function &) const; 119 120 unsigned getOccupancyWithLocalMemSize(const MachineFunction &MF) const; 121 isAmdHsaOS()122 bool isAmdHsaOS() const { 123 return TargetTriple.getOS() == Triple::AMDHSA; 124 } 125 isAmdPalOS()126 bool isAmdPalOS() const { 127 return TargetTriple.getOS() == Triple::AMDPAL; 128 } 129 isMesa3DOS()130 bool isMesa3DOS() const { 131 return TargetTriple.getOS() == Triple::Mesa3D; 132 } 133 isMesaKernel(const Function & F)134 bool isMesaKernel(const Function &F) const { 135 return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv()); 136 } 137 isAmdHsaOrMesa(const Function & F)138 bool isAmdHsaOrMesa(const Function &F) const { 139 return isAmdHsaOS() || isMesaKernel(F); 140 } 141 has16BitInsts()142 bool has16BitInsts() const { 143 return Has16BitInsts; 144 } 145 hasMadMixInsts()146 bool hasMadMixInsts() const { 147 return HasMadMixInsts; 148 } 149 hasFP32Denormals()150 bool hasFP32Denormals() const { 151 return FP32Denormals; 152 } 153 hasFPExceptions()154 bool hasFPExceptions() const { 155 return FPExceptions; 156 } 157 hasSDWA()158 bool hasSDWA() const { 159 return HasSDWA; 160 } 161 hasVOP3PInsts()162 bool hasVOP3PInsts() const { 163 return HasVOP3PInsts; 164 } 165 hasMulI24()166 bool hasMulI24() const { 167 return HasMulI24; 168 } 169 hasMulU24()170 bool hasMulU24() const { 171 return HasMulU24; 172 } 173 hasInv2PiInlineImm()174 bool hasInv2PiInlineImm() const { 175 return HasInv2PiInlineImm; 176 } 177 hasFminFmaxLegacy()178 bool hasFminFmaxLegacy() const { 179 return HasFminFmaxLegacy; 180 } 181 hasTrigReducedRange()182 bool hasTrigReducedRange() const { 183 return HasTrigReducedRange; 184 } 185 isPromoteAllocaEnabled()186 bool isPromoteAllocaEnabled() const { 187 return EnablePromoteAlloca; 188 } 189 getWavefrontSize()190 unsigned getWavefrontSize() const { 191 return WavefrontSize; 192 } 193 getLocalMemorySize()194 int getLocalMemorySize() const { 195 return LocalMemorySize; 196 } 197 getAlignmentForImplicitArgPtr()198 unsigned getAlignmentForImplicitArgPtr() const { 199 return isAmdHsaOS() ? 8 : 4; 200 } 201 202 /// Returns the offset in bytes from the start of the input buffer 203 /// of the first explicit kernel argument. getExplicitKernelArgOffset(const Function & F)204 unsigned getExplicitKernelArgOffset(const Function &F) const { 205 return isAmdHsaOrMesa(F) ? 0 : 36; 206 } 207 208 /// \returns Maximum number of work groups per compute unit supported by the 209 /// subtarget and limited by given \p FlatWorkGroupSize. 210 virtual unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const = 0; 211 212 /// \returns Minimum flat work group size supported by the subtarget. 213 virtual unsigned getMinFlatWorkGroupSize() const = 0; 214 215 /// \returns Maximum flat work group size supported by the subtarget. 216 virtual unsigned getMaxFlatWorkGroupSize() const = 0; 217 218 /// \returns Maximum number of waves per execution unit supported by the 219 /// subtarget and limited by given \p FlatWorkGroupSize. 220 virtual unsigned getMaxWavesPerEU(unsigned FlatWorkGroupSize) const = 0; 221 222 /// \returns Minimum number of waves per execution unit supported by the 223 /// subtarget. 224 virtual unsigned getMinWavesPerEU() const = 0; 225 getMaxWavesPerEU()226 unsigned getMaxWavesPerEU() const { return 10; } 227 228 /// Creates value range metadata on an workitemid.* inrinsic call or load. 229 bool makeLIDRangeMetadata(Instruction *I) const; 230 231 /// \returns Number of bytes of arguments that are passed to a shader or 232 /// kernel in addition to the explicit ones declared for the function. getImplicitArgNumBytes(const Function & F)233 unsigned getImplicitArgNumBytes(const Function &F) const { 234 if (isMesaKernel(F)) 235 return 16; 236 return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 0); 237 } 238 uint64_t getExplicitKernArgSize(const Function &F, 239 unsigned &MaxAlign) const; 240 unsigned getKernArgSegmentSize(const Function &F, 241 unsigned &MaxAlign) const; 242 ~AMDGPUSubtarget()243 virtual ~AMDGPUSubtarget() {} 244 }; 245 246 class GCNSubtarget : public AMDGPUGenSubtargetInfo, 247 public AMDGPUSubtarget { 248 public: 249 enum { 250 ISAVersion0_0_0, 251 ISAVersion6_0_0, 252 ISAVersion6_0_1, 253 ISAVersion7_0_0, 254 ISAVersion7_0_1, 255 ISAVersion7_0_2, 256 ISAVersion7_0_3, 257 ISAVersion7_0_4, 258 ISAVersion8_0_1, 259 ISAVersion8_0_2, 260 ISAVersion8_0_3, 261 ISAVersion8_1_0, 262 ISAVersion9_0_0, 263 ISAVersion9_0_2, 264 ISAVersion9_0_4, 265 ISAVersion9_0_6, 266 ISAVersion9_0_9, 267 }; 268 269 enum TrapHandlerAbi { 270 TrapHandlerAbiNone = 0, 271 TrapHandlerAbiHsa = 1 272 }; 273 274 enum TrapID { 275 TrapIDHardwareReserved = 0, 276 TrapIDHSADebugTrap = 1, 277 TrapIDLLVMTrap = 2, 278 TrapIDLLVMDebugTrap = 3, 279 TrapIDDebugBreakpoint = 7, 280 TrapIDDebugReserved8 = 8, 281 TrapIDDebugReservedFE = 0xfe, 282 TrapIDDebugReservedFF = 0xff 283 }; 284 285 enum TrapRegValues { 286 LLVMTrapHandlerRegValue = 1 287 }; 288 289 private: 290 /// GlobalISel related APIs. 291 std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo; 292 std::unique_ptr<InstructionSelector> InstSelector; 293 std::unique_ptr<LegalizerInfo> Legalizer; 294 std::unique_ptr<RegisterBankInfo> RegBankInfo; 295 296 protected: 297 // Basic subtarget description. 298 Triple TargetTriple; 299 unsigned Gen; 300 unsigned IsaVersion; 301 InstrItineraryData InstrItins; 302 int LDSBankCount; 303 unsigned MaxPrivateElementSize; 304 305 // Possibly statically set by tablegen, but may want to be overridden. 306 bool FastFMAF32; 307 bool HalfRate64Ops; 308 309 // Dynamially set bits that enable features. 310 bool FP64FP16Denormals; 311 bool DX10Clamp; 312 bool FlatForGlobal; 313 bool AutoWaitcntBeforeBarrier; 314 bool CodeObjectV3; 315 bool UnalignedScratchAccess; 316 bool UnalignedBufferAccess; 317 bool HasApertureRegs; 318 bool EnableXNACK; 319 bool TrapHandler; 320 bool DebuggerInsertNops; 321 bool DebuggerEmitPrologue; 322 323 // Used as options. 324 bool EnableHugePrivateBuffer; 325 bool EnableLoadStoreOpt; 326 bool EnableUnsafeDSOffsetFolding; 327 bool EnableSIScheduler; 328 bool EnableDS128; 329 bool EnablePRTStrictNull; 330 bool DumpCode; 331 332 // Subtarget statically properties set by tablegen 333 bool FP64; 334 bool FMA; 335 bool MIMG_R128; 336 bool IsGCN; 337 bool GCN3Encoding; 338 bool CIInsts; 339 bool VIInsts; 340 bool GFX9Insts; 341 bool SGPRInitBug; 342 bool HasSMemRealTime; 343 bool HasIntClamp; 344 bool HasFmaMixInsts; 345 bool HasMovrel; 346 bool HasVGPRIndexMode; 347 bool HasScalarStores; 348 bool HasScalarAtomics; 349 bool HasSDWAOmod; 350 bool HasSDWAScalar; 351 bool HasSDWASdst; 352 bool HasSDWAMac; 353 bool HasSDWAOutModsVOPC; 354 bool HasDPP; 355 bool HasR128A16; 356 bool HasDLInsts; 357 bool HasDotInsts; 358 bool EnableSRAMECC; 359 bool FlatAddressSpace; 360 bool FlatInstOffsets; 361 bool FlatGlobalInsts; 362 bool FlatScratchInsts; 363 bool AddNoCarryInsts; 364 bool HasUnpackedD16VMem; 365 bool R600ALUInst; 366 bool CaymanISA; 367 bool CFALUBug; 368 bool HasVertexCache; 369 short TexVTXClauseSize; 370 bool ScalarizeGlobal; 371 372 // Dummy feature to use for assembler in tablegen. 373 bool FeatureDisable; 374 375 SelectionDAGTargetInfo TSInfo; 376 private: 377 SIInstrInfo InstrInfo; 378 SITargetLowering TLInfo; 379 SIFrameLowering FrameLowering; 380 381 public: 382 GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, 383 const GCNTargetMachine &TM); 384 ~GCNSubtarget() override; 385 386 GCNSubtarget &initializeSubtargetDependencies(const Triple &TT, 387 StringRef GPU, StringRef FS); 388 getInstrInfo()389 const SIInstrInfo *getInstrInfo() const override { 390 return &InstrInfo; 391 } 392 getFrameLowering()393 const SIFrameLowering *getFrameLowering() const override { 394 return &FrameLowering; 395 } 396 getTargetLowering()397 const SITargetLowering *getTargetLowering() const override { 398 return &TLInfo; 399 } 400 getRegisterInfo()401 const SIRegisterInfo *getRegisterInfo() const override { 402 return &InstrInfo.getRegisterInfo(); 403 } 404 getCallLowering()405 const CallLowering *getCallLowering() const override { 406 return CallLoweringInfo.get(); 407 } 408 getInstructionSelector()409 const InstructionSelector *getInstructionSelector() const override { 410 return InstSelector.get(); 411 } 412 getLegalizerInfo()413 const LegalizerInfo *getLegalizerInfo() const override { 414 return Legalizer.get(); 415 } 416 getRegBankInfo()417 const RegisterBankInfo *getRegBankInfo() const override { 418 return RegBankInfo.get(); 419 } 420 421 // Nothing implemented, just prevent crashes on use. getSelectionDAGInfo()422 const SelectionDAGTargetInfo *getSelectionDAGInfo() const override { 423 return &TSInfo; 424 } 425 getInstrItineraryData()426 const InstrItineraryData *getInstrItineraryData() const override { 427 return &InstrItins; 428 } 429 430 void ParseSubtargetFeatures(StringRef CPU, StringRef FS); 431 getGeneration()432 Generation getGeneration() const { 433 return (Generation)Gen; 434 } 435 getWavefrontSizeLog2()436 unsigned getWavefrontSizeLog2() const { 437 return Log2_32(WavefrontSize); 438 } 439 getLDSBankCount()440 int getLDSBankCount() const { 441 return LDSBankCount; 442 } 443 getMaxPrivateElementSize()444 unsigned getMaxPrivateElementSize() const { 445 return MaxPrivateElementSize; 446 } 447 hasIntClamp()448 bool hasIntClamp() const { 449 return HasIntClamp; 450 } 451 hasFP64()452 bool hasFP64() const { 453 return FP64; 454 } 455 hasMIMG_R128()456 bool hasMIMG_R128() const { 457 return MIMG_R128; 458 } 459 hasHWFP64()460 bool hasHWFP64() const { 461 return FP64; 462 } 463 hasFastFMAF32()464 bool hasFastFMAF32() const { 465 return FastFMAF32; 466 } 467 hasHalfRate64Ops()468 bool hasHalfRate64Ops() const { 469 return HalfRate64Ops; 470 } 471 hasAddr64()472 bool hasAddr64() const { 473 return (getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS); 474 } 475 hasBFE()476 bool hasBFE() const { 477 return true; 478 } 479 hasBFI()480 bool hasBFI() const { 481 return true; 482 } 483 hasBFM()484 bool hasBFM() const { 485 return hasBFE(); 486 } 487 hasBCNT(unsigned Size)488 bool hasBCNT(unsigned Size) const { 489 return true; 490 } 491 hasFFBL()492 bool hasFFBL() const { 493 return true; 494 } 495 hasFFBH()496 bool hasFFBH() const { 497 return true; 498 } 499 hasMed3_16()500 bool hasMed3_16() const { 501 return getGeneration() >= AMDGPUSubtarget::GFX9; 502 } 503 hasMin3Max3_16()504 bool hasMin3Max3_16() const { 505 return getGeneration() >= AMDGPUSubtarget::GFX9; 506 } 507 hasFmaMixInsts()508 bool hasFmaMixInsts() const { 509 return HasFmaMixInsts; 510 } 511 hasCARRY()512 bool hasCARRY() const { 513 return true; 514 } 515 hasFMA()516 bool hasFMA() const { 517 return FMA; 518 } 519 hasSwap()520 bool hasSwap() const { 521 return GFX9Insts; 522 } 523 getTrapHandlerAbi()524 TrapHandlerAbi getTrapHandlerAbi() const { 525 return isAmdHsaOS() ? TrapHandlerAbiHsa : TrapHandlerAbiNone; 526 } 527 enableHugePrivateBuffer()528 bool enableHugePrivateBuffer() const { 529 return EnableHugePrivateBuffer; 530 } 531 unsafeDSOffsetFoldingEnabled()532 bool unsafeDSOffsetFoldingEnabled() const { 533 return EnableUnsafeDSOffsetFolding; 534 } 535 dumpCode()536 bool dumpCode() const { 537 return DumpCode; 538 } 539 540 /// Return the amount of LDS that can be used that will not restrict the 541 /// occupancy lower than WaveCount. 542 unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount, 543 const Function &) const; 544 hasFP16Denormals()545 bool hasFP16Denormals() const { 546 return FP64FP16Denormals; 547 } 548 hasFP64Denormals()549 bool hasFP64Denormals() const { 550 return FP64FP16Denormals; 551 } 552 supportsMinMaxDenormModes()553 bool supportsMinMaxDenormModes() const { 554 return getGeneration() >= AMDGPUSubtarget::GFX9; 555 } 556 enableDX10Clamp()557 bool enableDX10Clamp() const { 558 return DX10Clamp; 559 } 560 enableIEEEBit(const MachineFunction & MF)561 bool enableIEEEBit(const MachineFunction &MF) const { 562 return AMDGPU::isCompute(MF.getFunction().getCallingConv()); 563 } 564 useFlatForGlobal()565 bool useFlatForGlobal() const { 566 return FlatForGlobal; 567 } 568 569 /// \returns If target supports ds_read/write_b128 and user enables generation 570 /// of ds_read/write_b128. useDS128()571 bool useDS128() const { 572 return CIInsts && EnableDS128; 573 } 574 575 /// \returns If MUBUF instructions always perform range checking, even for 576 /// buffer resources used for private memory access. privateMemoryResourceIsRangeChecked()577 bool privateMemoryResourceIsRangeChecked() const { 578 return getGeneration() < AMDGPUSubtarget::GFX9; 579 } 580 581 /// \returns If target requires PRT Struct NULL support (zero result registers 582 /// for sparse texture support). usePRTStrictNull()583 bool usePRTStrictNull() const { 584 return EnablePRTStrictNull; 585 } 586 hasAutoWaitcntBeforeBarrier()587 bool hasAutoWaitcntBeforeBarrier() const { 588 return AutoWaitcntBeforeBarrier; 589 } 590 hasCodeObjectV3()591 bool hasCodeObjectV3() const { 592 // FIXME: Need to add code object v3 support for mesa and pal. 593 return isAmdHsaOS() ? CodeObjectV3 : false; 594 } 595 hasUnalignedBufferAccess()596 bool hasUnalignedBufferAccess() const { 597 return UnalignedBufferAccess; 598 } 599 hasUnalignedScratchAccess()600 bool hasUnalignedScratchAccess() const { 601 return UnalignedScratchAccess; 602 } 603 hasApertureRegs()604 bool hasApertureRegs() const { 605 return HasApertureRegs; 606 } 607 isTrapHandlerEnabled()608 bool isTrapHandlerEnabled() const { 609 return TrapHandler; 610 } 611 isXNACKEnabled()612 bool isXNACKEnabled() const { 613 return EnableXNACK; 614 } 615 hasFlatAddressSpace()616 bool hasFlatAddressSpace() const { 617 return FlatAddressSpace; 618 } 619 hasFlatInstOffsets()620 bool hasFlatInstOffsets() const { 621 return FlatInstOffsets; 622 } 623 hasFlatGlobalInsts()624 bool hasFlatGlobalInsts() const { 625 return FlatGlobalInsts; 626 } 627 hasFlatScratchInsts()628 bool hasFlatScratchInsts() const { 629 return FlatScratchInsts; 630 } 631 hasFlatLgkmVMemCountInOrder()632 bool hasFlatLgkmVMemCountInOrder() const { 633 return getGeneration() > GFX9; 634 } 635 hasD16LoadStore()636 bool hasD16LoadStore() const { 637 return getGeneration() >= GFX9; 638 } 639 640 /// Return if most LDS instructions have an m0 use that require m0 to be 641 /// iniitalized. ldsRequiresM0Init()642 bool ldsRequiresM0Init() const { 643 return getGeneration() < GFX9; 644 } 645 hasAddNoCarry()646 bool hasAddNoCarry() const { 647 return AddNoCarryInsts; 648 } 649 hasUnpackedD16VMem()650 bool hasUnpackedD16VMem() const { 651 return HasUnpackedD16VMem; 652 } 653 654 // Covers VS/PS/CS graphics shaders isMesaGfxShader(const Function & F)655 bool isMesaGfxShader(const Function &F) const { 656 return isMesa3DOS() && AMDGPU::isShader(F.getCallingConv()); 657 } 658 hasMad64_32()659 bool hasMad64_32() const { 660 return getGeneration() >= SEA_ISLANDS; 661 } 662 hasSDWAOmod()663 bool hasSDWAOmod() const { 664 return HasSDWAOmod; 665 } 666 hasSDWAScalar()667 bool hasSDWAScalar() const { 668 return HasSDWAScalar; 669 } 670 hasSDWASdst()671 bool hasSDWASdst() const { 672 return HasSDWASdst; 673 } 674 hasSDWAMac()675 bool hasSDWAMac() const { 676 return HasSDWAMac; 677 } 678 hasSDWAOutModsVOPC()679 bool hasSDWAOutModsVOPC() const { 680 return HasSDWAOutModsVOPC; 681 } 682 vmemWriteNeedsExpWaitcnt()683 bool vmemWriteNeedsExpWaitcnt() const { 684 return getGeneration() < SEA_ISLANDS; 685 } 686 hasDLInsts()687 bool hasDLInsts() const { 688 return HasDLInsts; 689 } 690 hasDotInsts()691 bool hasDotInsts() const { 692 return HasDotInsts; 693 } 694 isSRAMECCEnabled()695 bool isSRAMECCEnabled() const { 696 return EnableSRAMECC; 697 } 698 699 // Scratch is allocated in 256 dword per wave blocks for the entire 700 // wavefront. When viewed from the perspecive of an arbitrary workitem, this 701 // is 4-byte aligned. 702 // 703 // Only 4-byte alignment is really needed to access anything. Transformations 704 // on the pointer value itself may rely on the alignment / known low bits of 705 // the pointer. Set this to something above the minimum to avoid needing 706 // dynamic realignment in common cases. getStackAlignment()707 unsigned getStackAlignment() const { 708 return 16; 709 } 710 enableMachineScheduler()711 bool enableMachineScheduler() const override { 712 return true; 713 } 714 enableSubRegLiveness()715 bool enableSubRegLiveness() const override { 716 return true; 717 } 718 setScalarizeGlobalBehavior(bool b)719 void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b; } getScalarizeGlobalBehavior()720 bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal; } 721 722 /// \returns Number of execution units per compute unit supported by the 723 /// subtarget. getEUsPerCU()724 unsigned getEUsPerCU() const { 725 return AMDGPU::IsaInfo::getEUsPerCU(this); 726 } 727 728 /// \returns Maximum number of waves per compute unit supported by the 729 /// subtarget without any kind of limitation. getMaxWavesPerCU()730 unsigned getMaxWavesPerCU() const { 731 return AMDGPU::IsaInfo::getMaxWavesPerCU(this); 732 } 733 734 /// \returns Maximum number of waves per compute unit supported by the 735 /// subtarget and limited by given \p FlatWorkGroupSize. getMaxWavesPerCU(unsigned FlatWorkGroupSize)736 unsigned getMaxWavesPerCU(unsigned FlatWorkGroupSize) const { 737 return AMDGPU::IsaInfo::getMaxWavesPerCU(this, FlatWorkGroupSize); 738 } 739 740 /// \returns Maximum number of waves per execution unit supported by the 741 /// subtarget without any kind of limitation. getMaxWavesPerEU()742 unsigned getMaxWavesPerEU() const { 743 return AMDGPU::IsaInfo::getMaxWavesPerEU(); 744 } 745 746 /// \returns Number of waves per work group supported by the subtarget and 747 /// limited by given \p FlatWorkGroupSize. getWavesPerWorkGroup(unsigned FlatWorkGroupSize)748 unsigned getWavesPerWorkGroup(unsigned FlatWorkGroupSize) const { 749 return AMDGPU::IsaInfo::getWavesPerWorkGroup(this, FlatWorkGroupSize); 750 } 751 752 // static wrappers 753 static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI); 754 755 // XXX - Why is this here if it isn't in the default pass set? enableEarlyIfConversion()756 bool enableEarlyIfConversion() const override { 757 return true; 758 } 759 760 void overrideSchedPolicy(MachineSchedPolicy &Policy, 761 unsigned NumRegionInstrs) const override; 762 getMaxNumUserSGPRs()763 unsigned getMaxNumUserSGPRs() const { 764 return 16; 765 } 766 hasSMemRealTime()767 bool hasSMemRealTime() const { 768 return HasSMemRealTime; 769 } 770 hasMovrel()771 bool hasMovrel() const { 772 return HasMovrel; 773 } 774 hasVGPRIndexMode()775 bool hasVGPRIndexMode() const { 776 return HasVGPRIndexMode; 777 } 778 useVGPRIndexMode(bool UserEnable)779 bool useVGPRIndexMode(bool UserEnable) const { 780 return !hasMovrel() || (UserEnable && hasVGPRIndexMode()); 781 } 782 hasScalarCompareEq64()783 bool hasScalarCompareEq64() const { 784 return getGeneration() >= VOLCANIC_ISLANDS; 785 } 786 hasScalarStores()787 bool hasScalarStores() const { 788 return HasScalarStores; 789 } 790 hasScalarAtomics()791 bool hasScalarAtomics() const { 792 return HasScalarAtomics; 793 } 794 795 hasDPP()796 bool hasDPP() const { 797 return HasDPP; 798 } 799 hasR128A16()800 bool hasR128A16() const { 801 return HasR128A16; 802 } 803 enableSIScheduler()804 bool enableSIScheduler() const { 805 return EnableSIScheduler; 806 } 807 debuggerSupported()808 bool debuggerSupported() const { 809 return debuggerInsertNops() && debuggerEmitPrologue(); 810 } 811 debuggerInsertNops()812 bool debuggerInsertNops() const { 813 return DebuggerInsertNops; 814 } 815 debuggerEmitPrologue()816 bool debuggerEmitPrologue() const { 817 return DebuggerEmitPrologue; 818 } 819 loadStoreOptEnabled()820 bool loadStoreOptEnabled() const { 821 return EnableLoadStoreOpt; 822 } 823 hasSGPRInitBug()824 bool hasSGPRInitBug() const { 825 return SGPRInitBug; 826 } 827 has12DWordStoreHazard()828 bool has12DWordStoreHazard() const { 829 return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS; 830 } 831 832 // \returns true if the subtarget supports DWORDX3 load/store instructions. hasDwordx3LoadStores()833 bool hasDwordx3LoadStores() const { 834 return CIInsts; 835 } 836 hasSMovFedHazard()837 bool hasSMovFedHazard() const { 838 return getGeneration() >= AMDGPUSubtarget::GFX9; 839 } 840 hasReadM0MovRelInterpHazard()841 bool hasReadM0MovRelInterpHazard() const { 842 return getGeneration() >= AMDGPUSubtarget::GFX9; 843 } 844 hasReadM0SendMsgHazard()845 bool hasReadM0SendMsgHazard() const { 846 return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS; 847 } 848 849 /// Return the maximum number of waves per SIMD for kernels using \p SGPRs 850 /// SGPRs 851 unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const; 852 853 /// Return the maximum number of waves per SIMD for kernels using \p VGPRs 854 /// VGPRs 855 unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const; 856 857 /// \returns true if the flat_scratch register should be initialized with the 858 /// pointer to the wave's scratch memory rather than a size and offset. flatScratchIsPointer()859 bool flatScratchIsPointer() const { 860 return getGeneration() >= AMDGPUSubtarget::GFX9; 861 } 862 863 /// \returns true if the machine has merged shaders in which s0-s7 are 864 /// reserved by the hardware and user SGPRs start at s8 hasMergedShaders()865 bool hasMergedShaders() const { 866 return getGeneration() >= GFX9; 867 } 868 869 /// \returns SGPR allocation granularity supported by the subtarget. getSGPRAllocGranule()870 unsigned getSGPRAllocGranule() const { 871 return AMDGPU::IsaInfo::getSGPRAllocGranule(this); 872 } 873 874 /// \returns SGPR encoding granularity supported by the subtarget. getSGPREncodingGranule()875 unsigned getSGPREncodingGranule() const { 876 return AMDGPU::IsaInfo::getSGPREncodingGranule(this); 877 } 878 879 /// \returns Total number of SGPRs supported by the subtarget. getTotalNumSGPRs()880 unsigned getTotalNumSGPRs() const { 881 return AMDGPU::IsaInfo::getTotalNumSGPRs(this); 882 } 883 884 /// \returns Addressable number of SGPRs supported by the subtarget. getAddressableNumSGPRs()885 unsigned getAddressableNumSGPRs() const { 886 return AMDGPU::IsaInfo::getAddressableNumSGPRs(this); 887 } 888 889 /// \returns Minimum number of SGPRs that meets the given number of waves per 890 /// execution unit requirement supported by the subtarget. getMinNumSGPRs(unsigned WavesPerEU)891 unsigned getMinNumSGPRs(unsigned WavesPerEU) const { 892 return AMDGPU::IsaInfo::getMinNumSGPRs(this, WavesPerEU); 893 } 894 895 /// \returns Maximum number of SGPRs that meets the given number of waves per 896 /// execution unit requirement supported by the subtarget. getMaxNumSGPRs(unsigned WavesPerEU,bool Addressable)897 unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const { 898 return AMDGPU::IsaInfo::getMaxNumSGPRs(this, WavesPerEU, Addressable); 899 } 900 901 /// \returns Reserved number of SGPRs for given function \p MF. 902 unsigned getReservedNumSGPRs(const MachineFunction &MF) const; 903 904 /// \returns Maximum number of SGPRs that meets number of waves per execution 905 /// unit requirement for function \p MF, or number of SGPRs explicitly 906 /// requested using "amdgpu-num-sgpr" attribute attached to function \p MF. 907 /// 908 /// \returns Value that meets number of waves per execution unit requirement 909 /// if explicitly requested value cannot be converted to integer, violates 910 /// subtarget's specifications, or does not meet number of waves per execution 911 /// unit requirement. 912 unsigned getMaxNumSGPRs(const MachineFunction &MF) const; 913 914 /// \returns VGPR allocation granularity supported by the subtarget. getVGPRAllocGranule()915 unsigned getVGPRAllocGranule() const { 916 return AMDGPU::IsaInfo::getVGPRAllocGranule(this); 917 } 918 919 /// \returns VGPR encoding granularity supported by the subtarget. getVGPREncodingGranule()920 unsigned getVGPREncodingGranule() const { 921 return AMDGPU::IsaInfo::getVGPREncodingGranule(this); 922 } 923 924 /// \returns Total number of VGPRs supported by the subtarget. getTotalNumVGPRs()925 unsigned getTotalNumVGPRs() const { 926 return AMDGPU::IsaInfo::getTotalNumVGPRs(this); 927 } 928 929 /// \returns Addressable number of VGPRs supported by the subtarget. getAddressableNumVGPRs()930 unsigned getAddressableNumVGPRs() const { 931 return AMDGPU::IsaInfo::getAddressableNumVGPRs(this); 932 } 933 934 /// \returns Minimum number of VGPRs that meets given number of waves per 935 /// execution unit requirement supported by the subtarget. getMinNumVGPRs(unsigned WavesPerEU)936 unsigned getMinNumVGPRs(unsigned WavesPerEU) const { 937 return AMDGPU::IsaInfo::getMinNumVGPRs(this, WavesPerEU); 938 } 939 940 /// \returns Maximum number of VGPRs that meets given number of waves per 941 /// execution unit requirement supported by the subtarget. getMaxNumVGPRs(unsigned WavesPerEU)942 unsigned getMaxNumVGPRs(unsigned WavesPerEU) const { 943 return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU); 944 } 945 946 /// \returns Maximum number of VGPRs that meets number of waves per execution 947 /// unit requirement for function \p MF, or number of VGPRs explicitly 948 /// requested using "amdgpu-num-vgpr" attribute attached to function \p MF. 949 /// 950 /// \returns Value that meets number of waves per execution unit requirement 951 /// if explicitly requested value cannot be converted to integer, violates 952 /// subtarget's specifications, or does not meet number of waves per execution 953 /// unit requirement. 954 unsigned getMaxNumVGPRs(const MachineFunction &MF) const; 955 956 void getPostRAMutations( 957 std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) 958 const override; 959 960 /// \returns Maximum number of work groups per compute unit supported by the 961 /// subtarget and limited by given \p FlatWorkGroupSize. getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize)962 unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override { 963 return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(this, FlatWorkGroupSize); 964 } 965 966 /// \returns Minimum flat work group size supported by the subtarget. getMinFlatWorkGroupSize()967 unsigned getMinFlatWorkGroupSize() const override { 968 return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(this); 969 } 970 971 /// \returns Maximum flat work group size supported by the subtarget. getMaxFlatWorkGroupSize()972 unsigned getMaxFlatWorkGroupSize() const override { 973 return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this); 974 } 975 976 /// \returns Maximum number of waves per execution unit supported by the 977 /// subtarget and limited by given \p FlatWorkGroupSize. getMaxWavesPerEU(unsigned FlatWorkGroupSize)978 unsigned getMaxWavesPerEU(unsigned FlatWorkGroupSize) const override { 979 return AMDGPU::IsaInfo::getMaxWavesPerEU(this, FlatWorkGroupSize); 980 } 981 982 /// \returns Minimum number of waves per execution unit supported by the 983 /// subtarget. getMinWavesPerEU()984 unsigned getMinWavesPerEU() const override { 985 return AMDGPU::IsaInfo::getMinWavesPerEU(this); 986 } 987 }; 988 989 class R600Subtarget final : public R600GenSubtargetInfo, 990 public AMDGPUSubtarget { 991 private: 992 R600InstrInfo InstrInfo; 993 R600FrameLowering FrameLowering; 994 bool FMA; 995 bool CaymanISA; 996 bool CFALUBug; 997 bool DX10Clamp; 998 bool HasVertexCache; 999 bool R600ALUInst; 1000 bool FP64; 1001 short TexVTXClauseSize; 1002 Generation Gen; 1003 R600TargetLowering TLInfo; 1004 InstrItineraryData InstrItins; 1005 SelectionDAGTargetInfo TSInfo; 1006 1007 public: 1008 R600Subtarget(const Triple &TT, StringRef CPU, StringRef FS, 1009 const TargetMachine &TM); 1010 getInstrInfo()1011 const R600InstrInfo *getInstrInfo() const override { return &InstrInfo; } 1012 getFrameLowering()1013 const R600FrameLowering *getFrameLowering() const override { 1014 return &FrameLowering; 1015 } 1016 getTargetLowering()1017 const R600TargetLowering *getTargetLowering() const override { 1018 return &TLInfo; 1019 } 1020 getRegisterInfo()1021 const R600RegisterInfo *getRegisterInfo() const override { 1022 return &InstrInfo.getRegisterInfo(); 1023 } 1024 getInstrItineraryData()1025 const InstrItineraryData *getInstrItineraryData() const override { 1026 return &InstrItins; 1027 } 1028 1029 // Nothing implemented, just prevent crashes on use. getSelectionDAGInfo()1030 const SelectionDAGTargetInfo *getSelectionDAGInfo() const override { 1031 return &TSInfo; 1032 } 1033 1034 void ParseSubtargetFeatures(StringRef CPU, StringRef FS); 1035 getGeneration()1036 Generation getGeneration() const { 1037 return Gen; 1038 } 1039 getStackAlignment()1040 unsigned getStackAlignment() const { 1041 return 4; 1042 } 1043 1044 R600Subtarget &initializeSubtargetDependencies(const Triple &TT, 1045 StringRef GPU, StringRef FS); 1046 hasBFE()1047 bool hasBFE() const { 1048 return (getGeneration() >= EVERGREEN); 1049 } 1050 hasBFI()1051 bool hasBFI() const { 1052 return (getGeneration() >= EVERGREEN); 1053 } 1054 hasBCNT(unsigned Size)1055 bool hasBCNT(unsigned Size) const { 1056 if (Size == 32) 1057 return (getGeneration() >= EVERGREEN); 1058 1059 return false; 1060 } 1061 hasBORROW()1062 bool hasBORROW() const { 1063 return (getGeneration() >= EVERGREEN); 1064 } 1065 hasCARRY()1066 bool hasCARRY() const { 1067 return (getGeneration() >= EVERGREEN); 1068 } 1069 hasCaymanISA()1070 bool hasCaymanISA() const { 1071 return CaymanISA; 1072 } 1073 hasFFBL()1074 bool hasFFBL() const { 1075 return (getGeneration() >= EVERGREEN); 1076 } 1077 hasFFBH()1078 bool hasFFBH() const { 1079 return (getGeneration() >= EVERGREEN); 1080 } 1081 hasFMA()1082 bool hasFMA() const { return FMA; } 1083 hasCFAluBug()1084 bool hasCFAluBug() const { return CFALUBug; } 1085 hasVertexCache()1086 bool hasVertexCache() const { return HasVertexCache; } 1087 getTexVTXClauseSize()1088 short getTexVTXClauseSize() const { return TexVTXClauseSize; } 1089 enableMachineScheduler()1090 bool enableMachineScheduler() const override { 1091 return true; 1092 } 1093 enableSubRegLiveness()1094 bool enableSubRegLiveness() const override { 1095 return true; 1096 } 1097 1098 /// \returns Maximum number of work groups per compute unit supported by the 1099 /// subtarget and limited by given \p FlatWorkGroupSize. getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize)1100 unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override { 1101 return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(this, FlatWorkGroupSize); 1102 } 1103 1104 /// \returns Minimum flat work group size supported by the subtarget. getMinFlatWorkGroupSize()1105 unsigned getMinFlatWorkGroupSize() const override { 1106 return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(this); 1107 } 1108 1109 /// \returns Maximum flat work group size supported by the subtarget. getMaxFlatWorkGroupSize()1110 unsigned getMaxFlatWorkGroupSize() const override { 1111 return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this); 1112 } 1113 1114 /// \returns Maximum number of waves per execution unit supported by the 1115 /// subtarget and limited by given \p FlatWorkGroupSize. getMaxWavesPerEU(unsigned FlatWorkGroupSize)1116 unsigned getMaxWavesPerEU(unsigned FlatWorkGroupSize) const override { 1117 return AMDGPU::IsaInfo::getMaxWavesPerEU(this, FlatWorkGroupSize); 1118 } 1119 1120 /// \returns Minimum number of waves per execution unit supported by the 1121 /// subtarget. getMinWavesPerEU()1122 unsigned getMinWavesPerEU() const override { 1123 return AMDGPU::IsaInfo::getMinWavesPerEU(this); 1124 } 1125 }; 1126 1127 } // end namespace llvm 1128 1129 #endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H 1130