1 //=====-- GCNSubtarget.h - Define GCN Subtarget for AMDGPU ------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //==-----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// AMD GCN specific subclass of TargetSubtarget.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #ifndef LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
15 #define LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
16 
17 #include "AMDGPUCallLowering.h"
18 #include "AMDGPUSubtarget.h"
19 #include "SIFrameLowering.h"
20 #include "SIISelLowering.h"
21 #include "SIInstrInfo.h"
22 #include "llvm/CodeGen/SelectionDAGTargetInfo.h"
23 
24 namespace llvm {
25 
26 class MCInst;
27 class MCInstrInfo;
28 
29 } // namespace llvm
30 
31 #define GET_SUBTARGETINFO_HEADER
32 #include "AMDGPUGenSubtargetInfo.inc"
33 
34 namespace llvm {
35 
36 class GCNTargetMachine;
37 
38 class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
39                            public AMDGPUSubtarget {
40 
41   using AMDGPUSubtarget::getMaxWavesPerEU;
42 
43 public:
44   // Following 2 enums are documented at:
45   //   - https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
/// Trap handler ABI used by the subtarget.
enum class TrapHandlerAbi {
  NONE = 0x00,   ///< No trap handler ABI.
  AMDHSA = 0x01, ///< AMDHSA trap handler ABI.
};
50 
/// Trap IDs for the LLVM trap and the LLVM debug trap (AMDHSA naming).
enum class TrapID {
  LLVMAMDHSATrap = 0x02,
  LLVMAMDHSADebugTrap = 0x03,
};
55 
56 private:
57   /// GlobalISel related APIs.
58   std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
59   std::unique_ptr<InlineAsmLowering> InlineAsmLoweringInfo;
60   std::unique_ptr<InstructionSelector> InstSelector;
61   std::unique_ptr<LegalizerInfo> Legalizer;
62   std::unique_ptr<RegisterBankInfo> RegBankInfo;
63 
64 protected:
65   // Basic subtarget description.
66   Triple TargetTriple;
67   AMDGPU::IsaInfo::AMDGPUTargetID TargetID;
68   unsigned Gen;
69   InstrItineraryData InstrItins;
70   int LDSBankCount;
71   unsigned MaxPrivateElementSize;
72 
73   // Possibly statically set by tablegen, but may want to be overridden.
74   bool FastFMAF32;
75   bool FastDenormalF32;
76   bool HalfRate64Ops;
77   bool FullRate64Ops;
78 
79   // Dynamically set bits that enable features.
80   bool FlatForGlobal;
81   bool AutoWaitcntBeforeBarrier;
82   bool UnalignedScratchAccess;
83   bool UnalignedAccessMode;
84   bool HasApertureRegs;
85   bool SupportsXNACK;
86 
87   // This should not be used directly. 'TargetID' tracks the dynamic settings
88   // for XNACK.
89   bool EnableXNACK;
90 
91   bool EnableTgSplit;
92   bool EnableCuMode;
93   bool TrapHandler;
94 
95   // Used as options.
96   bool EnableLoadStoreOpt;
97   bool EnableUnsafeDSOffsetFolding;
98   bool EnableSIScheduler;
99   bool EnableDS128;
100   bool EnablePRTStrictNull;
101   bool DumpCode;
102 
103   // Subtarget properties statically set by tablegen
104   bool FP64;
105   bool FMA;
106   bool MIMG_R128;
107   bool IsGCN;
108   bool CIInsts;
109   bool GFX8Insts;
110   bool GFX9Insts;
111   bool GFX90AInsts;
112   bool GFX10Insts;
113   bool GFX10_3Insts;
114   bool GFX7GFX8GFX9Insts;
115   bool SGPRInitBug;
116   bool NegativeScratchOffsetBug;
117   bool NegativeUnalignedScratchOffsetBug;
118   bool HasSMemRealTime;
119   bool HasIntClamp;
120   bool HasFmaMixInsts;
121   bool HasMovrel;
122   bool HasVGPRIndexMode;
123   bool HasScalarStores;
124   bool HasScalarAtomics;
125   bool HasSDWAOmod;
126   bool HasSDWAScalar;
127   bool HasSDWASdst;
128   bool HasSDWAMac;
129   bool HasSDWAOutModsVOPC;
130   bool HasDPP;
131   bool HasDPP8;
132   bool Has64BitDPP;
133   bool HasPackedFP32Ops;
134   bool HasExtendedImageInsts;
135   bool HasR128A16;
136   bool HasGFX10A16;
137   bool HasG16;
138   bool HasNSAEncoding;
139   bool GFX10_BEncoding;
140   bool HasDLInsts;
141   bool HasDot1Insts;
142   bool HasDot2Insts;
143   bool HasDot3Insts;
144   bool HasDot4Insts;
145   bool HasDot5Insts;
146   bool HasDot6Insts;
147   bool HasDot7Insts;
148   bool HasMAIInsts;
149   bool HasPkFmacF16Inst;
150   bool HasAtomicFaddInsts;
151   bool SupportsSRAMECC;
152 
153   // This should not be used directly. 'TargetID' tracks the dynamic settings
154   // for SRAMECC.
155   bool EnableSRAMECC;
156 
157   bool HasNoSdstCMPX;
158   bool HasVscnt;
159   bool HasGetWaveIdInst;
160   bool HasSMemTimeInst;
161   bool HasShaderCyclesRegister;
162   bool HasRegisterBanking;
163   bool HasVOP3Literal;
164   bool HasNoDataDepHazard;
165   bool FlatAddressSpace;
166   bool FlatInstOffsets;
167   bool FlatGlobalInsts;
168   bool FlatScratchInsts;
169   bool ScalarFlatScratchInsts;
170   bool HasArchitectedFlatScratch;
171   bool AddNoCarryInsts;
172   bool HasUnpackedD16VMem;
173   bool R600ALUInst;
174   bool CaymanISA;
175   bool CFALUBug;
176   bool LDSMisalignedBug;
177   bool HasMFMAInlineLiteralBug;
178   bool HasVertexCache;
179   short TexVTXClauseSize;
180   bool UnalignedBufferAccess;
181   bool UnalignedDSAccess;
182   bool HasPackedTID;
183   bool ScalarizeGlobal;
184 
185   bool HasVcmpxPermlaneHazard;
186   bool HasVMEMtoScalarWriteHazard;
187   bool HasSMEMtoVectorWriteHazard;
188   bool HasInstFwdPrefetchBug;
189   bool HasVcmpxExecWARHazard;
190   bool HasLdsBranchVmemWARHazard;
191   bool HasNSAtoVMEMBug;
192   bool HasNSAClauseBug;
193   bool HasOffset3fBug;
194   bool HasFlatSegmentOffsetBug;
195   bool HasImageStoreD16Bug;
196   bool HasImageGather4D16Bug;
197 
198   // Dummy feature to use for assembler in tablegen.
199   bool FeatureDisable;
200 
201   SelectionDAGTargetInfo TSInfo;
202 private:
203   SIInstrInfo InstrInfo;
204   SITargetLowering TLInfo;
205   SIFrameLowering FrameLowering;
206 
207 public:
208   // See COMPUTE_TMPRING_SIZE.WAVESIZE, 13-bit field in units of 256-dword.
209   static const unsigned MaxWaveScratchSize = (256 * 4) * ((1 << 13) - 1);
210 
211   GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
212                const GCNTargetMachine &TM);
213   ~GCNSubtarget() override;
214 
215   GCNSubtarget &initializeSubtargetDependencies(const Triple &TT,
216                                                    StringRef GPU, StringRef FS);
217 
getInstrInfo()218   const SIInstrInfo *getInstrInfo() const override {
219     return &InstrInfo;
220   }
221 
getFrameLowering()222   const SIFrameLowering *getFrameLowering() const override {
223     return &FrameLowering;
224   }
225 
getTargetLowering()226   const SITargetLowering *getTargetLowering() const override {
227     return &TLInfo;
228   }
229 
getRegisterInfo()230   const SIRegisterInfo *getRegisterInfo() const override {
231     return &InstrInfo.getRegisterInfo();
232   }
233 
getCallLowering()234   const CallLowering *getCallLowering() const override {
235     return CallLoweringInfo.get();
236   }
237 
getInlineAsmLowering()238   const InlineAsmLowering *getInlineAsmLowering() const override {
239     return InlineAsmLoweringInfo.get();
240   }
241 
getInstructionSelector()242   InstructionSelector *getInstructionSelector() const override {
243     return InstSelector.get();
244   }
245 
getLegalizerInfo()246   const LegalizerInfo *getLegalizerInfo() const override {
247     return Legalizer.get();
248   }
249 
getRegBankInfo()250   const RegisterBankInfo *getRegBankInfo() const override {
251     return RegBankInfo.get();
252   }
253 
getTargetID()254   const AMDGPU::IsaInfo::AMDGPUTargetID &getTargetID() const {
255     return TargetID;
256   }
257 
258   // Nothing implemented, just prevent crashes on use.
getSelectionDAGInfo()259   const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
260     return &TSInfo;
261   }
262 
getInstrItineraryData()263   const InstrItineraryData *getInstrItineraryData() const override {
264     return &InstrItins;
265   }
266 
267   void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
268 
getGeneration()269   Generation getGeneration() const {
270     return (Generation)Gen;
271   }
272 
273   /// Return the number of high bits known to be zero fror a frame index.
getKnownHighZeroBitsForFrameIndex()274   unsigned getKnownHighZeroBitsForFrameIndex() const {
275     return countLeadingZeros(MaxWaveScratchSize) + getWavefrontSizeLog2();
276   }
277 
getLDSBankCount()278   int getLDSBankCount() const {
279     return LDSBankCount;
280   }
281 
282   unsigned getMaxPrivateElementSize(bool ForBufferRSrc = false) const {
283     return (ForBufferRSrc || !enableFlatScratch()) ? MaxPrivateElementSize : 16;
284   }
285 
286   unsigned getConstantBusLimit(unsigned Opcode) const;
287 
hasIntClamp()288   bool hasIntClamp() const {
289     return HasIntClamp;
290   }
291 
hasFP64()292   bool hasFP64() const {
293     return FP64;
294   }
295 
hasMIMG_R128()296   bool hasMIMG_R128() const {
297     return MIMG_R128;
298   }
299 
hasHWFP64()300   bool hasHWFP64() const {
301     return FP64;
302   }
303 
hasFastFMAF32()304   bool hasFastFMAF32() const {
305     return FastFMAF32;
306   }
307 
hasHalfRate64Ops()308   bool hasHalfRate64Ops() const {
309     return HalfRate64Ops;
310   }
311 
hasFullRate64Ops()312   bool hasFullRate64Ops() const {
313     return FullRate64Ops;
314   }
315 
hasAddr64()316   bool hasAddr64() const {
317     return (getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS);
318   }
319 
hasFlat()320   bool hasFlat() const {
321     return (getGeneration() > AMDGPUSubtarget::SOUTHERN_ISLANDS);
322   }
323 
324   // Return true if the target only has the reverse operand versions of VALU
325   // shift instructions (e.g. v_lshrrev_b32, and no v_lshr_b32).
hasOnlyRevVALUShifts()326   bool hasOnlyRevVALUShifts() const {
327     return getGeneration() >= VOLCANIC_ISLANDS;
328   }
329 
hasFractBug()330   bool hasFractBug() const {
331     return getGeneration() == SOUTHERN_ISLANDS;
332   }
333 
hasBFE()334   bool hasBFE() const {
335     return true;
336   }
337 
hasBFI()338   bool hasBFI() const {
339     return true;
340   }
341 
hasBFM()342   bool hasBFM() const {
343     return hasBFE();
344   }
345 
hasBCNT(unsigned Size)346   bool hasBCNT(unsigned Size) const {
347     return true;
348   }
349 
hasFFBL()350   bool hasFFBL() const {
351     return true;
352   }
353 
hasFFBH()354   bool hasFFBH() const {
355     return true;
356   }
357 
hasMed3_16()358   bool hasMed3_16() const {
359     return getGeneration() >= AMDGPUSubtarget::GFX9;
360   }
361 
hasMin3Max3_16()362   bool hasMin3Max3_16() const {
363     return getGeneration() >= AMDGPUSubtarget::GFX9;
364   }
365 
hasFmaMixInsts()366   bool hasFmaMixInsts() const {
367     return HasFmaMixInsts;
368   }
369 
hasCARRY()370   bool hasCARRY() const {
371     return true;
372   }
373 
hasFMA()374   bool hasFMA() const {
375     return FMA;
376   }
377 
hasSwap()378   bool hasSwap() const {
379     return GFX9Insts;
380   }
381 
hasScalarPackInsts()382   bool hasScalarPackInsts() const {
383     return GFX9Insts;
384   }
385 
hasScalarMulHiInsts()386   bool hasScalarMulHiInsts() const {
387     return GFX9Insts;
388   }
389 
getTrapHandlerAbi()390   TrapHandlerAbi getTrapHandlerAbi() const {
391     return isAmdHsaOS() ? TrapHandlerAbi::AMDHSA : TrapHandlerAbi::NONE;
392   }
393 
supportsGetDoorbellID()394   bool supportsGetDoorbellID() const {
395     // The S_GETREG DOORBELL_ID is supported by all GFX9 onward targets.
396     return getGeneration() >= GFX9;
397   }
398 
399   /// True if the offset field of DS instructions works as expected. On SI, the
400   /// offset uses a 16-bit adder and does not always wrap properly.
hasUsableDSOffset()401   bool hasUsableDSOffset() const {
402     return getGeneration() >= SEA_ISLANDS;
403   }
404 
unsafeDSOffsetFoldingEnabled()405   bool unsafeDSOffsetFoldingEnabled() const {
406     return EnableUnsafeDSOffsetFolding;
407   }
408 
409   /// Condition output from div_scale is usable.
hasUsableDivScaleConditionOutput()410   bool hasUsableDivScaleConditionOutput() const {
411     return getGeneration() != SOUTHERN_ISLANDS;
412   }
413 
414   /// Extra wait hazard is needed in some cases before
415   /// s_cbranch_vccnz/s_cbranch_vccz.
hasReadVCCZBug()416   bool hasReadVCCZBug() const {
417     return getGeneration() <= SEA_ISLANDS;
418   }
419 
420   /// Writes to VCC_LO/VCC_HI update the VCCZ flag.
partialVCCWritesUpdateVCCZ()421   bool partialVCCWritesUpdateVCCZ() const {
422     return getGeneration() >= GFX10;
423   }
424 
425   /// A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR
426   /// was written by a VALU instruction.
hasSMRDReadVALUDefHazard()427   bool hasSMRDReadVALUDefHazard() const {
428     return getGeneration() == SOUTHERN_ISLANDS;
429   }
430 
431   /// A read of an SGPR by a VMEM instruction requires 5 wait states when the
432   /// SGPR was written by a VALU Instruction.
hasVMEMReadSGPRVALUDefHazard()433   bool hasVMEMReadSGPRVALUDefHazard() const {
434     return getGeneration() >= VOLCANIC_ISLANDS;
435   }
436 
hasRFEHazards()437   bool hasRFEHazards() const {
438     return getGeneration() >= VOLCANIC_ISLANDS;
439   }
440 
441   /// Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32.
getSetRegWaitStates()442   unsigned getSetRegWaitStates() const {
443     return getGeneration() <= SEA_ISLANDS ? 1 : 2;
444   }
445 
dumpCode()446   bool dumpCode() const {
447     return DumpCode;
448   }
449 
450   /// Return the amount of LDS that can be used that will not restrict the
451   /// occupancy lower than WaveCount.
452   unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
453                                            const Function &) const;
454 
supportsMinMaxDenormModes()455   bool supportsMinMaxDenormModes() const {
456     return getGeneration() >= AMDGPUSubtarget::GFX9;
457   }
458 
459   /// \returns If target supports S_DENORM_MODE.
hasDenormModeInst()460   bool hasDenormModeInst() const {
461     return getGeneration() >= AMDGPUSubtarget::GFX10;
462   }
463 
useFlatForGlobal()464   bool useFlatForGlobal() const {
465     return FlatForGlobal;
466   }
467 
468   /// \returns If target supports ds_read/write_b128 and user enables generation
469   /// of ds_read/write_b128.
useDS128()470   bool useDS128() const {
471     return CIInsts && EnableDS128;
472   }
473 
474   /// \return If target supports ds_read/write_b96/128.
hasDS96AndDS128()475   bool hasDS96AndDS128() const {
476     return CIInsts;
477   }
478 
479   /// Have v_trunc_f64, v_ceil_f64, v_rndne_f64
haveRoundOpsF64()480   bool haveRoundOpsF64() const {
481     return CIInsts;
482   }
483 
484   /// \returns If MUBUF instructions always perform range checking, even for
485   /// buffer resources used for private memory access.
privateMemoryResourceIsRangeChecked()486   bool privateMemoryResourceIsRangeChecked() const {
487     return getGeneration() < AMDGPUSubtarget::GFX9;
488   }
489 
490   /// \returns If target requires PRT Struct NULL support (zero result registers
491   /// for sparse texture support).
usePRTStrictNull()492   bool usePRTStrictNull() const {
493     return EnablePRTStrictNull;
494   }
495 
hasAutoWaitcntBeforeBarrier()496   bool hasAutoWaitcntBeforeBarrier() const {
497     return AutoWaitcntBeforeBarrier;
498   }
499 
hasUnalignedBufferAccess()500   bool hasUnalignedBufferAccess() const {
501     return UnalignedBufferAccess;
502   }
503 
hasUnalignedBufferAccessEnabled()504   bool hasUnalignedBufferAccessEnabled() const {
505     return UnalignedBufferAccess && UnalignedAccessMode;
506   }
507 
hasUnalignedDSAccess()508   bool hasUnalignedDSAccess() const {
509     return UnalignedDSAccess;
510   }
511 
hasUnalignedDSAccessEnabled()512   bool hasUnalignedDSAccessEnabled() const {
513     return UnalignedDSAccess && UnalignedAccessMode;
514   }
515 
hasUnalignedScratchAccess()516   bool hasUnalignedScratchAccess() const {
517     return UnalignedScratchAccess;
518   }
519 
hasUnalignedAccessMode()520   bool hasUnalignedAccessMode() const {
521     return UnalignedAccessMode;
522   }
523 
hasApertureRegs()524   bool hasApertureRegs() const {
525     return HasApertureRegs;
526   }
527 
isTrapHandlerEnabled()528   bool isTrapHandlerEnabled() const {
529     return TrapHandler;
530   }
531 
isXNACKEnabled()532   bool isXNACKEnabled() const {
533     return TargetID.isXnackOnOrAny();
534   }
535 
isTgSplitEnabled()536   bool isTgSplitEnabled() const {
537     return EnableTgSplit;
538   }
539 
isCuModeEnabled()540   bool isCuModeEnabled() const {
541     return EnableCuMode;
542   }
543 
hasFlatAddressSpace()544   bool hasFlatAddressSpace() const {
545     return FlatAddressSpace;
546   }
547 
hasFlatScrRegister()548   bool hasFlatScrRegister() const {
549     return hasFlatAddressSpace();
550   }
551 
hasFlatInstOffsets()552   bool hasFlatInstOffsets() const {
553     return FlatInstOffsets;
554   }
555 
hasFlatGlobalInsts()556   bool hasFlatGlobalInsts() const {
557     return FlatGlobalInsts;
558   }
559 
hasFlatScratchInsts()560   bool hasFlatScratchInsts() const {
561     return FlatScratchInsts;
562   }
563 
564   // Check if target supports ST addressing mode with FLAT scratch instructions.
565   // The ST addressing mode means no registers are used, either VGPR or SGPR,
566   // but only immediate offset is swizzled and added to the FLAT scratch base.
hasFlatScratchSTMode()567   bool hasFlatScratchSTMode() const {
568     return hasFlatScratchInsts() && hasGFX10_3Insts();
569   }
570 
hasScalarFlatScratchInsts()571   bool hasScalarFlatScratchInsts() const {
572     return ScalarFlatScratchInsts;
573   }
574 
hasGlobalAddTidInsts()575   bool hasGlobalAddTidInsts() const {
576     return GFX10_BEncoding;
577   }
578 
hasAtomicCSub()579   bool hasAtomicCSub() const {
580     return GFX10_BEncoding;
581   }
582 
hasMultiDwordFlatScratchAddressing()583   bool hasMultiDwordFlatScratchAddressing() const {
584     return getGeneration() >= GFX9;
585   }
586 
hasFlatSegmentOffsetBug()587   bool hasFlatSegmentOffsetBug() const {
588     return HasFlatSegmentOffsetBug;
589   }
590 
hasFlatLgkmVMemCountInOrder()591   bool hasFlatLgkmVMemCountInOrder() const {
592     return getGeneration() > GFX9;
593   }
594 
hasD16LoadStore()595   bool hasD16LoadStore() const {
596     return getGeneration() >= GFX9;
597   }
598 
d16PreservesUnusedBits()599   bool d16PreservesUnusedBits() const {
600     return hasD16LoadStore() && !TargetID.isSramEccOnOrAny();
601   }
602 
hasD16Images()603   bool hasD16Images() const {
604     return getGeneration() >= VOLCANIC_ISLANDS;
605   }
606 
607   /// Return if most LDS instructions have an m0 use that require m0 to be
608   /// iniitalized.
ldsRequiresM0Init()609   bool ldsRequiresM0Init() const {
610     return getGeneration() < GFX9;
611   }
612 
613   // True if the hardware rewinds and replays GWS operations if a wave is
614   // preempted.
615   //
616   // If this is false, a GWS operation requires testing if a nack set the
617   // MEM_VIOL bit, and repeating if so.
hasGWSAutoReplay()618   bool hasGWSAutoReplay() const {
619     return getGeneration() >= GFX9;
620   }
621 
622   /// \returns if target has ds_gws_sema_release_all instruction.
hasGWSSemaReleaseAll()623   bool hasGWSSemaReleaseAll() const {
624     return CIInsts;
625   }
626 
627   /// \returns true if the target has integer add/sub instructions that do not
628   /// produce a carry-out. This includes v_add_[iu]32, v_sub_[iu]32,
629   /// v_add_[iu]16, and v_sub_[iu]16, all of which support the clamp modifier
630   /// for saturation.
hasAddNoCarry()631   bool hasAddNoCarry() const {
632     return AddNoCarryInsts;
633   }
634 
hasUnpackedD16VMem()635   bool hasUnpackedD16VMem() const {
636     return HasUnpackedD16VMem;
637   }
638 
639   // Covers VS/PS/CS graphics shaders
isMesaGfxShader(const Function & F)640   bool isMesaGfxShader(const Function &F) const {
641     return isMesa3DOS() && AMDGPU::isShader(F.getCallingConv());
642   }
643 
hasMad64_32()644   bool hasMad64_32() const {
645     return getGeneration() >= SEA_ISLANDS;
646   }
647 
hasSDWAOmod()648   bool hasSDWAOmod() const {
649     return HasSDWAOmod;
650   }
651 
hasSDWAScalar()652   bool hasSDWAScalar() const {
653     return HasSDWAScalar;
654   }
655 
hasSDWASdst()656   bool hasSDWASdst() const {
657     return HasSDWASdst;
658   }
659 
hasSDWAMac()660   bool hasSDWAMac() const {
661     return HasSDWAMac;
662   }
663 
hasSDWAOutModsVOPC()664   bool hasSDWAOutModsVOPC() const {
665     return HasSDWAOutModsVOPC;
666   }
667 
hasDLInsts()668   bool hasDLInsts() const {
669     return HasDLInsts;
670   }
671 
hasDot1Insts()672   bool hasDot1Insts() const {
673     return HasDot1Insts;
674   }
675 
hasDot2Insts()676   bool hasDot2Insts() const {
677     return HasDot2Insts;
678   }
679 
hasDot3Insts()680   bool hasDot3Insts() const {
681     return HasDot3Insts;
682   }
683 
hasDot4Insts()684   bool hasDot4Insts() const {
685     return HasDot4Insts;
686   }
687 
hasDot5Insts()688   bool hasDot5Insts() const {
689     return HasDot5Insts;
690   }
691 
hasDot6Insts()692   bool hasDot6Insts() const {
693     return HasDot6Insts;
694   }
695 
hasDot7Insts()696   bool hasDot7Insts() const {
697     return HasDot7Insts;
698   }
699 
hasMAIInsts()700   bool hasMAIInsts() const {
701     return HasMAIInsts;
702   }
703 
hasPkFmacF16Inst()704   bool hasPkFmacF16Inst() const {
705     return HasPkFmacF16Inst;
706   }
707 
hasAtomicFaddInsts()708   bool hasAtomicFaddInsts() const {
709     return HasAtomicFaddInsts;
710   }
711 
hasNoSdstCMPX()712   bool hasNoSdstCMPX() const {
713     return HasNoSdstCMPX;
714   }
715 
hasVscnt()716   bool hasVscnt() const {
717     return HasVscnt;
718   }
719 
hasGetWaveIdInst()720   bool hasGetWaveIdInst() const {
721     return HasGetWaveIdInst;
722   }
723 
hasSMemTimeInst()724   bool hasSMemTimeInst() const {
725     return HasSMemTimeInst;
726   }
727 
hasShaderCyclesRegister()728   bool hasShaderCyclesRegister() const {
729     return HasShaderCyclesRegister;
730   }
731 
hasRegisterBanking()732   bool hasRegisterBanking() const {
733     return HasRegisterBanking;
734   }
735 
hasVOP3Literal()736   bool hasVOP3Literal() const {
737     return HasVOP3Literal;
738   }
739 
hasNoDataDepHazard()740   bool hasNoDataDepHazard() const {
741     return HasNoDataDepHazard;
742   }
743 
vmemWriteNeedsExpWaitcnt()744   bool vmemWriteNeedsExpWaitcnt() const {
745     return getGeneration() < SEA_ISLANDS;
746   }
747 
748   // Scratch is allocated in 256 dword per wave blocks for the entire
749   // wavefront. When viewed from the perspecive of an arbitrary workitem, this
750   // is 4-byte aligned.
751   //
752   // Only 4-byte alignment is really needed to access anything. Transformations
753   // on the pointer value itself may rely on the alignment / known low bits of
754   // the pointer. Set this to something above the minimum to avoid needing
755   // dynamic realignment in common cases.
getStackAlignment()756   Align getStackAlignment() const { return Align(16); }
757 
enableMachineScheduler()758   bool enableMachineScheduler() const override {
759     return true;
760   }
761 
762   bool useAA() const override;
763 
enableSubRegLiveness()764   bool enableSubRegLiveness() const override {
765     return true;
766   }
767 
setScalarizeGlobalBehavior(bool b)768   void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b; }
getScalarizeGlobalBehavior()769   bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal; }
770 
771   // static wrappers
772   static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI);
773 
774   // XXX - Why is this here if it isn't in the default pass set?
enableEarlyIfConversion()775   bool enableEarlyIfConversion() const override {
776     return true;
777   }
778 
779   bool enableFlatScratch() const;
780 
781   void overrideSchedPolicy(MachineSchedPolicy &Policy,
782                            unsigned NumRegionInstrs) const override;
783 
getMaxNumUserSGPRs()784   unsigned getMaxNumUserSGPRs() const {
785     return 16;
786   }
787 
hasSMemRealTime()788   bool hasSMemRealTime() const {
789     return HasSMemRealTime;
790   }
791 
hasMovrel()792   bool hasMovrel() const {
793     return HasMovrel;
794   }
795 
hasVGPRIndexMode()796   bool hasVGPRIndexMode() const {
797     return HasVGPRIndexMode;
798   }
799 
800   bool useVGPRIndexMode() const;
801 
hasScalarCompareEq64()802   bool hasScalarCompareEq64() const {
803     return getGeneration() >= VOLCANIC_ISLANDS;
804   }
805 
hasScalarStores()806   bool hasScalarStores() const {
807     return HasScalarStores;
808   }
809 
hasScalarAtomics()810   bool hasScalarAtomics() const {
811     return HasScalarAtomics;
812   }
813 
hasLDSFPAtomics()814   bool hasLDSFPAtomics() const {
815     return GFX8Insts;
816   }
817 
818   /// \returns true if the subtarget has the v_permlanex16_b32 instruction.
hasPermLaneX16()819   bool hasPermLaneX16() const { return getGeneration() >= GFX10; }
820 
hasDPP()821   bool hasDPP() const {
822     return HasDPP;
823   }
824 
hasDPPBroadcasts()825   bool hasDPPBroadcasts() const {
826     return HasDPP && getGeneration() < GFX10;
827   }
828 
hasDPPWavefrontShifts()829   bool hasDPPWavefrontShifts() const {
830     return HasDPP && getGeneration() < GFX10;
831   }
832 
hasDPP8()833   bool hasDPP8() const {
834     return HasDPP8;
835   }
836 
has64BitDPP()837   bool has64BitDPP() const {
838     return Has64BitDPP;
839   }
840 
hasPackedFP32Ops()841   bool hasPackedFP32Ops() const {
842     return HasPackedFP32Ops;
843   }
844 
hasFmaakFmamkF32Insts()845   bool hasFmaakFmamkF32Insts() const {
846     return getGeneration() >= GFX10;
847   }
848 
hasExtendedImageInsts()849   bool hasExtendedImageInsts() const {
850     return HasExtendedImageInsts;
851   }
852 
hasR128A16()853   bool hasR128A16() const {
854     return HasR128A16;
855   }
856 
hasGFX10A16()857   bool hasGFX10A16() const {
858     return HasGFX10A16;
859   }
860 
hasA16()861   bool hasA16() const { return hasR128A16() || hasGFX10A16(); }
862 
hasG16()863   bool hasG16() const { return HasG16; }
864 
hasOffset3fBug()865   bool hasOffset3fBug() const {
866     return HasOffset3fBug;
867   }
868 
hasImageStoreD16Bug()869   bool hasImageStoreD16Bug() const { return HasImageStoreD16Bug; }
870 
hasImageGather4D16Bug()871   bool hasImageGather4D16Bug() const { return HasImageGather4D16Bug; }
872 
hasNSAEncoding()873   bool hasNSAEncoding() const { return HasNSAEncoding; }
874 
hasGFX10_BEncoding()875   bool hasGFX10_BEncoding() const {
876     return GFX10_BEncoding;
877   }
878 
hasGFX10_3Insts()879   bool hasGFX10_3Insts() const {
880     return GFX10_3Insts;
881   }
882 
883   bool hasMadF16() const;
884 
enableSIScheduler()885   bool enableSIScheduler() const {
886     return EnableSIScheduler;
887   }
888 
loadStoreOptEnabled()889   bool loadStoreOptEnabled() const {
890     return EnableLoadStoreOpt;
891   }
892 
hasSGPRInitBug()893   bool hasSGPRInitBug() const {
894     return SGPRInitBug;
895   }
896 
hasNegativeScratchOffsetBug()897   bool hasNegativeScratchOffsetBug() const { return NegativeScratchOffsetBug; }
898 
hasNegativeUnalignedScratchOffsetBug()899   bool hasNegativeUnalignedScratchOffsetBug() const {
900     return NegativeUnalignedScratchOffsetBug;
901   }
902 
hasMFMAInlineLiteralBug()903   bool hasMFMAInlineLiteralBug() const {
904     return HasMFMAInlineLiteralBug;
905   }
906 
has12DWordStoreHazard()907   bool has12DWordStoreHazard() const {
908     return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS;
909   }
910 
911   // \returns true if the subtarget supports DWORDX3 load/store instructions.
hasDwordx3LoadStores()912   bool hasDwordx3LoadStores() const {
913     return CIInsts;
914   }
915 
hasReadM0MovRelInterpHazard()916   bool hasReadM0MovRelInterpHazard() const {
917     return getGeneration() == AMDGPUSubtarget::GFX9;
918   }
919 
hasReadM0SendMsgHazard()920   bool hasReadM0SendMsgHazard() const {
921     return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
922            getGeneration() <= AMDGPUSubtarget::GFX9;
923   }
924 
hasVcmpxPermlaneHazard()925   bool hasVcmpxPermlaneHazard() const {
926     return HasVcmpxPermlaneHazard;
927   }
928 
hasVMEMtoScalarWriteHazard()929   bool hasVMEMtoScalarWriteHazard() const {
930     return HasVMEMtoScalarWriteHazard;
931   }
932 
hasSMEMtoVectorWriteHazard()933   bool hasSMEMtoVectorWriteHazard() const {
934     return HasSMEMtoVectorWriteHazard;
935   }
936 
hasLDSMisalignedBug()937   bool hasLDSMisalignedBug() const {
938     return LDSMisalignedBug && !EnableCuMode;
939   }
940 
hasInstFwdPrefetchBug()941   bool hasInstFwdPrefetchBug() const {
942     return HasInstFwdPrefetchBug;
943   }
944 
hasVcmpxExecWARHazard()945   bool hasVcmpxExecWARHazard() const {
946     return HasVcmpxExecWARHazard;
947   }
948 
hasLdsBranchVmemWARHazard()949   bool hasLdsBranchVmemWARHazard() const {
950     return HasLdsBranchVmemWARHazard;
951   }
952 
hasNSAtoVMEMBug()953   bool hasNSAtoVMEMBug() const {
954     return HasNSAtoVMEMBug;
955   }
956 
hasNSAClauseBug()957   bool hasNSAClauseBug() const { return HasNSAClauseBug; }
958 
hasHardClauses()959   bool hasHardClauses() const { return getGeneration() >= GFX10; }
960 
hasGFX90AInsts()961   bool hasGFX90AInsts() const { return GFX90AInsts; }
962 
963   /// Return if operations acting on VGPR tuples require even alignment.
needsAlignedVGPRs()964   bool needsAlignedVGPRs() const { return GFX90AInsts; }
965 
hasPackedTID()966   bool hasPackedTID() const { return HasPackedTID; }
967 
968   /// Return the maximum number of waves per SIMD for kernels using \p SGPRs
969   /// SGPRs
970   unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
971 
972   /// Return the maximum number of waves per SIMD for kernels using \p VGPRs
973   /// VGPRs
974   unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const;
975 
976   /// Return occupancy for the given function. Used LDS and a number of
977   /// registers if provided.
978   /// Note, occupancy can be affected by the scratch allocation as well, but
979   /// we do not have enough information to compute it.
980   unsigned computeOccupancy(const Function &F, unsigned LDSSize = 0,
981                             unsigned NumSGPRs = 0, unsigned NumVGPRs = 0) const;
982 
983   /// \returns true if the flat_scratch register should be initialized with the
984   /// pointer to the wave's scratch memory rather than a size and offset.
flatScratchIsPointer()985   bool flatScratchIsPointer() const {
986     return getGeneration() >= AMDGPUSubtarget::GFX9;
987   }
988 
989   /// \returns true if the flat_scratch register is initialized by the HW.
990   /// In this case it is readonly.
flatScratchIsArchitected()991   bool flatScratchIsArchitected() const { return HasArchitectedFlatScratch; }
992 
993   /// \returns true if the machine has merged shaders in which s0-s7 are
994   /// reserved by the hardware and user SGPRs start at s8
hasMergedShaders()995   bool hasMergedShaders() const {
996     return getGeneration() >= GFX9;
997   }
998 
999   /// \returns SGPR allocation granularity supported by the subtarget.
getSGPRAllocGranule()1000   unsigned getSGPRAllocGranule() const {
1001     return AMDGPU::IsaInfo::getSGPRAllocGranule(this);
1002   }
1003 
1004   /// \returns SGPR encoding granularity supported by the subtarget.
getSGPREncodingGranule()1005   unsigned getSGPREncodingGranule() const {
1006     return AMDGPU::IsaInfo::getSGPREncodingGranule(this);
1007   }
1008 
1009   /// \returns Total number of SGPRs supported by the subtarget.
getTotalNumSGPRs()1010   unsigned getTotalNumSGPRs() const {
1011     return AMDGPU::IsaInfo::getTotalNumSGPRs(this);
1012   }
1013 
1014   /// \returns Addressable number of SGPRs supported by the subtarget.
getAddressableNumSGPRs()1015   unsigned getAddressableNumSGPRs() const {
1016     return AMDGPU::IsaInfo::getAddressableNumSGPRs(this);
1017   }
1018 
1019   /// \returns Minimum number of SGPRs that meets the given number of waves per
1020   /// execution unit requirement supported by the subtarget.
getMinNumSGPRs(unsigned WavesPerEU)1021   unsigned getMinNumSGPRs(unsigned WavesPerEU) const {
1022     return AMDGPU::IsaInfo::getMinNumSGPRs(this, WavesPerEU);
1023   }
1024 
1025   /// \returns Maximum number of SGPRs that meets the given number of waves per
1026   /// execution unit requirement supported by the subtarget.
getMaxNumSGPRs(unsigned WavesPerEU,bool Addressable)1027   unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const {
1028     return AMDGPU::IsaInfo::getMaxNumSGPRs(this, WavesPerEU, Addressable);
1029   }
1030 
1031   /// \returns Reserved number of SGPRs for given function \p MF.
1032   unsigned getReservedNumSGPRs(const MachineFunction &MF) const;
1033 
1034   /// \returns Maximum number of SGPRs that meets number of waves per execution
1035   /// unit requirement for function \p MF, or number of SGPRs explicitly
1036   /// requested using "amdgpu-num-sgpr" attribute attached to function \p MF.
1037   ///
1038   /// \returns Value that meets number of waves per execution unit requirement
1039   /// if explicitly requested value cannot be converted to integer, violates
1040   /// subtarget's specifications, or does not meet number of waves per execution
1041   /// unit requirement.
1042   unsigned getMaxNumSGPRs(const MachineFunction &MF) const;
1043 
1044   /// \returns VGPR allocation granularity supported by the subtarget.
getVGPRAllocGranule()1045   unsigned getVGPRAllocGranule() const {
1046     return AMDGPU::IsaInfo::getVGPRAllocGranule(this);
1047   }
1048 
1049   /// \returns VGPR encoding granularity supported by the subtarget.
getVGPREncodingGranule()1050   unsigned getVGPREncodingGranule() const {
1051     return AMDGPU::IsaInfo::getVGPREncodingGranule(this);
1052   }
1053 
1054   /// \returns Total number of VGPRs supported by the subtarget.
getTotalNumVGPRs()1055   unsigned getTotalNumVGPRs() const {
1056     return AMDGPU::IsaInfo::getTotalNumVGPRs(this);
1057   }
1058 
1059   /// \returns Addressable number of VGPRs supported by the subtarget.
getAddressableNumVGPRs()1060   unsigned getAddressableNumVGPRs() const {
1061     return AMDGPU::IsaInfo::getAddressableNumVGPRs(this);
1062   }
1063 
1064   /// \returns Minimum number of VGPRs that meets given number of waves per
1065   /// execution unit requirement supported by the subtarget.
getMinNumVGPRs(unsigned WavesPerEU)1066   unsigned getMinNumVGPRs(unsigned WavesPerEU) const {
1067     return AMDGPU::IsaInfo::getMinNumVGPRs(this, WavesPerEU);
1068   }
1069 
1070   /// \returns Maximum number of VGPRs that meets given number of waves per
1071   /// execution unit requirement supported by the subtarget.
getMaxNumVGPRs(unsigned WavesPerEU)1072   unsigned getMaxNumVGPRs(unsigned WavesPerEU) const {
1073     return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU);
1074   }
1075 
1076   /// \returns Maximum number of VGPRs that meets number of waves per execution
1077   /// unit requirement for function \p MF, or number of VGPRs explicitly
1078   /// requested using "amdgpu-num-vgpr" attribute attached to function \p MF.
1079   ///
1080   /// \returns Value that meets number of waves per execution unit requirement
1081   /// if explicitly requested value cannot be converted to integer, violates
1082   /// subtarget's specifications, or does not meet number of waves per execution
1083   /// unit requirement.
1084   unsigned getMaxNumVGPRs(const MachineFunction &MF) const;
1085 
1086   void getPostRAMutations(
1087       std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations)
1088       const override;
1089 
isWave32()1090   bool isWave32() const {
1091     return getWavefrontSize() == 32;
1092   }
1093 
isWave64()1094   bool isWave64() const {
1095     return getWavefrontSize() == 64;
1096   }
1097 
getBoolRC()1098   const TargetRegisterClass *getBoolRC() const {
1099     return getRegisterInfo()->getBoolRC();
1100   }
1101 
1102   /// \returns Maximum number of work groups per compute unit supported by the
1103   /// subtarget and limited by given \p FlatWorkGroupSize.
getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize)1104   unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override {
1105     return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(this, FlatWorkGroupSize);
1106   }
1107 
1108   /// \returns Minimum flat work group size supported by the subtarget.
getMinFlatWorkGroupSize()1109   unsigned getMinFlatWorkGroupSize() const override {
1110     return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(this);
1111   }
1112 
1113   /// \returns Maximum flat work group size supported by the subtarget.
getMaxFlatWorkGroupSize()1114   unsigned getMaxFlatWorkGroupSize() const override {
1115     return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this);
1116   }
1117 
1118   /// \returns Number of waves per execution unit required to support the given
1119   /// \p FlatWorkGroupSize.
1120   unsigned
getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize)1121   getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const override {
1122     return AMDGPU::IsaInfo::getWavesPerEUForWorkGroup(this, FlatWorkGroupSize);
1123   }
1124 
1125   /// \returns Minimum number of waves per execution unit supported by the
1126   /// subtarget.
getMinWavesPerEU()1127   unsigned getMinWavesPerEU() const override {
1128     return AMDGPU::IsaInfo::getMinWavesPerEU(this);
1129   }
1130 
1131   void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx,
1132                              SDep &Dep) const override;
1133 };
1134 
1135 } // end namespace llvm
1136 
1137 #endif // LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
1138