1 //=====-- GCNSubtarget.h - Define GCN Subtarget for AMDGPU ------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //==-----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// AMD GCN specific subclass of TargetSubtarget.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #ifndef LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
15 #define LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
16 
17 #include "AMDGPUCallLowering.h"
18 #include "AMDGPUSubtarget.h"
19 #include "SIFrameLowering.h"
20 #include "SIISelLowering.h"
21 #include "SIInstrInfo.h"
22 #include "llvm/CodeGen/SelectionDAGTargetInfo.h"
23 
24 namespace llvm {
25 
26 class MCInst;
27 class MCInstrInfo;
28 
29 } // namespace llvm
30 
31 #define GET_SUBTARGETINFO_HEADER
32 #include "AMDGPUGenSubtargetInfo.inc"
33 
34 namespace llvm {
35 
36 class GCNTargetMachine;
37 
38 class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
39                            public AMDGPUSubtarget {
40 
41   using AMDGPUSubtarget::getMaxWavesPerEU;
42 
43 public:
44   // Following 2 enums are documented at:
45   //   - https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
46   enum class TrapHandlerAbi {
47     NONE   = 0x00,
48     AMDHSA = 0x01,
49   };
50 
51   enum class TrapID {
52     LLVMAMDHSATrap      = 0x02,
53     LLVMAMDHSADebugTrap = 0x03,
54   };
55 
56 private:
57   /// GlobalISel related APIs.
58   std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
59   std::unique_ptr<InlineAsmLowering> InlineAsmLoweringInfo;
60   std::unique_ptr<InstructionSelector> InstSelector;
61   std::unique_ptr<LegalizerInfo> Legalizer;
62   std::unique_ptr<RegisterBankInfo> RegBankInfo;
63 
64 protected:
65   // Basic subtarget description.
66   Triple TargetTriple;
67   AMDGPU::IsaInfo::AMDGPUTargetID TargetID;
68   unsigned Gen;
69   InstrItineraryData InstrItins;
70   int LDSBankCount;
71   unsigned MaxPrivateElementSize;
72 
73   // Possibly statically set by tablegen, but may want to be overridden.
74   bool FastFMAF32;
75   bool FastDenormalF32;
76   bool HalfRate64Ops;
77   bool FullRate64Ops;
78 
79   // Dynamically set bits that enable features.
80   bool FlatForGlobal;
81   bool AutoWaitcntBeforeBarrier;
82   bool UnalignedScratchAccess;
83   bool UnalignedAccessMode;
84   bool HasApertureRegs;
85   bool SupportsXNACK;
86 
87   // This should not be used directly. 'TargetID' tracks the dynamic settings
88   // for XNACK.
89   bool EnableXNACK;
90 
91   bool EnableTgSplit;
92   bool EnableCuMode;
93   bool TrapHandler;
94 
95   // Used as options.
96   bool EnableLoadStoreOpt;
97   bool EnableUnsafeDSOffsetFolding;
98   bool EnableSIScheduler;
99   bool EnableDS128;
100   bool EnablePRTStrictNull;
101   bool DumpCode;
102 
  // Subtarget properties statically set by tablegen.
104   bool FP64;
105   bool FMA;
106   bool MIMG_R128;
107   bool IsGCN;
108   bool CIInsts;
109   bool GFX8Insts;
110   bool GFX9Insts;
111   bool GFX90AInsts;
112   bool GFX10Insts;
113   bool GFX10_3Insts;
114   bool GFX7GFX8GFX9Insts;
115   bool SGPRInitBug;
116   bool NegativeScratchOffsetBug;
117   bool NegativeUnalignedScratchOffsetBug;
118   bool HasSMemRealTime;
119   bool HasIntClamp;
120   bool HasFmaMixInsts;
121   bool HasMovrel;
122   bool HasVGPRIndexMode;
123   bool HasScalarStores;
124   bool HasScalarAtomics;
125   bool HasSDWAOmod;
126   bool HasSDWAScalar;
127   bool HasSDWASdst;
128   bool HasSDWAMac;
129   bool HasSDWAOutModsVOPC;
130   bool HasDPP;
131   bool HasDPP8;
132   bool Has64BitDPP;
133   bool HasPackedFP32Ops;
134   bool HasExtendedImageInsts;
135   bool HasR128A16;
136   bool HasGFX10A16;
137   bool HasG16;
138   bool HasNSAEncoding;
139   unsigned NSAMaxSize;
140   bool GFX10_AEncoding;
141   bool GFX10_BEncoding;
142   bool HasDLInsts;
143   bool HasDot1Insts;
144   bool HasDot2Insts;
145   bool HasDot3Insts;
146   bool HasDot4Insts;
147   bool HasDot5Insts;
148   bool HasDot6Insts;
149   bool HasDot7Insts;
150   bool HasMAIInsts;
151   bool HasPkFmacF16Inst;
152   bool HasAtomicFaddInsts;
153   bool SupportsSRAMECC;
154 
155   // This should not be used directly. 'TargetID' tracks the dynamic settings
156   // for SRAMECC.
157   bool EnableSRAMECC;
158 
159   bool HasNoSdstCMPX;
160   bool HasVscnt;
161   bool HasGetWaveIdInst;
162   bool HasSMemTimeInst;
163   bool HasShaderCyclesRegister;
164   bool HasRegisterBanking;
165   bool HasVOP3Literal;
166   bool HasNoDataDepHazard;
167   bool FlatAddressSpace;
168   bool FlatInstOffsets;
169   bool FlatGlobalInsts;
170   bool FlatScratchInsts;
171   bool ScalarFlatScratchInsts;
172   bool HasArchitectedFlatScratch;
173   bool AddNoCarryInsts;
174   bool HasUnpackedD16VMem;
175   bool R600ALUInst;
176   bool CaymanISA;
177   bool CFALUBug;
178   bool LDSMisalignedBug;
179   bool HasMFMAInlineLiteralBug;
180   bool HasVertexCache;
181   short TexVTXClauseSize;
182   bool UnalignedBufferAccess;
183   bool UnalignedDSAccess;
184   bool HasPackedTID;
185   bool ScalarizeGlobal;
186 
187   bool HasVcmpxPermlaneHazard;
188   bool HasVMEMtoScalarWriteHazard;
189   bool HasSMEMtoVectorWriteHazard;
190   bool HasInstFwdPrefetchBug;
191   bool HasVcmpxExecWARHazard;
192   bool HasLdsBranchVmemWARHazard;
193   bool HasNSAtoVMEMBug;
194   bool HasNSAClauseBug;
195   bool HasOffset3fBug;
196   bool HasFlatSegmentOffsetBug;
197   bool HasImageStoreD16Bug;
198   bool HasImageGather4D16Bug;
199 
200   // Dummy feature to use for assembler in tablegen.
201   bool FeatureDisable;
202 
203   SelectionDAGTargetInfo TSInfo;
204 private:
205   SIInstrInfo InstrInfo;
206   SITargetLowering TLInfo;
207   SIFrameLowering FrameLowering;
208 
209 public:
210   // See COMPUTE_TMPRING_SIZE.WAVESIZE, 13-bit field in units of 256-dword.
211   static const unsigned MaxWaveScratchSize = (256 * 4) * ((1 << 13) - 1);
212 
213   GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
214                const GCNTargetMachine &TM);
215   ~GCNSubtarget() override;
216 
217   GCNSubtarget &initializeSubtargetDependencies(const Triple &TT,
218                                                    StringRef GPU, StringRef FS);
219 
getInstrInfo()220   const SIInstrInfo *getInstrInfo() const override {
221     return &InstrInfo;
222   }
223 
getFrameLowering()224   const SIFrameLowering *getFrameLowering() const override {
225     return &FrameLowering;
226   }
227 
getTargetLowering()228   const SITargetLowering *getTargetLowering() const override {
229     return &TLInfo;
230   }
231 
getRegisterInfo()232   const SIRegisterInfo *getRegisterInfo() const override {
233     return &InstrInfo.getRegisterInfo();
234   }
235 
getCallLowering()236   const CallLowering *getCallLowering() const override {
237     return CallLoweringInfo.get();
238   }
239 
getInlineAsmLowering()240   const InlineAsmLowering *getInlineAsmLowering() const override {
241     return InlineAsmLoweringInfo.get();
242   }
243 
getInstructionSelector()244   InstructionSelector *getInstructionSelector() const override {
245     return InstSelector.get();
246   }
247 
getLegalizerInfo()248   const LegalizerInfo *getLegalizerInfo() const override {
249     return Legalizer.get();
250   }
251 
getRegBankInfo()252   const RegisterBankInfo *getRegBankInfo() const override {
253     return RegBankInfo.get();
254   }
255 
getTargetID()256   const AMDGPU::IsaInfo::AMDGPUTargetID &getTargetID() const {
257     return TargetID;
258   }
259 
260   // Nothing implemented, just prevent crashes on use.
getSelectionDAGInfo()261   const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
262     return &TSInfo;
263   }
264 
getInstrItineraryData()265   const InstrItineraryData *getInstrItineraryData() const override {
266     return &InstrItins;
267   }
268 
269   void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS);
270 
getGeneration()271   Generation getGeneration() const {
272     return (Generation)Gen;
273   }
274 
  /// Return the number of high bits known to be zero for a frame index.
getKnownHighZeroBitsForFrameIndex()276   unsigned getKnownHighZeroBitsForFrameIndex() const {
277     return countLeadingZeros(MaxWaveScratchSize) + getWavefrontSizeLog2();
278   }
279 
getLDSBankCount()280   int getLDSBankCount() const {
281     return LDSBankCount;
282   }
283 
284   unsigned getMaxPrivateElementSize(bool ForBufferRSrc = false) const {
285     return (ForBufferRSrc || !enableFlatScratch()) ? MaxPrivateElementSize : 16;
286   }
287 
288   unsigned getConstantBusLimit(unsigned Opcode) const;
289 
290   /// Returns if the result of this instruction with a 16-bit result returned in
291   /// a 32-bit register implicitly zeroes the high 16-bits, rather than preserve
292   /// the original value.
293   bool zeroesHigh16BitsOfDest(unsigned Opcode) const;
294 
hasIntClamp()295   bool hasIntClamp() const {
296     return HasIntClamp;
297   }
298 
hasFP64()299   bool hasFP64() const {
300     return FP64;
301   }
302 
hasMIMG_R128()303   bool hasMIMG_R128() const {
304     return MIMG_R128;
305   }
306 
hasHWFP64()307   bool hasHWFP64() const {
308     return FP64;
309   }
310 
hasFastFMAF32()311   bool hasFastFMAF32() const {
312     return FastFMAF32;
313   }
314 
hasHalfRate64Ops()315   bool hasHalfRate64Ops() const {
316     return HalfRate64Ops;
317   }
318 
hasFullRate64Ops()319   bool hasFullRate64Ops() const {
320     return FullRate64Ops;
321   }
322 
hasAddr64()323   bool hasAddr64() const {
324     return (getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS);
325   }
326 
hasFlat()327   bool hasFlat() const {
328     return (getGeneration() > AMDGPUSubtarget::SOUTHERN_ISLANDS);
329   }
330 
331   // Return true if the target only has the reverse operand versions of VALU
332   // shift instructions (e.g. v_lshrrev_b32, and no v_lshr_b32).
hasOnlyRevVALUShifts()333   bool hasOnlyRevVALUShifts() const {
334     return getGeneration() >= VOLCANIC_ISLANDS;
335   }
336 
hasFractBug()337   bool hasFractBug() const {
338     return getGeneration() == SOUTHERN_ISLANDS;
339   }
340 
hasBFE()341   bool hasBFE() const {
342     return true;
343   }
344 
hasBFI()345   bool hasBFI() const {
346     return true;
347   }
348 
hasBFM()349   bool hasBFM() const {
350     return hasBFE();
351   }
352 
hasBCNT(unsigned Size)353   bool hasBCNT(unsigned Size) const {
354     return true;
355   }
356 
hasFFBL()357   bool hasFFBL() const {
358     return true;
359   }
360 
hasFFBH()361   bool hasFFBH() const {
362     return true;
363   }
364 
hasMed3_16()365   bool hasMed3_16() const {
366     return getGeneration() >= AMDGPUSubtarget::GFX9;
367   }
368 
hasMin3Max3_16()369   bool hasMin3Max3_16() const {
370     return getGeneration() >= AMDGPUSubtarget::GFX9;
371   }
372 
hasFmaMixInsts()373   bool hasFmaMixInsts() const {
374     return HasFmaMixInsts;
375   }
376 
hasCARRY()377   bool hasCARRY() const {
378     return true;
379   }
380 
hasFMA()381   bool hasFMA() const {
382     return FMA;
383   }
384 
hasSwap()385   bool hasSwap() const {
386     return GFX9Insts;
387   }
388 
hasScalarPackInsts()389   bool hasScalarPackInsts() const {
390     return GFX9Insts;
391   }
392 
hasScalarMulHiInsts()393   bool hasScalarMulHiInsts() const {
394     return GFX9Insts;
395   }
396 
getTrapHandlerAbi()397   TrapHandlerAbi getTrapHandlerAbi() const {
398     return isAmdHsaOS() ? TrapHandlerAbi::AMDHSA : TrapHandlerAbi::NONE;
399   }
400 
supportsGetDoorbellID()401   bool supportsGetDoorbellID() const {
402     // The S_GETREG DOORBELL_ID is supported by all GFX9 onward targets.
403     return getGeneration() >= GFX9;
404   }
405 
406   /// True if the offset field of DS instructions works as expected. On SI, the
407   /// offset uses a 16-bit adder and does not always wrap properly.
hasUsableDSOffset()408   bool hasUsableDSOffset() const {
409     return getGeneration() >= SEA_ISLANDS;
410   }
411 
unsafeDSOffsetFoldingEnabled()412   bool unsafeDSOffsetFoldingEnabled() const {
413     return EnableUnsafeDSOffsetFolding;
414   }
415 
416   /// Condition output from div_scale is usable.
hasUsableDivScaleConditionOutput()417   bool hasUsableDivScaleConditionOutput() const {
418     return getGeneration() != SOUTHERN_ISLANDS;
419   }
420 
421   /// Extra wait hazard is needed in some cases before
422   /// s_cbranch_vccnz/s_cbranch_vccz.
hasReadVCCZBug()423   bool hasReadVCCZBug() const {
424     return getGeneration() <= SEA_ISLANDS;
425   }
426 
427   /// Writes to VCC_LO/VCC_HI update the VCCZ flag.
partialVCCWritesUpdateVCCZ()428   bool partialVCCWritesUpdateVCCZ() const {
429     return getGeneration() >= GFX10;
430   }
431 
432   /// A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR
433   /// was written by a VALU instruction.
hasSMRDReadVALUDefHazard()434   bool hasSMRDReadVALUDefHazard() const {
435     return getGeneration() == SOUTHERN_ISLANDS;
436   }
437 
438   /// A read of an SGPR by a VMEM instruction requires 5 wait states when the
439   /// SGPR was written by a VALU Instruction.
hasVMEMReadSGPRVALUDefHazard()440   bool hasVMEMReadSGPRVALUDefHazard() const {
441     return getGeneration() >= VOLCANIC_ISLANDS;
442   }
443 
hasRFEHazards()444   bool hasRFEHazards() const {
445     return getGeneration() >= VOLCANIC_ISLANDS;
446   }
447 
448   /// Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32.
getSetRegWaitStates()449   unsigned getSetRegWaitStates() const {
450     return getGeneration() <= SEA_ISLANDS ? 1 : 2;
451   }
452 
dumpCode()453   bool dumpCode() const {
454     return DumpCode;
455   }
456 
457   /// Return the amount of LDS that can be used that will not restrict the
458   /// occupancy lower than WaveCount.
459   unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
460                                            const Function &) const;
461 
supportsMinMaxDenormModes()462   bool supportsMinMaxDenormModes() const {
463     return getGeneration() >= AMDGPUSubtarget::GFX9;
464   }
465 
466   /// \returns If target supports S_DENORM_MODE.
hasDenormModeInst()467   bool hasDenormModeInst() const {
468     return getGeneration() >= AMDGPUSubtarget::GFX10;
469   }
470 
useFlatForGlobal()471   bool useFlatForGlobal() const {
472     return FlatForGlobal;
473   }
474 
475   /// \returns If target supports ds_read/write_b128 and user enables generation
476   /// of ds_read/write_b128.
useDS128()477   bool useDS128() const {
478     return CIInsts && EnableDS128;
479   }
480 
481   /// \return If target supports ds_read/write_b96/128.
hasDS96AndDS128()482   bool hasDS96AndDS128() const {
483     return CIInsts;
484   }
485 
486   /// Have v_trunc_f64, v_ceil_f64, v_rndne_f64
haveRoundOpsF64()487   bool haveRoundOpsF64() const {
488     return CIInsts;
489   }
490 
491   /// \returns If MUBUF instructions always perform range checking, even for
492   /// buffer resources used for private memory access.
privateMemoryResourceIsRangeChecked()493   bool privateMemoryResourceIsRangeChecked() const {
494     return getGeneration() < AMDGPUSubtarget::GFX9;
495   }
496 
  /// \returns If target requires PRT Strict NULL support (zero result registers
  /// for sparse texture support).
usePRTStrictNull()499   bool usePRTStrictNull() const {
500     return EnablePRTStrictNull;
501   }
502 
hasAutoWaitcntBeforeBarrier()503   bool hasAutoWaitcntBeforeBarrier() const {
504     return AutoWaitcntBeforeBarrier;
505   }
506 
hasUnalignedBufferAccess()507   bool hasUnalignedBufferAccess() const {
508     return UnalignedBufferAccess;
509   }
510 
hasUnalignedBufferAccessEnabled()511   bool hasUnalignedBufferAccessEnabled() const {
512     return UnalignedBufferAccess && UnalignedAccessMode;
513   }
514 
hasUnalignedDSAccess()515   bool hasUnalignedDSAccess() const {
516     return UnalignedDSAccess;
517   }
518 
hasUnalignedDSAccessEnabled()519   bool hasUnalignedDSAccessEnabled() const {
520     return UnalignedDSAccess && UnalignedAccessMode;
521   }
522 
hasUnalignedScratchAccess()523   bool hasUnalignedScratchAccess() const {
524     return UnalignedScratchAccess;
525   }
526 
hasUnalignedAccessMode()527   bool hasUnalignedAccessMode() const {
528     return UnalignedAccessMode;
529   }
530 
hasApertureRegs()531   bool hasApertureRegs() const {
532     return HasApertureRegs;
533   }
534 
isTrapHandlerEnabled()535   bool isTrapHandlerEnabled() const {
536     return TrapHandler;
537   }
538 
isXNACKEnabled()539   bool isXNACKEnabled() const {
540     return TargetID.isXnackOnOrAny();
541   }
542 
isTgSplitEnabled()543   bool isTgSplitEnabled() const {
544     return EnableTgSplit;
545   }
546 
isCuModeEnabled()547   bool isCuModeEnabled() const {
548     return EnableCuMode;
549   }
550 
hasFlatAddressSpace()551   bool hasFlatAddressSpace() const {
552     return FlatAddressSpace;
553   }
554 
hasFlatScrRegister()555   bool hasFlatScrRegister() const {
556     return hasFlatAddressSpace();
557   }
558 
hasFlatInstOffsets()559   bool hasFlatInstOffsets() const {
560     return FlatInstOffsets;
561   }
562 
hasFlatGlobalInsts()563   bool hasFlatGlobalInsts() const {
564     return FlatGlobalInsts;
565   }
566 
hasFlatScratchInsts()567   bool hasFlatScratchInsts() const {
568     return FlatScratchInsts;
569   }
570 
571   // Check if target supports ST addressing mode with FLAT scratch instructions.
572   // The ST addressing mode means no registers are used, either VGPR or SGPR,
573   // but only immediate offset is swizzled and added to the FLAT scratch base.
hasFlatScratchSTMode()574   bool hasFlatScratchSTMode() const {
575     return hasFlatScratchInsts() && hasGFX10_3Insts();
576   }
577 
hasScalarFlatScratchInsts()578   bool hasScalarFlatScratchInsts() const {
579     return ScalarFlatScratchInsts;
580   }
581 
hasGlobalAddTidInsts()582   bool hasGlobalAddTidInsts() const {
583     return GFX10_BEncoding;
584   }
585 
hasAtomicCSub()586   bool hasAtomicCSub() const {
587     return GFX10_BEncoding;
588   }
589 
hasMultiDwordFlatScratchAddressing()590   bool hasMultiDwordFlatScratchAddressing() const {
591     return getGeneration() >= GFX9;
592   }
593 
hasFlatSegmentOffsetBug()594   bool hasFlatSegmentOffsetBug() const {
595     return HasFlatSegmentOffsetBug;
596   }
597 
hasFlatLgkmVMemCountInOrder()598   bool hasFlatLgkmVMemCountInOrder() const {
599     return getGeneration() > GFX9;
600   }
601 
hasD16LoadStore()602   bool hasD16LoadStore() const {
603     return getGeneration() >= GFX9;
604   }
605 
d16PreservesUnusedBits()606   bool d16PreservesUnusedBits() const {
607     return hasD16LoadStore() && !TargetID.isSramEccOnOrAny();
608   }
609 
hasD16Images()610   bool hasD16Images() const {
611     return getGeneration() >= VOLCANIC_ISLANDS;
612   }
613 
  /// Return if most LDS instructions have an m0 use that requires m0 to be
  /// initialized.
ldsRequiresM0Init()616   bool ldsRequiresM0Init() const {
617     return getGeneration() < GFX9;
618   }
619 
620   // True if the hardware rewinds and replays GWS operations if a wave is
621   // preempted.
622   //
623   // If this is false, a GWS operation requires testing if a nack set the
624   // MEM_VIOL bit, and repeating if so.
hasGWSAutoReplay()625   bool hasGWSAutoReplay() const {
626     return getGeneration() >= GFX9;
627   }
628 
629   /// \returns if target has ds_gws_sema_release_all instruction.
hasGWSSemaReleaseAll()630   bool hasGWSSemaReleaseAll() const {
631     return CIInsts;
632   }
633 
634   /// \returns true if the target has integer add/sub instructions that do not
635   /// produce a carry-out. This includes v_add_[iu]32, v_sub_[iu]32,
636   /// v_add_[iu]16, and v_sub_[iu]16, all of which support the clamp modifier
637   /// for saturation.
hasAddNoCarry()638   bool hasAddNoCarry() const {
639     return AddNoCarryInsts;
640   }
641 
hasUnpackedD16VMem()642   bool hasUnpackedD16VMem() const {
643     return HasUnpackedD16VMem;
644   }
645 
646   // Covers VS/PS/CS graphics shaders
isMesaGfxShader(const Function & F)647   bool isMesaGfxShader(const Function &F) const {
648     return isMesa3DOS() && AMDGPU::isShader(F.getCallingConv());
649   }
650 
hasMad64_32()651   bool hasMad64_32() const {
652     return getGeneration() >= SEA_ISLANDS;
653   }
654 
hasSDWAOmod()655   bool hasSDWAOmod() const {
656     return HasSDWAOmod;
657   }
658 
hasSDWAScalar()659   bool hasSDWAScalar() const {
660     return HasSDWAScalar;
661   }
662 
hasSDWASdst()663   bool hasSDWASdst() const {
664     return HasSDWASdst;
665   }
666 
hasSDWAMac()667   bool hasSDWAMac() const {
668     return HasSDWAMac;
669   }
670 
hasSDWAOutModsVOPC()671   bool hasSDWAOutModsVOPC() const {
672     return HasSDWAOutModsVOPC;
673   }
674 
hasDLInsts()675   bool hasDLInsts() const {
676     return HasDLInsts;
677   }
678 
hasDot1Insts()679   bool hasDot1Insts() const {
680     return HasDot1Insts;
681   }
682 
hasDot2Insts()683   bool hasDot2Insts() const {
684     return HasDot2Insts;
685   }
686 
hasDot3Insts()687   bool hasDot3Insts() const {
688     return HasDot3Insts;
689   }
690 
hasDot4Insts()691   bool hasDot4Insts() const {
692     return HasDot4Insts;
693   }
694 
hasDot5Insts()695   bool hasDot5Insts() const {
696     return HasDot5Insts;
697   }
698 
hasDot6Insts()699   bool hasDot6Insts() const {
700     return HasDot6Insts;
701   }
702 
hasDot7Insts()703   bool hasDot7Insts() const {
704     return HasDot7Insts;
705   }
706 
hasMAIInsts()707   bool hasMAIInsts() const {
708     return HasMAIInsts;
709   }
710 
hasPkFmacF16Inst()711   bool hasPkFmacF16Inst() const {
712     return HasPkFmacF16Inst;
713   }
714 
hasAtomicFaddInsts()715   bool hasAtomicFaddInsts() const {
716     return HasAtomicFaddInsts;
717   }
718 
hasNoSdstCMPX()719   bool hasNoSdstCMPX() const {
720     return HasNoSdstCMPX;
721   }
722 
hasVscnt()723   bool hasVscnt() const {
724     return HasVscnt;
725   }
726 
hasGetWaveIdInst()727   bool hasGetWaveIdInst() const {
728     return HasGetWaveIdInst;
729   }
730 
hasSMemTimeInst()731   bool hasSMemTimeInst() const {
732     return HasSMemTimeInst;
733   }
734 
hasShaderCyclesRegister()735   bool hasShaderCyclesRegister() const {
736     return HasShaderCyclesRegister;
737   }
738 
hasRegisterBanking()739   bool hasRegisterBanking() const {
740     return HasRegisterBanking;
741   }
742 
hasVOP3Literal()743   bool hasVOP3Literal() const {
744     return HasVOP3Literal;
745   }
746 
hasNoDataDepHazard()747   bool hasNoDataDepHazard() const {
748     return HasNoDataDepHazard;
749   }
750 
vmemWriteNeedsExpWaitcnt()751   bool vmemWriteNeedsExpWaitcnt() const {
752     return getGeneration() < SEA_ISLANDS;
753   }
754 
755   // Scratch is allocated in 256 dword per wave blocks for the entire
  // wavefront. When viewed from the perspective of an arbitrary workitem, this
757   // is 4-byte aligned.
758   //
759   // Only 4-byte alignment is really needed to access anything. Transformations
760   // on the pointer value itself may rely on the alignment / known low bits of
761   // the pointer. Set this to something above the minimum to avoid needing
762   // dynamic realignment in common cases.
getStackAlignment()763   Align getStackAlignment() const { return Align(16); }
764 
enableMachineScheduler()765   bool enableMachineScheduler() const override {
766     return true;
767   }
768 
769   bool useAA() const override;
770 
enableSubRegLiveness()771   bool enableSubRegLiveness() const override {
772     return true;
773   }
774 
setScalarizeGlobalBehavior(bool b)775   void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b; }
getScalarizeGlobalBehavior()776   bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal; }
777 
778   // static wrappers
779   static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI);
780 
781   // XXX - Why is this here if it isn't in the default pass set?
enableEarlyIfConversion()782   bool enableEarlyIfConversion() const override {
783     return true;
784   }
785 
786   bool enableFlatScratch() const;
787 
788   void overrideSchedPolicy(MachineSchedPolicy &Policy,
789                            unsigned NumRegionInstrs) const override;
790 
getMaxNumUserSGPRs()791   unsigned getMaxNumUserSGPRs() const {
792     return 16;
793   }
794 
hasSMemRealTime()795   bool hasSMemRealTime() const {
796     return HasSMemRealTime;
797   }
798 
hasMovrel()799   bool hasMovrel() const {
800     return HasMovrel;
801   }
802 
hasVGPRIndexMode()803   bool hasVGPRIndexMode() const {
804     return HasVGPRIndexMode;
805   }
806 
807   bool useVGPRIndexMode() const;
808 
hasScalarCompareEq64()809   bool hasScalarCompareEq64() const {
810     return getGeneration() >= VOLCANIC_ISLANDS;
811   }
812 
hasScalarStores()813   bool hasScalarStores() const {
814     return HasScalarStores;
815   }
816 
hasScalarAtomics()817   bool hasScalarAtomics() const {
818     return HasScalarAtomics;
819   }
820 
hasLDSFPAtomics()821   bool hasLDSFPAtomics() const {
822     return GFX8Insts;
823   }
824 
825   /// \returns true if the subtarget has the v_permlanex16_b32 instruction.
hasPermLaneX16()826   bool hasPermLaneX16() const { return getGeneration() >= GFX10; }
827 
hasDPP()828   bool hasDPP() const {
829     return HasDPP;
830   }
831 
hasDPPBroadcasts()832   bool hasDPPBroadcasts() const {
833     return HasDPP && getGeneration() < GFX10;
834   }
835 
hasDPPWavefrontShifts()836   bool hasDPPWavefrontShifts() const {
837     return HasDPP && getGeneration() < GFX10;
838   }
839 
hasDPP8()840   bool hasDPP8() const {
841     return HasDPP8;
842   }
843 
has64BitDPP()844   bool has64BitDPP() const {
845     return Has64BitDPP;
846   }
847 
hasPackedFP32Ops()848   bool hasPackedFP32Ops() const {
849     return HasPackedFP32Ops;
850   }
851 
hasFmaakFmamkF32Insts()852   bool hasFmaakFmamkF32Insts() const {
853     return getGeneration() >= GFX10;
854   }
855 
hasExtendedImageInsts()856   bool hasExtendedImageInsts() const {
857     return HasExtendedImageInsts;
858   }
859 
hasR128A16()860   bool hasR128A16() const {
861     return HasR128A16;
862   }
863 
hasGFX10A16()864   bool hasGFX10A16() const {
865     return HasGFX10A16;
866   }
867 
hasA16()868   bool hasA16() const { return hasR128A16() || hasGFX10A16(); }
869 
hasG16()870   bool hasG16() const { return HasG16; }
871 
hasOffset3fBug()872   bool hasOffset3fBug() const {
873     return HasOffset3fBug;
874   }
875 
hasImageStoreD16Bug()876   bool hasImageStoreD16Bug() const { return HasImageStoreD16Bug; }
877 
hasImageGather4D16Bug()878   bool hasImageGather4D16Bug() const { return HasImageGather4D16Bug; }
879 
hasNSAEncoding()880   bool hasNSAEncoding() const { return HasNSAEncoding; }
881 
getNSAMaxSize()882   unsigned getNSAMaxSize() const { return NSAMaxSize; }
883 
hasGFX10_AEncoding()884   bool hasGFX10_AEncoding() const {
885     return GFX10_AEncoding;
886   }
887 
hasGFX10_BEncoding()888   bool hasGFX10_BEncoding() const {
889     return GFX10_BEncoding;
890   }
891 
hasGFX10_3Insts()892   bool hasGFX10_3Insts() const {
893     return GFX10_3Insts;
894   }
895 
896   bool hasMadF16() const;
897 
enableSIScheduler()898   bool enableSIScheduler() const {
899     return EnableSIScheduler;
900   }
901 
loadStoreOptEnabled()902   bool loadStoreOptEnabled() const {
903     return EnableLoadStoreOpt;
904   }
905 
hasSGPRInitBug()906   bool hasSGPRInitBug() const {
907     return SGPRInitBug;
908   }
909 
hasNegativeScratchOffsetBug()910   bool hasNegativeScratchOffsetBug() const { return NegativeScratchOffsetBug; }
911 
hasNegativeUnalignedScratchOffsetBug()912   bool hasNegativeUnalignedScratchOffsetBug() const {
913     return NegativeUnalignedScratchOffsetBug;
914   }
915 
hasMFMAInlineLiteralBug()916   bool hasMFMAInlineLiteralBug() const {
917     return HasMFMAInlineLiteralBug;
918   }
919 
has12DWordStoreHazard()920   bool has12DWordStoreHazard() const {
921     return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS;
922   }
923 
  /// \returns true if the subtarget supports DWORDX3 load/store instructions.
hasDwordx3LoadStores()925   bool hasDwordx3LoadStores() const {
926     return CIInsts;
927   }
928 
hasReadM0MovRelInterpHazard()929   bool hasReadM0MovRelInterpHazard() const {
930     return getGeneration() == AMDGPUSubtarget::GFX9;
931   }
932 
hasReadM0SendMsgHazard()933   bool hasReadM0SendMsgHazard() const {
934     return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
935            getGeneration() <= AMDGPUSubtarget::GFX9;
936   }
937 
hasVcmpxPermlaneHazard()938   bool hasVcmpxPermlaneHazard() const {
939     return HasVcmpxPermlaneHazard;
940   }
941 
hasVMEMtoScalarWriteHazard()942   bool hasVMEMtoScalarWriteHazard() const {
943     return HasVMEMtoScalarWriteHazard;
944   }
945 
hasSMEMtoVectorWriteHazard()946   bool hasSMEMtoVectorWriteHazard() const {
947     return HasSMEMtoVectorWriteHazard;
948   }
949 
hasLDSMisalignedBug()950   bool hasLDSMisalignedBug() const {
951     return LDSMisalignedBug && !EnableCuMode;
952   }
953 
hasInstFwdPrefetchBug()954   bool hasInstFwdPrefetchBug() const {
955     return HasInstFwdPrefetchBug;
956   }
957 
hasVcmpxExecWARHazard()958   bool hasVcmpxExecWARHazard() const {
959     return HasVcmpxExecWARHazard;
960   }
961 
hasLdsBranchVmemWARHazard()962   bool hasLdsBranchVmemWARHazard() const {
963     return HasLdsBranchVmemWARHazard;
964   }
965 
hasNSAtoVMEMBug()966   bool hasNSAtoVMEMBug() const {
967     return HasNSAtoVMEMBug;
968   }
969 
hasNSAClauseBug()970   bool hasNSAClauseBug() const { return HasNSAClauseBug; }
971 
hasHardClauses()972   bool hasHardClauses() const { return getGeneration() >= GFX10; }
973 
hasGFX90AInsts()974   bool hasGFX90AInsts() const { return GFX90AInsts; }
975 
976   /// Return if operations acting on VGPR tuples require even alignment.
needsAlignedVGPRs()977   bool needsAlignedVGPRs() const { return GFX90AInsts; }
978 
hasPackedTID()979   bool hasPackedTID() const { return HasPackedTID; }
980 
981   /// Return the maximum number of waves per SIMD for kernels using \p SGPRs
982   /// SGPRs
983   unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
984 
985   /// Return the maximum number of waves per SIMD for kernels using \p VGPRs
986   /// VGPRs
987   unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const;
988 
  /// Return occupancy for the given function. Uses the LDS size and the
  /// register counts if provided.
991   /// Note, occupancy can be affected by the scratch allocation as well, but
992   /// we do not have enough information to compute it.
993   unsigned computeOccupancy(const Function &F, unsigned LDSSize = 0,
994                             unsigned NumSGPRs = 0, unsigned NumVGPRs = 0) const;
995 
996   /// \returns true if the flat_scratch register should be initialized with the
997   /// pointer to the wave's scratch memory rather than a size and offset.
flatScratchIsPointer()998   bool flatScratchIsPointer() const {
999     return getGeneration() >= AMDGPUSubtarget::GFX9;
1000   }
1001 
1002   /// \returns true if the flat_scratch register is initialized by the HW.
1003   /// In this case it is readonly.
flatScratchIsArchitected()1004   bool flatScratchIsArchitected() const { return HasArchitectedFlatScratch; }
1005 
1006   /// \returns true if the machine has merged shaders in which s0-s7 are
1007   /// reserved by the hardware and user SGPRs start at s8
hasMergedShaders()1008   bool hasMergedShaders() const {
1009     return getGeneration() >= GFX9;
1010   }
1011 
1012   /// \returns SGPR allocation granularity supported by the subtarget.
getSGPRAllocGranule()1013   unsigned getSGPRAllocGranule() const {
1014     return AMDGPU::IsaInfo::getSGPRAllocGranule(this);
1015   }
1016 
1017   /// \returns SGPR encoding granularity supported by the subtarget.
getSGPREncodingGranule()1018   unsigned getSGPREncodingGranule() const {
1019     return AMDGPU::IsaInfo::getSGPREncodingGranule(this);
1020   }
1021 
1022   /// \returns Total number of SGPRs supported by the subtarget.
getTotalNumSGPRs()1023   unsigned getTotalNumSGPRs() const {
1024     return AMDGPU::IsaInfo::getTotalNumSGPRs(this);
1025   }
1026 
1027   /// \returns Addressable number of SGPRs supported by the subtarget.
getAddressableNumSGPRs()1028   unsigned getAddressableNumSGPRs() const {
1029     return AMDGPU::IsaInfo::getAddressableNumSGPRs(this);
1030   }
1031 
1032   /// \returns Minimum number of SGPRs that meets the given number of waves per
1033   /// execution unit requirement supported by the subtarget.
getMinNumSGPRs(unsigned WavesPerEU)1034   unsigned getMinNumSGPRs(unsigned WavesPerEU) const {
1035     return AMDGPU::IsaInfo::getMinNumSGPRs(this, WavesPerEU);
1036   }
1037 
1038   /// \returns Maximum number of SGPRs that meets the given number of waves per
1039   /// execution unit requirement supported by the subtarget.
getMaxNumSGPRs(unsigned WavesPerEU,bool Addressable)1040   unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const {
1041     return AMDGPU::IsaInfo::getMaxNumSGPRs(this, WavesPerEU, Addressable);
1042   }
1043 
  /// \returns Reserved number of SGPRs. This is the common utility function
  /// called by the MachineFunction and Function variants of
  /// getReservedNumSGPRs.
  ///
  /// \param HasFlatScratchInit Flag supplied by the calling variant
  /// (NOTE(review): presumably whether flat scratch initialization is in use
  /// for the function -- confirm against the definition).
  unsigned getBaseReservedNumSGPRs(const bool HasFlatScratchInit) const;
  /// \returns Reserved number of SGPRs for given machine function \p MF.
  ///
  /// MachineFunction overload; a Function overload is declared alongside it.
  unsigned getReservedNumSGPRs(const MachineFunction &MF) const;
1050 
  /// \returns Reserved number of SGPRs for given function \p F.
  ///
  /// Function overload; a MachineFunction overload is declared alongside it.
  unsigned getReservedNumSGPRs(const Function &F) const;
1053 
  /// \returns max num SGPRs. This is the common utility function called by the
  /// MachineFunction and Function variants of getMaxNumSGPRs.
  ///
  /// \param F Function the query is made for.
  /// \param WavesPerEU Pair of waves-per-execution-unit values
  /// (NOTE(review): presumably {minimum, maximum} -- confirm at the
  /// definition).
  /// \param PreloadedSGPRs SGPR count supplied by the calling variant
  /// (NOTE(review): presumably the number of preloaded SGPRs -- confirm).
  /// \param ReservedNumSGPRs Reserved SGPR count supplied by the calling
  /// variant.
  unsigned getBaseMaxNumSGPRs(const Function &F,
                              std::pair<unsigned, unsigned> WavesPerEU,
                              unsigned PreloadedSGPRs,
                              unsigned ReservedNumSGPRs) const;
1061 
  /// \returns Maximum number of SGPRs that meets number of waves per execution
  /// unit requirement for function \p MF, or number of SGPRs explicitly
  /// requested using "amdgpu-num-sgpr" attribute attached to function \p MF.
  ///
  /// If the explicitly requested value cannot be converted to integer,
  /// violates subtarget's specifications, or does not meet number of waves per
  /// execution unit requirement, the value that meets the number of waves per
  /// execution unit requirement is returned instead.
  unsigned getMaxNumSGPRs(const MachineFunction &MF) const;
1071 
  /// \returns Maximum number of SGPRs that meets number of waves per execution
  /// unit requirement for function \p F, or number of SGPRs explicitly
  /// requested using "amdgpu-num-sgpr" attribute attached to function \p F.
  ///
  /// If the explicitly requested value cannot be converted to integer,
  /// violates subtarget's specifications, or does not meet number of waves per
  /// execution unit requirement, the value that meets the number of waves per
  /// execution unit requirement is returned instead.
  unsigned getMaxNumSGPRs(const Function &F) const;
1081 
1082   /// \returns VGPR allocation granularity supported by the subtarget.
getVGPRAllocGranule()1083   unsigned getVGPRAllocGranule() const {
1084     return AMDGPU::IsaInfo::getVGPRAllocGranule(this);
1085   }
1086 
1087   /// \returns VGPR encoding granularity supported by the subtarget.
getVGPREncodingGranule()1088   unsigned getVGPREncodingGranule() const {
1089     return AMDGPU::IsaInfo::getVGPREncodingGranule(this);
1090   }
1091 
1092   /// \returns Total number of VGPRs supported by the subtarget.
getTotalNumVGPRs()1093   unsigned getTotalNumVGPRs() const {
1094     return AMDGPU::IsaInfo::getTotalNumVGPRs(this);
1095   }
1096 
1097   /// \returns Addressable number of VGPRs supported by the subtarget.
getAddressableNumVGPRs()1098   unsigned getAddressableNumVGPRs() const {
1099     return AMDGPU::IsaInfo::getAddressableNumVGPRs(this);
1100   }
1101 
1102   /// \returns Minimum number of VGPRs that meets given number of waves per
1103   /// execution unit requirement supported by the subtarget.
getMinNumVGPRs(unsigned WavesPerEU)1104   unsigned getMinNumVGPRs(unsigned WavesPerEU) const {
1105     return AMDGPU::IsaInfo::getMinNumVGPRs(this, WavesPerEU);
1106   }
1107 
1108   /// \returns Maximum number of VGPRs that meets given number of waves per
1109   /// execution unit requirement supported by the subtarget.
getMaxNumVGPRs(unsigned WavesPerEU)1110   unsigned getMaxNumVGPRs(unsigned WavesPerEU) const {
1111     return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU);
1112   }
1113 
  /// \returns max num VGPRs. This is the common utility function
  /// called by MachineFunction and Function variants of getMaxNumVGPRs.
  ///
  /// \param F Function the query is made for.
  /// \param WavesPerEU Pair of waves-per-execution-unit values
  /// (NOTE(review): presumably {minimum, maximum} -- confirm at the
  /// definition).
  unsigned getBaseMaxNumVGPRs(const Function &F,
                              std::pair<unsigned, unsigned> WavesPerEU) const;
  /// \returns Maximum number of VGPRs that meets number of waves per execution
  /// unit requirement for function \p F, or number of VGPRs explicitly
  /// requested using "amdgpu-num-vgpr" attribute attached to function \p F.
  ///
  /// If the explicitly requested value cannot be converted to integer,
  /// violates subtarget's specifications, or does not meet number of waves per
  /// execution unit requirement, the value that meets the number of waves per
  /// execution unit requirement is returned instead.
  unsigned getMaxNumVGPRs(const Function &F) const;
1127 
  /// \returns Maximum number of VGPRs that meets number of waves per execution
  /// unit requirement for function \p MF, or number of VGPRs explicitly
  /// requested using "amdgpu-num-vgpr" attribute attached to function \p MF.
  ///
  /// If the explicitly requested value cannot be converted to integer,
  /// violates subtarget's specifications, or does not meet number of waves per
  /// execution unit requirement, the value that meets the number of waves per
  /// execution unit requirement is returned instead.
  unsigned getMaxNumVGPRs(const MachineFunction &MF) const;
1137 
  /// Override taking the list of scheduling DAG mutations \p Mutations by
  /// non-const reference. Declaration only; the body is not defined inline
  /// here.
  /// NOTE(review): presumably appends the subtarget's post-RA scheduler
  /// mutations to \p Mutations -- confirm against the definition.
  void getPostRAMutations(
      std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations)
      const override;
1141 
isWave32()1142   bool isWave32() const {
1143     return getWavefrontSize() == 32;
1144   }
1145 
isWave64()1146   bool isWave64() const {
1147     return getWavefrontSize() == 64;
1148   }
1149 
getBoolRC()1150   const TargetRegisterClass *getBoolRC() const {
1151     return getRegisterInfo()->getBoolRC();
1152   }
1153 
1154   /// \returns Maximum number of work groups per compute unit supported by the
1155   /// subtarget and limited by given \p FlatWorkGroupSize.
getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize)1156   unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override {
1157     return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(this, FlatWorkGroupSize);
1158   }
1159 
1160   /// \returns Minimum flat work group size supported by the subtarget.
getMinFlatWorkGroupSize()1161   unsigned getMinFlatWorkGroupSize() const override {
1162     return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(this);
1163   }
1164 
1165   /// \returns Maximum flat work group size supported by the subtarget.
getMaxFlatWorkGroupSize()1166   unsigned getMaxFlatWorkGroupSize() const override {
1167     return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this);
1168   }
1169 
1170   /// \returns Number of waves per execution unit required to support the given
1171   /// \p FlatWorkGroupSize.
1172   unsigned
getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize)1173   getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const override {
1174     return AMDGPU::IsaInfo::getWavesPerEUForWorkGroup(this, FlatWorkGroupSize);
1175   }
1176 
1177   /// \returns Minimum number of waves per execution unit supported by the
1178   /// subtarget.
getMinWavesPerEU()1179   unsigned getMinWavesPerEU() const override {
1180     return AMDGPU::IsaInfo::getMinWavesPerEU(this);
1181   }
1182 
  /// Override taking the scheduling units \p Def and \p Use, their operand
  /// indices \p DefOpIdx and \p UseOpIdx, and the dependency \p Dep by
  /// non-const reference. Declaration only; the body is not defined inline
  /// here.
  /// NOTE(review): presumably updates \p Dep for the Def -> Use edge --
  /// confirm against the definition.
  void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx,
                             SDep &Dep) const override;
1185 };
1186 
1187 } // end namespace llvm
1188 
1189 #endif // LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
1190