1 //=====-- AMDGPUSubtarget.h - Define Subtarget for AMDGPU ------*- C++ -*-====//
2 //
3 //                     The LLVM Compiler Infrastructure
4 //
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
7 //
8 //==-----------------------------------------------------------------------===//
9 //
10 /// \file
11 /// AMDGPU specific subclass of TargetSubtarget.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
16 #define LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
17 
18 #include "AMDGPU.h"
19 #include "AMDGPUCallLowering.h"
20 #include "R600FrameLowering.h"
21 #include "R600ISelLowering.h"
22 #include "R600InstrInfo.h"
23 #include "SIFrameLowering.h"
24 #include "SIISelLowering.h"
25 #include "SIInstrInfo.h"
26 #include "Utils/AMDGPUBaseInfo.h"
27 #include "llvm/ADT/Triple.h"
28 #include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
29 #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
30 #include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
31 #include "llvm/CodeGen/MachineFunction.h"
32 #include "llvm/CodeGen/SelectionDAGTargetInfo.h"
33 #include "llvm/MC/MCInstrItineraries.h"
34 #include "llvm/Support/MathExtras.h"
35 #include <cassert>
36 #include <cstdint>
37 #include <memory>
38 #include <utility>
39 
40 #define GET_SUBTARGETINFO_HEADER
41 #include "AMDGPUGenSubtargetInfo.inc"
42 #define GET_SUBTARGETINFO_HEADER
43 #include "R600GenSubtargetInfo.inc"
44 
45 namespace llvm {
46 
47 class StringRef;
48 
49 class AMDGPUSubtarget {
50 public:
51   enum Generation {
52     R600 = 0,
53     R700 = 1,
54     EVERGREEN = 2,
55     NORTHERN_ISLANDS = 3,
56     SOUTHERN_ISLANDS = 4,
57     SEA_ISLANDS = 5,
58     VOLCANIC_ISLANDS = 6,
59     GFX9 = 7
60   };
61 
62 private:
63   Triple TargetTriple;
64 
65 protected:
66   bool Has16BitInsts;
67   bool HasMadMixInsts;
68   bool FP32Denormals;
69   bool FPExceptions;
70   bool HasSDWA;
71   bool HasVOP3PInsts;
72   bool HasMulI24;
73   bool HasMulU24;
74   bool HasInv2PiInlineImm;
75   bool HasFminFmaxLegacy;
76   bool EnablePromoteAlloca;
77   bool HasTrigReducedRange;
78   int LocalMemorySize;
79   unsigned WavefrontSize;
80 
81 public:
82   AMDGPUSubtarget(const Triple &TT);
83 
84   static const AMDGPUSubtarget &get(const MachineFunction &MF);
85   static const AMDGPUSubtarget &get(const TargetMachine &TM,
86                                     const Function &F);
87 
88   /// \returns Default range flat work group size for a calling convention.
89   std::pair<unsigned, unsigned> getDefaultFlatWorkGroupSize(CallingConv::ID CC) const;
90 
91   /// \returns Subtarget's default pair of minimum/maximum flat work group sizes
92   /// for function \p F, or minimum/maximum flat work group sizes explicitly
93   /// requested using "amdgpu-flat-work-group-size" attribute attached to
94   /// function \p F.
95   ///
96   /// \returns Subtarget's default values if explicitly requested values cannot
97   /// be converted to integer, or violate subtarget's specifications.
98   std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) const;
99 
100   /// \returns Subtarget's default pair of minimum/maximum number of waves per
101   /// execution unit for function \p F, or minimum/maximum number of waves per
102   /// execution unit explicitly requested using "amdgpu-waves-per-eu" attribute
103   /// attached to function \p F.
104   ///
105   /// \returns Subtarget's default values if explicitly requested values cannot
106   /// be converted to integer, violate subtarget's specifications, or are not
107   /// compatible with minimum/maximum number of waves limited by flat work group
108   /// size, register usage, and/or lds usage.
109   std::pair<unsigned, unsigned> getWavesPerEU(const Function &F) const;
110 
111   /// Return the amount of LDS that can be used that will not restrict the
112   /// occupancy lower than WaveCount.
113   unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
114                                            const Function &) const;
115 
116   /// Inverse of getMaxLocalMemWithWaveCount. Return the maximum wavecount if
117   /// the given LDS memory size is the only constraint.
118   unsigned getOccupancyWithLocalMemSize(uint32_t Bytes, const Function &) const;
119 
120   unsigned getOccupancyWithLocalMemSize(const MachineFunction &MF) const;
121 
isAmdHsaOS()122   bool isAmdHsaOS() const {
123     return TargetTriple.getOS() == Triple::AMDHSA;
124   }
125 
isAmdPalOS()126   bool isAmdPalOS() const {
127     return TargetTriple.getOS() == Triple::AMDPAL;
128   }
129 
isMesa3DOS()130   bool isMesa3DOS() const {
131     return TargetTriple.getOS() == Triple::Mesa3D;
132   }
133 
isMesaKernel(const Function & F)134   bool isMesaKernel(const Function &F) const {
135     return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
136   }
137 
isAmdHsaOrMesa(const Function & F)138   bool isAmdHsaOrMesa(const Function &F) const {
139     return isAmdHsaOS() || isMesaKernel(F);
140   }
141 
has16BitInsts()142   bool has16BitInsts() const {
143     return Has16BitInsts;
144   }
145 
hasMadMixInsts()146   bool hasMadMixInsts() const {
147     return HasMadMixInsts;
148   }
149 
hasFP32Denormals()150   bool hasFP32Denormals() const {
151     return FP32Denormals;
152   }
153 
hasFPExceptions()154   bool hasFPExceptions() const {
155     return FPExceptions;
156   }
157 
hasSDWA()158   bool hasSDWA() const {
159     return HasSDWA;
160   }
161 
hasVOP3PInsts()162   bool hasVOP3PInsts() const {
163     return HasVOP3PInsts;
164   }
165 
hasMulI24()166   bool hasMulI24() const {
167     return HasMulI24;
168   }
169 
hasMulU24()170   bool hasMulU24() const {
171     return HasMulU24;
172   }
173 
hasInv2PiInlineImm()174   bool hasInv2PiInlineImm() const {
175     return HasInv2PiInlineImm;
176   }
177 
hasFminFmaxLegacy()178   bool hasFminFmaxLegacy() const {
179     return HasFminFmaxLegacy;
180   }
181 
hasTrigReducedRange()182   bool hasTrigReducedRange() const {
183     return HasTrigReducedRange;
184   }
185 
isPromoteAllocaEnabled()186   bool isPromoteAllocaEnabled() const {
187     return EnablePromoteAlloca;
188   }
189 
getWavefrontSize()190   unsigned getWavefrontSize() const {
191     return WavefrontSize;
192   }
193 
getLocalMemorySize()194   int getLocalMemorySize() const {
195     return LocalMemorySize;
196   }
197 
getAlignmentForImplicitArgPtr()198   unsigned getAlignmentForImplicitArgPtr() const {
199     return isAmdHsaOS() ? 8 : 4;
200   }
201 
202   /// Returns the offset in bytes from the start of the input buffer
203   ///        of the first explicit kernel argument.
getExplicitKernelArgOffset(const Function & F)204   unsigned getExplicitKernelArgOffset(const Function &F) const {
205     return isAmdHsaOrMesa(F) ? 0 : 36;
206   }
207 
208   /// \returns Maximum number of work groups per compute unit supported by the
209   /// subtarget and limited by given \p FlatWorkGroupSize.
210   virtual unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const = 0;
211 
212   /// \returns Minimum flat work group size supported by the subtarget.
213   virtual unsigned getMinFlatWorkGroupSize() const = 0;
214 
215   /// \returns Maximum flat work group size supported by the subtarget.
216   virtual unsigned getMaxFlatWorkGroupSize() const = 0;
217 
218   /// \returns Maximum number of waves per execution unit supported by the
219   /// subtarget and limited by given \p FlatWorkGroupSize.
220   virtual unsigned getMaxWavesPerEU(unsigned FlatWorkGroupSize) const  = 0;
221 
222   /// \returns Minimum number of waves per execution unit supported by the
223   /// subtarget.
224   virtual unsigned getMinWavesPerEU() const = 0;
225 
getMaxWavesPerEU()226   unsigned getMaxWavesPerEU() const { return 10; }
227 
228   /// Creates value range metadata on an workitemid.* inrinsic call or load.
229   bool makeLIDRangeMetadata(Instruction *I) const;
230 
231   /// \returns Number of bytes of arguments that are passed to a shader or
232   /// kernel in addition to the explicit ones declared for the function.
getImplicitArgNumBytes(const Function & F)233   unsigned getImplicitArgNumBytes(const Function &F) const {
234     if (isMesaKernel(F))
235       return 16;
236     return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 0);
237   }
238   uint64_t getExplicitKernArgSize(const Function &F,
239                                   unsigned &MaxAlign) const;
240   unsigned getKernArgSegmentSize(const Function &F,
241                                  unsigned &MaxAlign) const;
242 
~AMDGPUSubtarget()243   virtual ~AMDGPUSubtarget() {}
244 };
245 
246 class GCNSubtarget : public AMDGPUGenSubtargetInfo,
247                      public AMDGPUSubtarget {
248 public:
249   enum {
250     ISAVersion0_0_0,
251     ISAVersion6_0_0,
252     ISAVersion6_0_1,
253     ISAVersion7_0_0,
254     ISAVersion7_0_1,
255     ISAVersion7_0_2,
256     ISAVersion7_0_3,
257     ISAVersion7_0_4,
258     ISAVersion8_0_1,
259     ISAVersion8_0_2,
260     ISAVersion8_0_3,
261     ISAVersion8_1_0,
262     ISAVersion9_0_0,
263     ISAVersion9_0_2,
264     ISAVersion9_0_4,
265     ISAVersion9_0_6,
266     ISAVersion9_0_9,
267   };
268 
269   enum TrapHandlerAbi {
270     TrapHandlerAbiNone = 0,
271     TrapHandlerAbiHsa = 1
272   };
273 
274   enum TrapID {
275     TrapIDHardwareReserved = 0,
276     TrapIDHSADebugTrap = 1,
277     TrapIDLLVMTrap = 2,
278     TrapIDLLVMDebugTrap = 3,
279     TrapIDDebugBreakpoint = 7,
280     TrapIDDebugReserved8 = 8,
281     TrapIDDebugReservedFE = 0xfe,
282     TrapIDDebugReservedFF = 0xff
283   };
284 
285   enum TrapRegValues {
286     LLVMTrapHandlerRegValue = 1
287   };
288 
289 private:
290   /// GlobalISel related APIs.
291   std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
292   std::unique_ptr<InstructionSelector> InstSelector;
293   std::unique_ptr<LegalizerInfo> Legalizer;
294   std::unique_ptr<RegisterBankInfo> RegBankInfo;
295 
296 protected:
297   // Basic subtarget description.
298   Triple TargetTriple;
299   unsigned Gen;
300   unsigned IsaVersion;
301   InstrItineraryData InstrItins;
302   int LDSBankCount;
303   unsigned MaxPrivateElementSize;
304 
305   // Possibly statically set by tablegen, but may want to be overridden.
306   bool FastFMAF32;
307   bool HalfRate64Ops;
308 
309   // Dynamially set bits that enable features.
310   bool FP64FP16Denormals;
311   bool DX10Clamp;
312   bool FlatForGlobal;
313   bool AutoWaitcntBeforeBarrier;
314   bool CodeObjectV3;
315   bool UnalignedScratchAccess;
316   bool UnalignedBufferAccess;
317   bool HasApertureRegs;
318   bool EnableXNACK;
319   bool TrapHandler;
320   bool DebuggerInsertNops;
321   bool DebuggerEmitPrologue;
322 
323   // Used as options.
324   bool EnableHugePrivateBuffer;
325   bool EnableLoadStoreOpt;
326   bool EnableUnsafeDSOffsetFolding;
327   bool EnableSIScheduler;
328   bool EnableDS128;
329   bool EnablePRTStrictNull;
330   bool DumpCode;
331 
332   // Subtarget statically properties set by tablegen
333   bool FP64;
334   bool FMA;
335   bool MIMG_R128;
336   bool IsGCN;
337   bool GCN3Encoding;
338   bool CIInsts;
339   bool VIInsts;
340   bool GFX9Insts;
341   bool SGPRInitBug;
342   bool HasSMemRealTime;
343   bool HasIntClamp;
344   bool HasFmaMixInsts;
345   bool HasMovrel;
346   bool HasVGPRIndexMode;
347   bool HasScalarStores;
348   bool HasScalarAtomics;
349   bool HasSDWAOmod;
350   bool HasSDWAScalar;
351   bool HasSDWASdst;
352   bool HasSDWAMac;
353   bool HasSDWAOutModsVOPC;
354   bool HasDPP;
355   bool HasR128A16;
356   bool HasDLInsts;
357   bool HasDotInsts;
358   bool EnableSRAMECC;
359   bool FlatAddressSpace;
360   bool FlatInstOffsets;
361   bool FlatGlobalInsts;
362   bool FlatScratchInsts;
363   bool AddNoCarryInsts;
364   bool HasUnpackedD16VMem;
365   bool R600ALUInst;
366   bool CaymanISA;
367   bool CFALUBug;
368   bool HasVertexCache;
369   short TexVTXClauseSize;
370   bool ScalarizeGlobal;
371 
372   // Dummy feature to use for assembler in tablegen.
373   bool FeatureDisable;
374 
375   SelectionDAGTargetInfo TSInfo;
376 private:
377   SIInstrInfo InstrInfo;
378   SITargetLowering TLInfo;
379   SIFrameLowering FrameLowering;
380 
381 public:
382   GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
383                const GCNTargetMachine &TM);
384   ~GCNSubtarget() override;
385 
386   GCNSubtarget &initializeSubtargetDependencies(const Triple &TT,
387                                                    StringRef GPU, StringRef FS);
388 
getInstrInfo()389   const SIInstrInfo *getInstrInfo() const override {
390     return &InstrInfo;
391   }
392 
getFrameLowering()393   const SIFrameLowering *getFrameLowering() const override {
394     return &FrameLowering;
395   }
396 
getTargetLowering()397   const SITargetLowering *getTargetLowering() const override {
398     return &TLInfo;
399   }
400 
getRegisterInfo()401   const SIRegisterInfo *getRegisterInfo() const override {
402     return &InstrInfo.getRegisterInfo();
403   }
404 
getCallLowering()405   const CallLowering *getCallLowering() const override {
406     return CallLoweringInfo.get();
407   }
408 
getInstructionSelector()409   const InstructionSelector *getInstructionSelector() const override {
410     return InstSelector.get();
411   }
412 
getLegalizerInfo()413   const LegalizerInfo *getLegalizerInfo() const override {
414     return Legalizer.get();
415   }
416 
getRegBankInfo()417   const RegisterBankInfo *getRegBankInfo() const override {
418     return RegBankInfo.get();
419   }
420 
421   // Nothing implemented, just prevent crashes on use.
getSelectionDAGInfo()422   const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
423     return &TSInfo;
424   }
425 
getInstrItineraryData()426   const InstrItineraryData *getInstrItineraryData() const override {
427     return &InstrItins;
428   }
429 
430   void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
431 
getGeneration()432   Generation getGeneration() const {
433     return (Generation)Gen;
434   }
435 
getWavefrontSizeLog2()436   unsigned getWavefrontSizeLog2() const {
437     return Log2_32(WavefrontSize);
438   }
439 
getLDSBankCount()440   int getLDSBankCount() const {
441     return LDSBankCount;
442   }
443 
getMaxPrivateElementSize()444   unsigned getMaxPrivateElementSize() const {
445     return MaxPrivateElementSize;
446   }
447 
hasIntClamp()448   bool hasIntClamp() const {
449     return HasIntClamp;
450   }
451 
hasFP64()452   bool hasFP64() const {
453     return FP64;
454   }
455 
hasMIMG_R128()456   bool hasMIMG_R128() const {
457     return MIMG_R128;
458   }
459 
hasHWFP64()460   bool hasHWFP64() const {
461     return FP64;
462   }
463 
hasFastFMAF32()464   bool hasFastFMAF32() const {
465     return FastFMAF32;
466   }
467 
hasHalfRate64Ops()468   bool hasHalfRate64Ops() const {
469     return HalfRate64Ops;
470   }
471 
hasAddr64()472   bool hasAddr64() const {
473     return (getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS);
474   }
475 
hasBFE()476   bool hasBFE() const {
477     return true;
478   }
479 
hasBFI()480   bool hasBFI() const {
481     return true;
482   }
483 
hasBFM()484   bool hasBFM() const {
485     return hasBFE();
486   }
487 
hasBCNT(unsigned Size)488   bool hasBCNT(unsigned Size) const {
489     return true;
490   }
491 
hasFFBL()492   bool hasFFBL() const {
493     return true;
494   }
495 
hasFFBH()496   bool hasFFBH() const {
497     return true;
498   }
499 
hasMed3_16()500   bool hasMed3_16() const {
501     return getGeneration() >= AMDGPUSubtarget::GFX9;
502   }
503 
hasMin3Max3_16()504   bool hasMin3Max3_16() const {
505     return getGeneration() >= AMDGPUSubtarget::GFX9;
506   }
507 
hasFmaMixInsts()508   bool hasFmaMixInsts() const {
509     return HasFmaMixInsts;
510   }
511 
hasCARRY()512   bool hasCARRY() const {
513     return true;
514   }
515 
hasFMA()516   bool hasFMA() const {
517     return FMA;
518   }
519 
hasSwap()520   bool hasSwap() const {
521     return GFX9Insts;
522   }
523 
getTrapHandlerAbi()524   TrapHandlerAbi getTrapHandlerAbi() const {
525     return isAmdHsaOS() ? TrapHandlerAbiHsa : TrapHandlerAbiNone;
526   }
527 
enableHugePrivateBuffer()528   bool enableHugePrivateBuffer() const {
529     return EnableHugePrivateBuffer;
530   }
531 
unsafeDSOffsetFoldingEnabled()532   bool unsafeDSOffsetFoldingEnabled() const {
533     return EnableUnsafeDSOffsetFolding;
534   }
535 
dumpCode()536   bool dumpCode() const {
537     return DumpCode;
538   }
539 
540   /// Return the amount of LDS that can be used that will not restrict the
541   /// occupancy lower than WaveCount.
542   unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
543                                            const Function &) const;
544 
hasFP16Denormals()545   bool hasFP16Denormals() const {
546     return FP64FP16Denormals;
547   }
548 
hasFP64Denormals()549   bool hasFP64Denormals() const {
550     return FP64FP16Denormals;
551   }
552 
supportsMinMaxDenormModes()553   bool supportsMinMaxDenormModes() const {
554     return getGeneration() >= AMDGPUSubtarget::GFX9;
555   }
556 
enableDX10Clamp()557   bool enableDX10Clamp() const {
558     return DX10Clamp;
559   }
560 
enableIEEEBit(const MachineFunction & MF)561   bool enableIEEEBit(const MachineFunction &MF) const {
562     return AMDGPU::isCompute(MF.getFunction().getCallingConv());
563   }
564 
useFlatForGlobal()565   bool useFlatForGlobal() const {
566     return FlatForGlobal;
567   }
568 
569   /// \returns If target supports ds_read/write_b128 and user enables generation
570   /// of ds_read/write_b128.
useDS128()571   bool useDS128() const {
572     return CIInsts && EnableDS128;
573   }
574 
575   /// \returns If MUBUF instructions always perform range checking, even for
576   /// buffer resources used for private memory access.
privateMemoryResourceIsRangeChecked()577   bool privateMemoryResourceIsRangeChecked() const {
578     return getGeneration() < AMDGPUSubtarget::GFX9;
579   }
580 
581   /// \returns If target requires PRT Struct NULL support (zero result registers
582   /// for sparse texture support).
usePRTStrictNull()583   bool usePRTStrictNull() const {
584     return EnablePRTStrictNull;
585   }
586 
hasAutoWaitcntBeforeBarrier()587   bool hasAutoWaitcntBeforeBarrier() const {
588     return AutoWaitcntBeforeBarrier;
589   }
590 
hasCodeObjectV3()591   bool hasCodeObjectV3() const {
592     // FIXME: Need to add code object v3 support for mesa and pal.
593     return isAmdHsaOS() ? CodeObjectV3 : false;
594   }
595 
hasUnalignedBufferAccess()596   bool hasUnalignedBufferAccess() const {
597     return UnalignedBufferAccess;
598   }
599 
hasUnalignedScratchAccess()600   bool hasUnalignedScratchAccess() const {
601     return UnalignedScratchAccess;
602   }
603 
hasApertureRegs()604   bool hasApertureRegs() const {
605     return HasApertureRegs;
606   }
607 
isTrapHandlerEnabled()608   bool isTrapHandlerEnabled() const {
609     return TrapHandler;
610   }
611 
isXNACKEnabled()612   bool isXNACKEnabled() const {
613     return EnableXNACK;
614   }
615 
hasFlatAddressSpace()616   bool hasFlatAddressSpace() const {
617     return FlatAddressSpace;
618   }
619 
hasFlatInstOffsets()620   bool hasFlatInstOffsets() const {
621     return FlatInstOffsets;
622   }
623 
hasFlatGlobalInsts()624   bool hasFlatGlobalInsts() const {
625     return FlatGlobalInsts;
626   }
627 
hasFlatScratchInsts()628   bool hasFlatScratchInsts() const {
629     return FlatScratchInsts;
630   }
631 
hasFlatLgkmVMemCountInOrder()632   bool hasFlatLgkmVMemCountInOrder() const {
633     return getGeneration() > GFX9;
634   }
635 
hasD16LoadStore()636   bool hasD16LoadStore() const {
637     return getGeneration() >= GFX9;
638   }
639 
640   /// Return if most LDS instructions have an m0 use that require m0 to be
641   /// iniitalized.
ldsRequiresM0Init()642   bool ldsRequiresM0Init() const {
643     return getGeneration() < GFX9;
644   }
645 
hasAddNoCarry()646   bool hasAddNoCarry() const {
647     return AddNoCarryInsts;
648   }
649 
hasUnpackedD16VMem()650   bool hasUnpackedD16VMem() const {
651     return HasUnpackedD16VMem;
652   }
653 
654   // Covers VS/PS/CS graphics shaders
isMesaGfxShader(const Function & F)655   bool isMesaGfxShader(const Function &F) const {
656     return isMesa3DOS() && AMDGPU::isShader(F.getCallingConv());
657   }
658 
hasMad64_32()659   bool hasMad64_32() const {
660     return getGeneration() >= SEA_ISLANDS;
661   }
662 
hasSDWAOmod()663   bool hasSDWAOmod() const {
664     return HasSDWAOmod;
665   }
666 
hasSDWAScalar()667   bool hasSDWAScalar() const {
668     return HasSDWAScalar;
669   }
670 
hasSDWASdst()671   bool hasSDWASdst() const {
672     return HasSDWASdst;
673   }
674 
hasSDWAMac()675   bool hasSDWAMac() const {
676     return HasSDWAMac;
677   }
678 
hasSDWAOutModsVOPC()679   bool hasSDWAOutModsVOPC() const {
680     return HasSDWAOutModsVOPC;
681   }
682 
vmemWriteNeedsExpWaitcnt()683   bool vmemWriteNeedsExpWaitcnt() const {
684     return getGeneration() < SEA_ISLANDS;
685   }
686 
hasDLInsts()687   bool hasDLInsts() const {
688     return HasDLInsts;
689   }
690 
hasDotInsts()691   bool hasDotInsts() const {
692     return HasDotInsts;
693   }
694 
isSRAMECCEnabled()695   bool isSRAMECCEnabled() const {
696     return EnableSRAMECC;
697   }
698 
699   // Scratch is allocated in 256 dword per wave blocks for the entire
700   // wavefront. When viewed from the perspecive of an arbitrary workitem, this
701   // is 4-byte aligned.
702   //
703   // Only 4-byte alignment is really needed to access anything. Transformations
704   // on the pointer value itself may rely on the alignment / known low bits of
705   // the pointer. Set this to something above the minimum to avoid needing
706   // dynamic realignment in common cases.
getStackAlignment()707   unsigned getStackAlignment() const {
708     return 16;
709   }
710 
enableMachineScheduler()711   bool enableMachineScheduler() const override {
712     return true;
713   }
714 
enableSubRegLiveness()715   bool enableSubRegLiveness() const override {
716     return true;
717   }
718 
setScalarizeGlobalBehavior(bool b)719   void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b; }
getScalarizeGlobalBehavior()720   bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal; }
721 
722   /// \returns Number of execution units per compute unit supported by the
723   /// subtarget.
getEUsPerCU()724   unsigned getEUsPerCU() const {
725     return AMDGPU::IsaInfo::getEUsPerCU(this);
726   }
727 
728   /// \returns Maximum number of waves per compute unit supported by the
729   /// subtarget without any kind of limitation.
getMaxWavesPerCU()730   unsigned getMaxWavesPerCU() const {
731     return AMDGPU::IsaInfo::getMaxWavesPerCU(this);
732   }
733 
734   /// \returns Maximum number of waves per compute unit supported by the
735   /// subtarget and limited by given \p FlatWorkGroupSize.
getMaxWavesPerCU(unsigned FlatWorkGroupSize)736   unsigned getMaxWavesPerCU(unsigned FlatWorkGroupSize) const {
737     return AMDGPU::IsaInfo::getMaxWavesPerCU(this, FlatWorkGroupSize);
738   }
739 
740   /// \returns Maximum number of waves per execution unit supported by the
741   /// subtarget without any kind of limitation.
getMaxWavesPerEU()742   unsigned getMaxWavesPerEU() const {
743     return AMDGPU::IsaInfo::getMaxWavesPerEU();
744   }
745 
746   /// \returns Number of waves per work group supported by the subtarget and
747   /// limited by given \p FlatWorkGroupSize.
getWavesPerWorkGroup(unsigned FlatWorkGroupSize)748   unsigned getWavesPerWorkGroup(unsigned FlatWorkGroupSize) const {
749     return AMDGPU::IsaInfo::getWavesPerWorkGroup(this, FlatWorkGroupSize);
750   }
751 
752   // static wrappers
753   static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI);
754 
755   // XXX - Why is this here if it isn't in the default pass set?
enableEarlyIfConversion()756   bool enableEarlyIfConversion() const override {
757     return true;
758   }
759 
760   void overrideSchedPolicy(MachineSchedPolicy &Policy,
761                            unsigned NumRegionInstrs) const override;
762 
getMaxNumUserSGPRs()763   unsigned getMaxNumUserSGPRs() const {
764     return 16;
765   }
766 
hasSMemRealTime()767   bool hasSMemRealTime() const {
768     return HasSMemRealTime;
769   }
770 
hasMovrel()771   bool hasMovrel() const {
772     return HasMovrel;
773   }
774 
hasVGPRIndexMode()775   bool hasVGPRIndexMode() const {
776     return HasVGPRIndexMode;
777   }
778 
useVGPRIndexMode(bool UserEnable)779   bool useVGPRIndexMode(bool UserEnable) const {
780     return !hasMovrel() || (UserEnable && hasVGPRIndexMode());
781   }
782 
hasScalarCompareEq64()783   bool hasScalarCompareEq64() const {
784     return getGeneration() >= VOLCANIC_ISLANDS;
785   }
786 
hasScalarStores()787   bool hasScalarStores() const {
788     return HasScalarStores;
789   }
790 
hasScalarAtomics()791   bool hasScalarAtomics() const {
792     return HasScalarAtomics;
793   }
794 
795 
hasDPP()796   bool hasDPP() const {
797     return HasDPP;
798   }
799 
hasR128A16()800   bool hasR128A16() const {
801     return HasR128A16;
802   }
803 
enableSIScheduler()804   bool enableSIScheduler() const {
805     return EnableSIScheduler;
806   }
807 
debuggerSupported()808   bool debuggerSupported() const {
809     return debuggerInsertNops() && debuggerEmitPrologue();
810   }
811 
debuggerInsertNops()812   bool debuggerInsertNops() const {
813     return DebuggerInsertNops;
814   }
815 
debuggerEmitPrologue()816   bool debuggerEmitPrologue() const {
817     return DebuggerEmitPrologue;
818   }
819 
loadStoreOptEnabled()820   bool loadStoreOptEnabled() const {
821     return EnableLoadStoreOpt;
822   }
823 
hasSGPRInitBug()824   bool hasSGPRInitBug() const {
825     return SGPRInitBug;
826   }
827 
has12DWordStoreHazard()828   bool has12DWordStoreHazard() const {
829     return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS;
830   }
831 
832   // \returns true if the subtarget supports DWORDX3 load/store instructions.
hasDwordx3LoadStores()833   bool hasDwordx3LoadStores() const {
834     return CIInsts;
835   }
836 
hasSMovFedHazard()837   bool hasSMovFedHazard() const {
838     return getGeneration() >= AMDGPUSubtarget::GFX9;
839   }
840 
hasReadM0MovRelInterpHazard()841   bool hasReadM0MovRelInterpHazard() const {
842     return getGeneration() >= AMDGPUSubtarget::GFX9;
843   }
844 
hasReadM0SendMsgHazard()845   bool hasReadM0SendMsgHazard() const {
846     return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS;
847   }
848 
849   /// Return the maximum number of waves per SIMD for kernels using \p SGPRs
850   /// SGPRs
851   unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
852 
853   /// Return the maximum number of waves per SIMD for kernels using \p VGPRs
854   /// VGPRs
855   unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const;
856 
857   /// \returns true if the flat_scratch register should be initialized with the
858   /// pointer to the wave's scratch memory rather than a size and offset.
flatScratchIsPointer()859   bool flatScratchIsPointer() const {
860     return getGeneration() >= AMDGPUSubtarget::GFX9;
861   }
862 
863   /// \returns true if the machine has merged shaders in which s0-s7 are
864   /// reserved by the hardware and user SGPRs start at s8
hasMergedShaders()865   bool hasMergedShaders() const {
866     return getGeneration() >= GFX9;
867   }
868 
869   /// \returns SGPR allocation granularity supported by the subtarget.
getSGPRAllocGranule()870   unsigned getSGPRAllocGranule() const {
871     return AMDGPU::IsaInfo::getSGPRAllocGranule(this);
872   }
873 
874   /// \returns SGPR encoding granularity supported by the subtarget.
getSGPREncodingGranule()875   unsigned getSGPREncodingGranule() const {
876     return AMDGPU::IsaInfo::getSGPREncodingGranule(this);
877   }
878 
879   /// \returns Total number of SGPRs supported by the subtarget.
getTotalNumSGPRs()880   unsigned getTotalNumSGPRs() const {
881     return AMDGPU::IsaInfo::getTotalNumSGPRs(this);
882   }
883 
884   /// \returns Addressable number of SGPRs supported by the subtarget.
getAddressableNumSGPRs()885   unsigned getAddressableNumSGPRs() const {
886     return AMDGPU::IsaInfo::getAddressableNumSGPRs(this);
887   }
888 
889   /// \returns Minimum number of SGPRs that meets the given number of waves per
890   /// execution unit requirement supported by the subtarget.
getMinNumSGPRs(unsigned WavesPerEU)891   unsigned getMinNumSGPRs(unsigned WavesPerEU) const {
892     return AMDGPU::IsaInfo::getMinNumSGPRs(this, WavesPerEU);
893   }
894 
895   /// \returns Maximum number of SGPRs that meets the given number of waves per
896   /// execution unit requirement supported by the subtarget.
getMaxNumSGPRs(unsigned WavesPerEU,bool Addressable)897   unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const {
898     return AMDGPU::IsaInfo::getMaxNumSGPRs(this, WavesPerEU, Addressable);
899   }
900 
901   /// \returns Reserved number of SGPRs for given function \p MF.
902   unsigned getReservedNumSGPRs(const MachineFunction &MF) const;
903 
904   /// \returns Maximum number of SGPRs that meets number of waves per execution
905   /// unit requirement for function \p MF, or number of SGPRs explicitly
906   /// requested using "amdgpu-num-sgpr" attribute attached to function \p MF.
907   ///
908   /// \returns Value that meets number of waves per execution unit requirement
909   /// if explicitly requested value cannot be converted to integer, violates
910   /// subtarget's specifications, or does not meet number of waves per execution
911   /// unit requirement.
912   unsigned getMaxNumSGPRs(const MachineFunction &MF) const;
913 
914   /// \returns VGPR allocation granularity supported by the subtarget.
getVGPRAllocGranule()915   unsigned getVGPRAllocGranule() const {
916     return AMDGPU::IsaInfo::getVGPRAllocGranule(this);
917   }
918 
919   /// \returns VGPR encoding granularity supported by the subtarget.
getVGPREncodingGranule()920   unsigned getVGPREncodingGranule() const {
921     return AMDGPU::IsaInfo::getVGPREncodingGranule(this);
922   }
923 
924   /// \returns Total number of VGPRs supported by the subtarget.
getTotalNumVGPRs()925   unsigned getTotalNumVGPRs() const {
926     return AMDGPU::IsaInfo::getTotalNumVGPRs(this);
927   }
928 
929   /// \returns Addressable number of VGPRs supported by the subtarget.
getAddressableNumVGPRs()930   unsigned getAddressableNumVGPRs() const {
931     return AMDGPU::IsaInfo::getAddressableNumVGPRs(this);
932   }
933 
934   /// \returns Minimum number of VGPRs that meets given number of waves per
935   /// execution unit requirement supported by the subtarget.
getMinNumVGPRs(unsigned WavesPerEU)936   unsigned getMinNumVGPRs(unsigned WavesPerEU) const {
937     return AMDGPU::IsaInfo::getMinNumVGPRs(this, WavesPerEU);
938   }
939 
940   /// \returns Maximum number of VGPRs that meets given number of waves per
941   /// execution unit requirement supported by the subtarget.
getMaxNumVGPRs(unsigned WavesPerEU)942   unsigned getMaxNumVGPRs(unsigned WavesPerEU) const {
943     return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU);
944   }
945 
946   /// \returns Maximum number of VGPRs that meets number of waves per execution
947   /// unit requirement for function \p MF, or number of VGPRs explicitly
948   /// requested using "amdgpu-num-vgpr" attribute attached to function \p MF.
949   ///
950   /// \returns Value that meets number of waves per execution unit requirement
951   /// if explicitly requested value cannot be converted to integer, violates
952   /// subtarget's specifications, or does not meet number of waves per execution
953   /// unit requirement.
954   unsigned getMaxNumVGPRs(const MachineFunction &MF) const;
955 
956   void getPostRAMutations(
957       std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations)
958       const override;
959 
960   /// \returns Maximum number of work groups per compute unit supported by the
961   /// subtarget and limited by given \p FlatWorkGroupSize.
getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize)962   unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override {
963     return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(this, FlatWorkGroupSize);
964   }
965 
966   /// \returns Minimum flat work group size supported by the subtarget.
getMinFlatWorkGroupSize()967   unsigned getMinFlatWorkGroupSize() const override {
968     return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(this);
969   }
970 
971   /// \returns Maximum flat work group size supported by the subtarget.
getMaxFlatWorkGroupSize()972   unsigned getMaxFlatWorkGroupSize() const override {
973     return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this);
974   }
975 
976   /// \returns Maximum number of waves per execution unit supported by the
977   /// subtarget and limited by given \p FlatWorkGroupSize.
getMaxWavesPerEU(unsigned FlatWorkGroupSize)978   unsigned getMaxWavesPerEU(unsigned FlatWorkGroupSize) const override {
979     return AMDGPU::IsaInfo::getMaxWavesPerEU(this, FlatWorkGroupSize);
980   }
981 
982   /// \returns Minimum number of waves per execution unit supported by the
983   /// subtarget.
getMinWavesPerEU()984   unsigned getMinWavesPerEU() const override {
985     return AMDGPU::IsaInfo::getMinWavesPerEU(this);
986   }
987 };
988 
989 class R600Subtarget final : public R600GenSubtargetInfo,
990                             public AMDGPUSubtarget {
991 private:
992   R600InstrInfo InstrInfo;
993   R600FrameLowering FrameLowering;
994   bool FMA;
995   bool CaymanISA;
996   bool CFALUBug;
997   bool DX10Clamp;
998   bool HasVertexCache;
999   bool R600ALUInst;
1000   bool FP64;
1001   short TexVTXClauseSize;
1002   Generation Gen;
1003   R600TargetLowering TLInfo;
1004   InstrItineraryData InstrItins;
1005   SelectionDAGTargetInfo TSInfo;
1006 
1007 public:
1008   R600Subtarget(const Triple &TT, StringRef CPU, StringRef FS,
1009                 const TargetMachine &TM);
1010 
getInstrInfo()1011   const R600InstrInfo *getInstrInfo() const override { return &InstrInfo; }
1012 
getFrameLowering()1013   const R600FrameLowering *getFrameLowering() const override {
1014     return &FrameLowering;
1015   }
1016 
getTargetLowering()1017   const R600TargetLowering *getTargetLowering() const override {
1018     return &TLInfo;
1019   }
1020 
getRegisterInfo()1021   const R600RegisterInfo *getRegisterInfo() const override {
1022     return &InstrInfo.getRegisterInfo();
1023   }
1024 
getInstrItineraryData()1025   const InstrItineraryData *getInstrItineraryData() const override {
1026     return &InstrItins;
1027   }
1028 
1029   // Nothing implemented, just prevent crashes on use.
getSelectionDAGInfo()1030   const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
1031     return &TSInfo;
1032   }
1033 
1034   void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
1035 
getGeneration()1036   Generation getGeneration() const {
1037     return Gen;
1038   }
1039 
getStackAlignment()1040   unsigned getStackAlignment() const {
1041     return 4;
1042   }
1043 
1044   R600Subtarget &initializeSubtargetDependencies(const Triple &TT,
1045                                                  StringRef GPU, StringRef FS);
1046 
hasBFE()1047   bool hasBFE() const {
1048     return (getGeneration() >= EVERGREEN);
1049   }
1050 
hasBFI()1051   bool hasBFI() const {
1052     return (getGeneration() >= EVERGREEN);
1053   }
1054 
hasBCNT(unsigned Size)1055   bool hasBCNT(unsigned Size) const {
1056     if (Size == 32)
1057       return (getGeneration() >= EVERGREEN);
1058 
1059     return false;
1060   }
1061 
hasBORROW()1062   bool hasBORROW() const {
1063     return (getGeneration() >= EVERGREEN);
1064   }
1065 
hasCARRY()1066   bool hasCARRY() const {
1067     return (getGeneration() >= EVERGREEN);
1068   }
1069 
hasCaymanISA()1070   bool hasCaymanISA() const {
1071     return CaymanISA;
1072   }
1073 
hasFFBL()1074   bool hasFFBL() const {
1075     return (getGeneration() >= EVERGREEN);
1076   }
1077 
hasFFBH()1078   bool hasFFBH() const {
1079     return (getGeneration() >= EVERGREEN);
1080   }
1081 
hasFMA()1082   bool hasFMA() const { return FMA; }
1083 
hasCFAluBug()1084   bool hasCFAluBug() const { return CFALUBug; }
1085 
hasVertexCache()1086   bool hasVertexCache() const { return HasVertexCache; }
1087 
getTexVTXClauseSize()1088   short getTexVTXClauseSize() const { return TexVTXClauseSize; }
1089 
enableMachineScheduler()1090   bool enableMachineScheduler() const override {
1091     return true;
1092   }
1093 
enableSubRegLiveness()1094   bool enableSubRegLiveness() const override {
1095     return true;
1096   }
1097 
1098   /// \returns Maximum number of work groups per compute unit supported by the
1099   /// subtarget and limited by given \p FlatWorkGroupSize.
getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize)1100   unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override {
1101     return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(this, FlatWorkGroupSize);
1102   }
1103 
1104   /// \returns Minimum flat work group size supported by the subtarget.
getMinFlatWorkGroupSize()1105   unsigned getMinFlatWorkGroupSize() const override {
1106     return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(this);
1107   }
1108 
1109   /// \returns Maximum flat work group size supported by the subtarget.
getMaxFlatWorkGroupSize()1110   unsigned getMaxFlatWorkGroupSize() const override {
1111     return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this);
1112   }
1113 
1114   /// \returns Maximum number of waves per execution unit supported by the
1115   /// subtarget and limited by given \p FlatWorkGroupSize.
getMaxWavesPerEU(unsigned FlatWorkGroupSize)1116   unsigned getMaxWavesPerEU(unsigned FlatWorkGroupSize) const override {
1117     return AMDGPU::IsaInfo::getMaxWavesPerEU(this, FlatWorkGroupSize);
1118   }
1119 
1120   /// \returns Minimum number of waves per execution unit supported by the
1121   /// subtarget.
getMinWavesPerEU()1122   unsigned getMinWavesPerEU() const override {
1123     return AMDGPU::IsaInfo::getMinWavesPerEU(this);
1124   }
1125 };
1126 
1127 } // end namespace llvm
1128 
1129 #endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
1130