1 //=====-- AMDGPUSubtarget.h - Define Subtarget for AMDGPU ------*- C++ -*-====//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //==-----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// AMDGPU specific subclass of TargetSubtarget.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
15 #define LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
16 
17 #include "AMDGPU.h"
18 #include "AMDGPUCallLowering.h"
19 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
20 #include "R600FrameLowering.h"
21 #include "R600ISelLowering.h"
22 #include "R600InstrInfo.h"
23 #include "SIFrameLowering.h"
24 #include "SIISelLowering.h"
25 #include "SIInstrInfo.h"
26 #include "Utils/AMDGPUBaseInfo.h"
27 #include "llvm/ADT/Triple.h"
28 #include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
29 #include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
30 #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
31 #include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
32 #include "llvm/CodeGen/MachineFunction.h"
33 #include "llvm/CodeGen/SelectionDAGTargetInfo.h"
34 #include "llvm/MC/MCInstrItineraries.h"
35 #include "llvm/Support/MathExtras.h"
36 #include <cassert>
37 #include <cstdint>
38 #include <memory>
39 #include <utility>
40 
41 #define GET_SUBTARGETINFO_HEADER
42 #include "AMDGPUGenSubtargetInfo.inc"
43 #define GET_SUBTARGETINFO_HEADER
44 #include "R600GenSubtargetInfo.inc"
45 
46 namespace llvm {
47 
48 class StringRef;
49 
50 class AMDGPUSubtarget {
51 public:
52   enum Generation {
53     R600 = 0,
54     R700 = 1,
55     EVERGREEN = 2,
56     NORTHERN_ISLANDS = 3,
57     SOUTHERN_ISLANDS = 4,
58     SEA_ISLANDS = 5,
59     VOLCANIC_ISLANDS = 6,
60     GFX9 = 7,
61     GFX10 = 8
62   };
63 
64 private:
65   Triple TargetTriple;
66 
67 protected:
68   bool Has16BitInsts;
69   bool HasMadMixInsts;
70   bool HasMadMacF32Insts;
71   bool HasDsSrc2Insts;
72   bool HasSDWA;
73   bool HasVOP3PInsts;
74   bool HasMulI24;
75   bool HasMulU24;
76   bool HasInv2PiInlineImm;
77   bool HasFminFmaxLegacy;
78   bool EnablePromoteAlloca;
79   bool HasTrigReducedRange;
80   unsigned MaxWavesPerEU;
81   int LocalMemorySize;
82   char WavefrontSizeLog2;
83 
84 public:
85   AMDGPUSubtarget(const Triple &TT);
86 
87   static const AMDGPUSubtarget &get(const MachineFunction &MF);
88   static const AMDGPUSubtarget &get(const TargetMachine &TM,
89                                     const Function &F);
90 
91   /// \returns Default range flat work group size for a calling convention.
92   std::pair<unsigned, unsigned> getDefaultFlatWorkGroupSize(CallingConv::ID CC) const;
93 
94   /// \returns Subtarget's default pair of minimum/maximum flat work group sizes
95   /// for function \p F, or minimum/maximum flat work group sizes explicitly
96   /// requested using "amdgpu-flat-work-group-size" attribute attached to
97   /// function \p F.
98   ///
99   /// \returns Subtarget's default values if explicitly requested values cannot
100   /// be converted to integer, or violate subtarget's specifications.
101   std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) const;
102 
103   /// \returns Subtarget's default pair of minimum/maximum number of waves per
104   /// execution unit for function \p F, or minimum/maximum number of waves per
105   /// execution unit explicitly requested using "amdgpu-waves-per-eu" attribute
106   /// attached to function \p F.
107   ///
108   /// \returns Subtarget's default values if explicitly requested values cannot
109   /// be converted to integer, violate subtarget's specifications, or are not
110   /// compatible with minimum/maximum number of waves limited by flat work group
111   /// size, register usage, and/or lds usage.
112   std::pair<unsigned, unsigned> getWavesPerEU(const Function &F) const;
113 
114   /// Return the amount of LDS that can be used that will not restrict the
115   /// occupancy lower than WaveCount.
116   unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
117                                            const Function &) const;
118 
119   /// Inverse of getMaxLocalMemWithWaveCount. Return the maximum wavecount if
120   /// the given LDS memory size is the only constraint.
121   unsigned getOccupancyWithLocalMemSize(uint32_t Bytes, const Function &) const;
122 
123   unsigned getOccupancyWithLocalMemSize(const MachineFunction &MF) const;
124 
125   bool isAmdHsaOS() const {
126     return TargetTriple.getOS() == Triple::AMDHSA;
127   }
128 
129   bool isAmdPalOS() const {
130     return TargetTriple.getOS() == Triple::AMDPAL;
131   }
132 
133   bool isMesa3DOS() const {
134     return TargetTriple.getOS() == Triple::Mesa3D;
135   }
136 
137   bool isMesaKernel(const Function &F) const {
138     return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
139   }
140 
141   bool isAmdHsaOrMesa(const Function &F) const {
142     return isAmdHsaOS() || isMesaKernel(F);
143   }
144 
145   bool isGCN() const {
146     return TargetTriple.getArch() == Triple::amdgcn;
147   }
148 
149   bool has16BitInsts() const {
150     return Has16BitInsts;
151   }
152 
153   bool hasMadMixInsts() const {
154     return HasMadMixInsts;
155   }
156 
157   bool hasMadMacF32Insts() const {
158     return HasMadMacF32Insts || !isGCN();
159   }
160 
161   bool hasDsSrc2Insts() const {
162     return HasDsSrc2Insts;
163   }
164 
165   bool hasSDWA() const {
166     return HasSDWA;
167   }
168 
169   bool hasVOP3PInsts() const {
170     return HasVOP3PInsts;
171   }
172 
173   bool hasMulI24() const {
174     return HasMulI24;
175   }
176 
177   bool hasMulU24() const {
178     return HasMulU24;
179   }
180 
181   bool hasInv2PiInlineImm() const {
182     return HasInv2PiInlineImm;
183   }
184 
185   bool hasFminFmaxLegacy() const {
186     return HasFminFmaxLegacy;
187   }
188 
189   bool hasTrigReducedRange() const {
190     return HasTrigReducedRange;
191   }
192 
193   bool isPromoteAllocaEnabled() const {
194     return EnablePromoteAlloca;
195   }
196 
197   unsigned getWavefrontSize() const {
198     return 1 << WavefrontSizeLog2;
199   }
200 
201   unsigned getWavefrontSizeLog2() const {
202     return WavefrontSizeLog2;
203   }
204 
205   int getLocalMemorySize() const {
206     return LocalMemorySize;
207   }
208 
209   Align getAlignmentForImplicitArgPtr() const {
210     return isAmdHsaOS() ? Align(8) : Align(4);
211   }
212 
213   /// Returns the offset in bytes from the start of the input buffer
214   ///        of the first explicit kernel argument.
215   unsigned getExplicitKernelArgOffset(const Function &F) const {
216     return isAmdHsaOrMesa(F) ? 0 : 36;
217   }
218 
219   /// \returns Maximum number of work groups per compute unit supported by the
220   /// subtarget and limited by given \p FlatWorkGroupSize.
221   virtual unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const = 0;
222 
223   /// \returns Minimum flat work group size supported by the subtarget.
224   virtual unsigned getMinFlatWorkGroupSize() const = 0;
225 
226   /// \returns Maximum flat work group size supported by the subtarget.
227   virtual unsigned getMaxFlatWorkGroupSize() const = 0;
228 
229   /// \returns Number of waves per execution unit required to support the given
230   /// \p FlatWorkGroupSize.
231   virtual unsigned
232   getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const = 0;
233 
234   /// \returns Minimum number of waves per execution unit supported by the
235   /// subtarget.
236   virtual unsigned getMinWavesPerEU() const = 0;
237 
238   /// \returns Maximum number of waves per execution unit supported by the
239   /// subtarget without any kind of limitation.
240   unsigned getMaxWavesPerEU() const { return MaxWavesPerEU; }
241 
242   /// Creates value range metadata on an workitemid.* inrinsic call or load.
243   bool makeLIDRangeMetadata(Instruction *I) const;
244 
245   /// \returns Number of bytes of arguments that are passed to a shader or
246   /// kernel in addition to the explicit ones declared for the function.
247   unsigned getImplicitArgNumBytes(const Function &F) const {
248     if (isMesaKernel(F))
249       return 16;
250     return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 0);
251   }
252   uint64_t getExplicitKernArgSize(const Function &F, Align &MaxAlign) const;
253   unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const;
254 
255   /// \returns Corresponsing DWARF register number mapping flavour for the
256   /// \p WavefrontSize.
257   AMDGPUDwarfFlavour getAMDGPUDwarfFlavour() const {
258     return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32
259                                     : AMDGPUDwarfFlavour::Wave64;
260   }
261 
262   virtual ~AMDGPUSubtarget() {}
263 };
264 
265 class GCNSubtarget : public AMDGPUGenSubtargetInfo,
266                      public AMDGPUSubtarget {
267 
268   using AMDGPUSubtarget::getMaxWavesPerEU;
269 
270 public:
271   enum TrapHandlerAbi {
272     TrapHandlerAbiNone = 0,
273     TrapHandlerAbiHsa = 1
274   };
275 
276   enum TrapID {
277     TrapIDHardwareReserved = 0,
278     TrapIDHSADebugTrap = 1,
279     TrapIDLLVMTrap = 2,
280     TrapIDLLVMDebugTrap = 3,
281     TrapIDDebugBreakpoint = 7,
282     TrapIDDebugReserved8 = 8,
283     TrapIDDebugReservedFE = 0xfe,
284     TrapIDDebugReservedFF = 0xff
285   };
286 
287   enum TrapRegValues {
288     LLVMTrapHandlerRegValue = 1
289   };
290 
291 private:
292   /// GlobalISel related APIs.
293   std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
294   std::unique_ptr<InlineAsmLowering> InlineAsmLoweringInfo;
295   std::unique_ptr<InstructionSelector> InstSelector;
296   std::unique_ptr<LegalizerInfo> Legalizer;
297   std::unique_ptr<RegisterBankInfo> RegBankInfo;
298 
299 protected:
300   // Basic subtarget description.
301   Triple TargetTriple;
302   unsigned Gen;
303   InstrItineraryData InstrItins;
304   int LDSBankCount;
305   unsigned MaxPrivateElementSize;
306 
307   // Possibly statically set by tablegen, but may want to be overridden.
308   bool FastFMAF32;
309   bool FastDenormalF32;
310   bool HalfRate64Ops;
311 
  // Dynamically set bits that enable features.
313   bool FlatForGlobal;
314   bool AutoWaitcntBeforeBarrier;
315   bool CodeObjectV3;
316   bool UnalignedScratchAccess;
317   bool UnalignedBufferAccess;
318   bool HasApertureRegs;
319   bool EnableXNACK;
320   bool DoesNotSupportXNACK;
321   bool EnableCuMode;
322   bool TrapHandler;
323 
324   // Used as options.
325   bool EnableLoadStoreOpt;
326   bool EnableUnsafeDSOffsetFolding;
327   bool EnableSIScheduler;
328   bool EnableDS128;
329   bool EnablePRTStrictNull;
330   bool DumpCode;
331 
  // Subtarget properties statically set by tablegen.
333   bool FP64;
334   bool FMA;
335   bool MIMG_R128;
336   bool IsGCN;
337   bool GCN3Encoding;
338   bool CIInsts;
339   bool GFX8Insts;
340   bool GFX9Insts;
341   bool GFX10Insts;
342   bool GFX10_3Insts;
343   bool GFX7GFX8GFX9Insts;
344   bool SGPRInitBug;
345   bool HasSMemRealTime;
346   bool HasIntClamp;
347   bool HasFmaMixInsts;
348   bool HasMovrel;
349   bool HasVGPRIndexMode;
350   bool HasScalarStores;
351   bool HasScalarAtomics;
352   bool HasSDWAOmod;
353   bool HasSDWAScalar;
354   bool HasSDWASdst;
355   bool HasSDWAMac;
356   bool HasSDWAOutModsVOPC;
357   bool HasDPP;
358   bool HasDPP8;
359   bool HasR128A16;
360   bool HasGFX10A16;
361   bool HasG16;
362   bool HasNSAEncoding;
363   bool GFX10_BEncoding;
364   bool HasDLInsts;
365   bool HasDot1Insts;
366   bool HasDot2Insts;
367   bool HasDot3Insts;
368   bool HasDot4Insts;
369   bool HasDot5Insts;
370   bool HasDot6Insts;
371   bool HasMAIInsts;
372   bool HasPkFmacF16Inst;
373   bool HasAtomicFaddInsts;
374   bool EnableSRAMECC;
375   bool DoesNotSupportSRAMECC;
376   bool HasNoSdstCMPX;
377   bool HasVscnt;
378   bool HasGetWaveIdInst;
379   bool HasSMemTimeInst;
380   bool HasRegisterBanking;
381   bool HasVOP3Literal;
382   bool HasNoDataDepHazard;
383   bool FlatAddressSpace;
384   bool FlatInstOffsets;
385   bool FlatGlobalInsts;
386   bool FlatScratchInsts;
387   bool ScalarFlatScratchInsts;
388   bool AddNoCarryInsts;
389   bool HasUnpackedD16VMem;
390   bool R600ALUInst;
391   bool CaymanISA;
392   bool CFALUBug;
393   bool LDSMisalignedBug;
394   bool HasMFMAInlineLiteralBug;
395   bool HasVertexCache;
396   short TexVTXClauseSize;
397   bool ScalarizeGlobal;
398 
399   bool HasVcmpxPermlaneHazard;
400   bool HasVMEMtoScalarWriteHazard;
401   bool HasSMEMtoVectorWriteHazard;
402   bool HasInstFwdPrefetchBug;
403   bool HasVcmpxExecWARHazard;
404   bool HasLdsBranchVmemWARHazard;
405   bool HasNSAtoVMEMBug;
406   bool HasOffset3fBug;
407   bool HasFlatSegmentOffsetBug;
408 
409   // Dummy feature to use for assembler in tablegen.
410   bool FeatureDisable;
411 
412   SelectionDAGTargetInfo TSInfo;
413 private:
414   SIInstrInfo InstrInfo;
415   SITargetLowering TLInfo;
416   SIFrameLowering FrameLowering;
417 
418   // See COMPUTE_TMPRING_SIZE.WAVESIZE, 13-bit field in units of 256-dword.
419   static const unsigned MaxWaveScratchSize = (256 * 4) * ((1 << 13) - 1);
420 
421 public:
422   GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
423                const GCNTargetMachine &TM);
424   ~GCNSubtarget() override;
425 
426   GCNSubtarget &initializeSubtargetDependencies(const Triple &TT,
427                                                    StringRef GPU, StringRef FS);
428 
429   const SIInstrInfo *getInstrInfo() const override {
430     return &InstrInfo;
431   }
432 
433   const SIFrameLowering *getFrameLowering() const override {
434     return &FrameLowering;
435   }
436 
437   const SITargetLowering *getTargetLowering() const override {
438     return &TLInfo;
439   }
440 
441   const SIRegisterInfo *getRegisterInfo() const override {
442     return &InstrInfo.getRegisterInfo();
443   }
444 
445   const CallLowering *getCallLowering() const override {
446     return CallLoweringInfo.get();
447   }
448 
449   const InlineAsmLowering *getInlineAsmLowering() const override {
450     return InlineAsmLoweringInfo.get();
451   }
452 
453   InstructionSelector *getInstructionSelector() const override {
454     return InstSelector.get();
455   }
456 
457   const LegalizerInfo *getLegalizerInfo() const override {
458     return Legalizer.get();
459   }
460 
461   const RegisterBankInfo *getRegBankInfo() const override {
462     return RegBankInfo.get();
463   }
464 
465   // Nothing implemented, just prevent crashes on use.
466   const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
467     return &TSInfo;
468   }
469 
470   const InstrItineraryData *getInstrItineraryData() const override {
471     return &InstrItins;
472   }
473 
474   void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
475 
476   Generation getGeneration() const {
477     return (Generation)Gen;
478   }
479 
480   /// Return the number of high bits known to be zero fror a frame index.
481   unsigned getKnownHighZeroBitsForFrameIndex() const {
482     return countLeadingZeros(MaxWaveScratchSize) + getWavefrontSizeLog2();
483   }
484 
485   int getLDSBankCount() const {
486     return LDSBankCount;
487   }
488 
489   unsigned getMaxPrivateElementSize() const {
490     return MaxPrivateElementSize;
491   }
492 
493   unsigned getConstantBusLimit(unsigned Opcode) const;
494 
495   bool hasIntClamp() const {
496     return HasIntClamp;
497   }
498 
499   bool hasFP64() const {
500     return FP64;
501   }
502 
503   bool hasMIMG_R128() const {
504     return MIMG_R128;
505   }
506 
507   bool hasHWFP64() const {
508     return FP64;
509   }
510 
511   bool hasFastFMAF32() const {
512     return FastFMAF32;
513   }
514 
515   bool hasHalfRate64Ops() const {
516     return HalfRate64Ops;
517   }
518 
519   bool hasAddr64() const {
520     return (getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS);
521   }
522 
523   // Return true if the target only has the reverse operand versions of VALU
524   // shift instructions (e.g. v_lshrrev_b32, and no v_lshr_b32).
525   bool hasOnlyRevVALUShifts() const {
526     return getGeneration() >= VOLCANIC_ISLANDS;
527   }
528 
529   bool hasFractBug() const {
530     return getGeneration() == SOUTHERN_ISLANDS;
531   }
532 
533   bool hasBFE() const {
534     return true;
535   }
536 
537   bool hasBFI() const {
538     return true;
539   }
540 
541   bool hasBFM() const {
542     return hasBFE();
543   }
544 
545   bool hasBCNT(unsigned Size) const {
546     return true;
547   }
548 
549   bool hasFFBL() const {
550     return true;
551   }
552 
553   bool hasFFBH() const {
554     return true;
555   }
556 
557   bool hasMed3_16() const {
558     return getGeneration() >= AMDGPUSubtarget::GFX9;
559   }
560 
561   bool hasMin3Max3_16() const {
562     return getGeneration() >= AMDGPUSubtarget::GFX9;
563   }
564 
565   bool hasFmaMixInsts() const {
566     return HasFmaMixInsts;
567   }
568 
569   bool hasCARRY() const {
570     return true;
571   }
572 
573   bool hasFMA() const {
574     return FMA;
575   }
576 
577   bool hasSwap() const {
578     return GFX9Insts;
579   }
580 
581   bool hasScalarPackInsts() const {
582     return GFX9Insts;
583   }
584 
585   bool hasScalarMulHiInsts() const {
586     return GFX9Insts;
587   }
588 
589   TrapHandlerAbi getTrapHandlerAbi() const {
590     return isAmdHsaOS() ? TrapHandlerAbiHsa : TrapHandlerAbiNone;
591   }
592 
593   /// True if the offset field of DS instructions works as expected. On SI, the
594   /// offset uses a 16-bit adder and does not always wrap properly.
595   bool hasUsableDSOffset() const {
596     return getGeneration() >= SEA_ISLANDS;
597   }
598 
599   bool unsafeDSOffsetFoldingEnabled() const {
600     return EnableUnsafeDSOffsetFolding;
601   }
602 
603   /// Condition output from div_scale is usable.
604   bool hasUsableDivScaleConditionOutput() const {
605     return getGeneration() != SOUTHERN_ISLANDS;
606   }
607 
608   /// Extra wait hazard is needed in some cases before
609   /// s_cbranch_vccnz/s_cbranch_vccz.
610   bool hasReadVCCZBug() const {
611     return getGeneration() <= SEA_ISLANDS;
612   }
613 
614   /// Writes to VCC_LO/VCC_HI update the VCCZ flag.
615   bool partialVCCWritesUpdateVCCZ() const {
616     return getGeneration() >= GFX10;
617   }
618 
619   /// A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR
620   /// was written by a VALU instruction.
621   bool hasSMRDReadVALUDefHazard() const {
622     return getGeneration() == SOUTHERN_ISLANDS;
623   }
624 
625   /// A read of an SGPR by a VMEM instruction requires 5 wait states when the
626   /// SGPR was written by a VALU Instruction.
627   bool hasVMEMReadSGPRVALUDefHazard() const {
628     return getGeneration() >= VOLCANIC_ISLANDS;
629   }
630 
631   bool hasRFEHazards() const {
632     return getGeneration() >= VOLCANIC_ISLANDS;
633   }
634 
635   /// Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32.
636   unsigned getSetRegWaitStates() const {
637     return getGeneration() <= SEA_ISLANDS ? 1 : 2;
638   }
639 
640   bool dumpCode() const {
641     return DumpCode;
642   }
643 
644   /// Return the amount of LDS that can be used that will not restrict the
645   /// occupancy lower than WaveCount.
646   unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
647                                            const Function &) const;
648 
649   bool supportsMinMaxDenormModes() const {
650     return getGeneration() >= AMDGPUSubtarget::GFX9;
651   }
652 
653   /// \returns If target supports S_DENORM_MODE.
654   bool hasDenormModeInst() const {
655     return getGeneration() >= AMDGPUSubtarget::GFX10;
656   }
657 
658   bool useFlatForGlobal() const {
659     return FlatForGlobal;
660   }
661 
662   /// \returns If target supports ds_read/write_b128 and user enables generation
663   /// of ds_read/write_b128.
664   bool useDS128() const {
665     return CIInsts && EnableDS128;
666   }
667 
668   /// Have v_trunc_f64, v_ceil_f64, v_rndne_f64
669   bool haveRoundOpsF64() const {
670     return CIInsts;
671   }
672 
673   /// \returns If MUBUF instructions always perform range checking, even for
674   /// buffer resources used for private memory access.
675   bool privateMemoryResourceIsRangeChecked() const {
676     return getGeneration() < AMDGPUSubtarget::GFX9;
677   }
678 
679   /// \returns If target requires PRT Struct NULL support (zero result registers
680   /// for sparse texture support).
681   bool usePRTStrictNull() const {
682     return EnablePRTStrictNull;
683   }
684 
685   bool hasAutoWaitcntBeforeBarrier() const {
686     return AutoWaitcntBeforeBarrier;
687   }
688 
689   bool hasCodeObjectV3() const {
690     // FIXME: Need to add code object v3 support for mesa and pal.
691     return isAmdHsaOS() ? CodeObjectV3 : false;
692   }
693 
694   bool hasUnalignedBufferAccess() const {
695     return UnalignedBufferAccess;
696   }
697 
698   bool hasUnalignedScratchAccess() const {
699     return UnalignedScratchAccess;
700   }
701 
702   bool hasApertureRegs() const {
703     return HasApertureRegs;
704   }
705 
706   bool isTrapHandlerEnabled() const {
707     return TrapHandler;
708   }
709 
710   bool isXNACKEnabled() const {
711     return EnableXNACK;
712   }
713 
714   bool isCuModeEnabled() const {
715     return EnableCuMode;
716   }
717 
718   bool hasFlatAddressSpace() const {
719     return FlatAddressSpace;
720   }
721 
722   bool hasFlatScrRegister() const {
723     return hasFlatAddressSpace();
724   }
725 
726   bool hasFlatInstOffsets() const {
727     return FlatInstOffsets;
728   }
729 
730   bool hasFlatGlobalInsts() const {
731     return FlatGlobalInsts;
732   }
733 
734   bool hasFlatScratchInsts() const {
735     return FlatScratchInsts;
736   }
737 
738   bool hasScalarFlatScratchInsts() const {
739     return ScalarFlatScratchInsts;
740   }
741 
742   bool hasGlobalAddTidInsts() const {
743     return GFX10_BEncoding;
744   }
745 
746   bool hasAtomicCSub() const {
747     return GFX10_BEncoding;
748   }
749 
750   bool hasMultiDwordFlatScratchAddressing() const {
751     return getGeneration() >= GFX9;
752   }
753 
754   bool hasFlatSegmentOffsetBug() const {
755     return HasFlatSegmentOffsetBug;
756   }
757 
758   bool hasFlatLgkmVMemCountInOrder() const {
759     return getGeneration() > GFX9;
760   }
761 
762   bool hasD16LoadStore() const {
763     return getGeneration() >= GFX9;
764   }
765 
766   bool d16PreservesUnusedBits() const {
767     return hasD16LoadStore() && !isSRAMECCEnabled();
768   }
769 
770   bool hasD16Images() const {
771     return getGeneration() >= VOLCANIC_ISLANDS;
772   }
773 
774   /// Return if most LDS instructions have an m0 use that require m0 to be
775   /// iniitalized.
776   bool ldsRequiresM0Init() const {
777     return getGeneration() < GFX9;
778   }
779 
780   // True if the hardware rewinds and replays GWS operations if a wave is
781   // preempted.
782   //
783   // If this is false, a GWS operation requires testing if a nack set the
784   // MEM_VIOL bit, and repeating if so.
785   bool hasGWSAutoReplay() const {
786     return getGeneration() >= GFX9;
787   }
788 
789   /// \returns if target has ds_gws_sema_release_all instruction.
790   bool hasGWSSemaReleaseAll() const {
791     return CIInsts;
792   }
793 
794   bool hasAddNoCarry() const {
795     return AddNoCarryInsts;
796   }
797 
798   bool hasUnpackedD16VMem() const {
799     return HasUnpackedD16VMem;
800   }
801 
802   // Covers VS/PS/CS graphics shaders
803   bool isMesaGfxShader(const Function &F) const {
804     return isMesa3DOS() && AMDGPU::isShader(F.getCallingConv());
805   }
806 
807   bool hasMad64_32() const {
808     return getGeneration() >= SEA_ISLANDS;
809   }
810 
811   bool hasSDWAOmod() const {
812     return HasSDWAOmod;
813   }
814 
815   bool hasSDWAScalar() const {
816     return HasSDWAScalar;
817   }
818 
819   bool hasSDWASdst() const {
820     return HasSDWASdst;
821   }
822 
823   bool hasSDWAMac() const {
824     return HasSDWAMac;
825   }
826 
827   bool hasSDWAOutModsVOPC() const {
828     return HasSDWAOutModsVOPC;
829   }
830 
831   bool hasDLInsts() const {
832     return HasDLInsts;
833   }
834 
835   bool hasDot1Insts() const {
836     return HasDot1Insts;
837   }
838 
839   bool hasDot2Insts() const {
840     return HasDot2Insts;
841   }
842 
843   bool hasDot3Insts() const {
844     return HasDot3Insts;
845   }
846 
847   bool hasDot4Insts() const {
848     return HasDot4Insts;
849   }
850 
851   bool hasDot5Insts() const {
852     return HasDot5Insts;
853   }
854 
855   bool hasDot6Insts() const {
856     return HasDot6Insts;
857   }
858 
859   bool hasMAIInsts() const {
860     return HasMAIInsts;
861   }
862 
863   bool hasPkFmacF16Inst() const {
864     return HasPkFmacF16Inst;
865   }
866 
867   bool hasAtomicFaddInsts() const {
868     return HasAtomicFaddInsts;
869   }
870 
871   bool isSRAMECCEnabled() const {
872     return EnableSRAMECC;
873   }
874 
875   bool hasNoSdstCMPX() const {
876     return HasNoSdstCMPX;
877   }
878 
879   bool hasVscnt() const {
880     return HasVscnt;
881   }
882 
883   bool hasGetWaveIdInst() const {
884     return HasGetWaveIdInst;
885   }
886 
887   bool hasSMemTimeInst() const {
888     return HasSMemTimeInst;
889   }
890 
891   bool hasRegisterBanking() const {
892     return HasRegisterBanking;
893   }
894 
895   bool hasVOP3Literal() const {
896     return HasVOP3Literal;
897   }
898 
899   bool hasNoDataDepHazard() const {
900     return HasNoDataDepHazard;
901   }
902 
903   bool vmemWriteNeedsExpWaitcnt() const {
904     return getGeneration() < SEA_ISLANDS;
905   }
906 
907   // Scratch is allocated in 256 dword per wave blocks for the entire
908   // wavefront. When viewed from the perspecive of an arbitrary workitem, this
909   // is 4-byte aligned.
910   //
911   // Only 4-byte alignment is really needed to access anything. Transformations
912   // on the pointer value itself may rely on the alignment / known low bits of
913   // the pointer. Set this to something above the minimum to avoid needing
914   // dynamic realignment in common cases.
915   Align getStackAlignment() const { return Align(16); }
916 
917   bool enableMachineScheduler() const override {
918     return true;
919   }
920 
921   bool enableSubRegLiveness() const override {
922     return true;
923   }
924 
925   void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b; }
926   bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal; }
927 
928   // static wrappers
929   static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI);
930 
931   // XXX - Why is this here if it isn't in the default pass set?
932   bool enableEarlyIfConversion() const override {
933     return true;
934   }
935 
936   void overrideSchedPolicy(MachineSchedPolicy &Policy,
937                            unsigned NumRegionInstrs) const override;
938 
939   unsigned getMaxNumUserSGPRs() const {
940     return 16;
941   }
942 
943   bool hasSMemRealTime() const {
944     return HasSMemRealTime;
945   }
946 
947   bool hasMovrel() const {
948     return HasMovrel;
949   }
950 
951   bool hasVGPRIndexMode() const {
952     return HasVGPRIndexMode;
953   }
954 
955   bool useVGPRIndexMode() const;
956 
957   bool hasScalarCompareEq64() const {
958     return getGeneration() >= VOLCANIC_ISLANDS;
959   }
960 
961   bool hasScalarStores() const {
962     return HasScalarStores;
963   }
964 
965   bool hasScalarAtomics() const {
966     return HasScalarAtomics;
967   }
968 
969   bool hasLDSFPAtomics() const {
970     return GFX8Insts;
971   }
972 
973   bool hasDPP() const {
974     return HasDPP;
975   }
976 
977   bool hasDPPBroadcasts() const {
978     return HasDPP && getGeneration() < GFX10;
979   }
980 
981   bool hasDPPWavefrontShifts() const {
982     return HasDPP && getGeneration() < GFX10;
983   }
984 
985   bool hasDPP8() const {
986     return HasDPP8;
987   }
988 
989   bool hasR128A16() const {
990     return HasR128A16;
991   }
992 
993   bool hasGFX10A16() const {
994     return HasGFX10A16;
995   }
996 
997   bool hasA16() const { return hasR128A16() || hasGFX10A16(); }
998 
999   bool hasG16() const { return HasG16; }
1000 
1001   bool hasOffset3fBug() const {
1002     return HasOffset3fBug;
1003   }
1004 
1005   bool hasNSAEncoding() const {
1006     return HasNSAEncoding;
1007   }
1008 
1009   bool hasGFX10_BEncoding() const {
1010     return GFX10_BEncoding;
1011   }
1012 
1013   bool hasGFX10_3Insts() const {
1014     return GFX10_3Insts;
1015   }
1016 
1017   bool hasMadF16() const;
1018 
1019   bool enableSIScheduler() const {
1020     return EnableSIScheduler;
1021   }
1022 
1023   bool loadStoreOptEnabled() const {
1024     return EnableLoadStoreOpt;
1025   }
1026 
1027   bool hasSGPRInitBug() const {
1028     return SGPRInitBug;
1029   }
1030 
1031   bool hasMFMAInlineLiteralBug() const {
1032     return HasMFMAInlineLiteralBug;
1033   }
1034 
1035   bool has12DWordStoreHazard() const {
1036     return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS;
1037   }
1038 
1039   // \returns true if the subtarget supports DWORDX3 load/store instructions.
1040   bool hasDwordx3LoadStores() const {
1041     return CIInsts;
1042   }
1043 
1044   bool hasSMovFedHazard() const {
1045     return getGeneration() == AMDGPUSubtarget::GFX9;
1046   }
1047 
1048   bool hasReadM0MovRelInterpHazard() const {
1049     return getGeneration() == AMDGPUSubtarget::GFX9;
1050   }
1051 
1052   bool hasReadM0SendMsgHazard() const {
1053     return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
1054            getGeneration() <= AMDGPUSubtarget::GFX9;
1055   }
1056 
1057   bool hasVcmpxPermlaneHazard() const {
1058     return HasVcmpxPermlaneHazard;
1059   }
1060 
1061   bool hasVMEMtoScalarWriteHazard() const {
1062     return HasVMEMtoScalarWriteHazard;
1063   }
1064 
1065   bool hasSMEMtoVectorWriteHazard() const {
1066     return HasSMEMtoVectorWriteHazard;
1067   }
1068 
1069   bool hasLDSMisalignedBug() const {
1070     return LDSMisalignedBug && !EnableCuMode;
1071   }
1072 
1073   bool hasInstFwdPrefetchBug() const {
1074     return HasInstFwdPrefetchBug;
1075   }
1076 
1077   bool hasVcmpxExecWARHazard() const {
1078     return HasVcmpxExecWARHazard;
1079   }
1080 
1081   bool hasLdsBranchVmemWARHazard() const {
1082     return HasLdsBranchVmemWARHazard;
1083   }
1084 
1085   bool hasNSAtoVMEMBug() const {
1086     return HasNSAtoVMEMBug;
1087   }
1088 
1089   bool hasHardClauses() const { return getGeneration() >= GFX10; }
1090 
1091   /// Return the maximum number of waves per SIMD for kernels using \p SGPRs
1092   /// SGPRs
1093   unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
1094 
1095   /// Return the maximum number of waves per SIMD for kernels using \p VGPRs
1096   /// VGPRs
1097   unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const;
1098 
1099   /// Return occupancy for the given function. Used LDS and a number of
1100   /// registers if provided.
1101   /// Note, occupancy can be affected by the scratch allocation as well, but
1102   /// we do not have enough information to compute it.
1103   unsigned computeOccupancy(const Function &F, unsigned LDSSize = 0,
1104                             unsigned NumSGPRs = 0, unsigned NumVGPRs = 0) const;
1105 
1106   /// \returns true if the flat_scratch register should be initialized with the
1107   /// pointer to the wave's scratch memory rather than a size and offset.
1108   bool flatScratchIsPointer() const {
1109     return getGeneration() >= AMDGPUSubtarget::GFX9;
1110   }
1111 
1112   /// \returns true if the machine has merged shaders in which s0-s7 are
1113   /// reserved by the hardware and user SGPRs start at s8
1114   bool hasMergedShaders() const {
1115     return getGeneration() >= GFX9;
1116   }
1117 
1118   /// \returns SGPR allocation granularity supported by the subtarget.
1119   unsigned getSGPRAllocGranule() const {
1120     return AMDGPU::IsaInfo::getSGPRAllocGranule(this);
1121   }
1122 
1123   /// \returns SGPR encoding granularity supported by the subtarget.
1124   unsigned getSGPREncodingGranule() const {
1125     return AMDGPU::IsaInfo::getSGPREncodingGranule(this);
1126   }
1127 
1128   /// \returns Total number of SGPRs supported by the subtarget.
1129   unsigned getTotalNumSGPRs() const {
1130     return AMDGPU::IsaInfo::getTotalNumSGPRs(this);
1131   }
1132 
1133   /// \returns Addressable number of SGPRs supported by the subtarget.
1134   unsigned getAddressableNumSGPRs() const {
1135     return AMDGPU::IsaInfo::getAddressableNumSGPRs(this);
1136   }
1137 
1138   /// \returns Minimum number of SGPRs that meets the given number of waves per
1139   /// execution unit requirement supported by the subtarget.
1140   unsigned getMinNumSGPRs(unsigned WavesPerEU) const {
1141     return AMDGPU::IsaInfo::getMinNumSGPRs(this, WavesPerEU);
1142   }
1143 
1144   /// \returns Maximum number of SGPRs that meets the given number of waves per
1145   /// execution unit requirement supported by the subtarget.
1146   unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const {
1147     return AMDGPU::IsaInfo::getMaxNumSGPRs(this, WavesPerEU, Addressable);
1148   }
1149 
1150   /// \returns Reserved number of SGPRs for given function \p MF.
1151   unsigned getReservedNumSGPRs(const MachineFunction &MF) const;
1152 
1153   /// \returns Maximum number of SGPRs that meets number of waves per execution
1154   /// unit requirement for function \p MF, or number of SGPRs explicitly
1155   /// requested using "amdgpu-num-sgpr" attribute attached to function \p MF.
1156   ///
1157   /// \returns Value that meets number of waves per execution unit requirement
1158   /// if explicitly requested value cannot be converted to integer, violates
1159   /// subtarget's specifications, or does not meet number of waves per execution
1160   /// unit requirement.
1161   unsigned getMaxNumSGPRs(const MachineFunction &MF) const;
1162 
1163   /// \returns VGPR allocation granularity supported by the subtarget.
1164   unsigned getVGPRAllocGranule() const {
1165     return AMDGPU::IsaInfo::getVGPRAllocGranule(this);
1166   }
1167 
1168   /// \returns VGPR encoding granularity supported by the subtarget.
1169   unsigned getVGPREncodingGranule() const {
1170     return AMDGPU::IsaInfo::getVGPREncodingGranule(this);
1171   }
1172 
1173   /// \returns Total number of VGPRs supported by the subtarget.
1174   unsigned getTotalNumVGPRs() const {
1175     return AMDGPU::IsaInfo::getTotalNumVGPRs(this);
1176   }
1177 
1178   /// \returns Addressable number of VGPRs supported by the subtarget.
1179   unsigned getAddressableNumVGPRs() const {
1180     return AMDGPU::IsaInfo::getAddressableNumVGPRs(this);
1181   }
1182 
1183   /// \returns Minimum number of VGPRs that meets given number of waves per
1184   /// execution unit requirement supported by the subtarget.
1185   unsigned getMinNumVGPRs(unsigned WavesPerEU) const {
1186     return AMDGPU::IsaInfo::getMinNumVGPRs(this, WavesPerEU);
1187   }
1188 
1189   /// \returns Maximum number of VGPRs that meets given number of waves per
1190   /// execution unit requirement supported by the subtarget.
1191   unsigned getMaxNumVGPRs(unsigned WavesPerEU) const {
1192     return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU);
1193   }
1194 
1195   /// \returns Maximum number of VGPRs that meets number of waves per execution
1196   /// unit requirement for function \p MF, or number of VGPRs explicitly
1197   /// requested using "amdgpu-num-vgpr" attribute attached to function \p MF.
1198   ///
1199   /// \returns Value that meets number of waves per execution unit requirement
1200   /// if explicitly requested value cannot be converted to integer, violates
1201   /// subtarget's specifications, or does not meet number of waves per execution
1202   /// unit requirement.
1203   unsigned getMaxNumVGPRs(const MachineFunction &MF) const;
1204 
1205   void getPostRAMutations(
1206       std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations)
1207       const override;
1208 
1209   bool isWave32() const {
1210     return getWavefrontSize() == 32;
1211   }
1212 
1213   const TargetRegisterClass *getBoolRC() const {
1214     return getRegisterInfo()->getBoolRC();
1215   }
1216 
1217   /// \returns Maximum number of work groups per compute unit supported by the
1218   /// subtarget and limited by given \p FlatWorkGroupSize.
1219   unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override {
1220     return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(this, FlatWorkGroupSize);
1221   }
1222 
1223   /// \returns Minimum flat work group size supported by the subtarget.
1224   unsigned getMinFlatWorkGroupSize() const override {
1225     return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(this);
1226   }
1227 
1228   /// \returns Maximum flat work group size supported by the subtarget.
1229   unsigned getMaxFlatWorkGroupSize() const override {
1230     return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this);
1231   }
1232 
1233   /// \returns Number of waves per execution unit required to support the given
1234   /// \p FlatWorkGroupSize.
1235   unsigned
1236   getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const override {
1237     return AMDGPU::IsaInfo::getWavesPerEUForWorkGroup(this, FlatWorkGroupSize);
1238   }
1239 
1240   /// \returns Minimum number of waves per execution unit supported by the
1241   /// subtarget.
1242   unsigned getMinWavesPerEU() const override {
1243     return AMDGPU::IsaInfo::getMinWavesPerEU(this);
1244   }
1245 
1246   void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx,
1247                              SDep &Dep) const override;
1248 };
1249 
1250 class R600Subtarget final : public R600GenSubtargetInfo,
1251                             public AMDGPUSubtarget {
1252 private:
1253   R600InstrInfo InstrInfo;
1254   R600FrameLowering FrameLowering;
1255   bool FMA;
1256   bool CaymanISA;
1257   bool CFALUBug;
1258   bool HasVertexCache;
1259   bool R600ALUInst;
1260   bool FP64;
1261   short TexVTXClauseSize;
1262   Generation Gen;
1263   R600TargetLowering TLInfo;
1264   InstrItineraryData InstrItins;
1265   SelectionDAGTargetInfo TSInfo;
1266 
1267 public:
1268   R600Subtarget(const Triple &TT, StringRef CPU, StringRef FS,
1269                 const TargetMachine &TM);
1270 
1271   const R600InstrInfo *getInstrInfo() const override { return &InstrInfo; }
1272 
1273   const R600FrameLowering *getFrameLowering() const override {
1274     return &FrameLowering;
1275   }
1276 
1277   const R600TargetLowering *getTargetLowering() const override {
1278     return &TLInfo;
1279   }
1280 
1281   const R600RegisterInfo *getRegisterInfo() const override {
1282     return &InstrInfo.getRegisterInfo();
1283   }
1284 
1285   const InstrItineraryData *getInstrItineraryData() const override {
1286     return &InstrItins;
1287   }
1288 
1289   // Nothing implemented, just prevent crashes on use.
1290   const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
1291     return &TSInfo;
1292   }
1293 
1294   void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
1295 
1296   Generation getGeneration() const {
1297     return Gen;
1298   }
1299 
1300   Align getStackAlignment() const { return Align(4); }
1301 
1302   R600Subtarget &initializeSubtargetDependencies(const Triple &TT,
1303                                                  StringRef GPU, StringRef FS);
1304 
1305   bool hasBFE() const {
1306     return (getGeneration() >= EVERGREEN);
1307   }
1308 
1309   bool hasBFI() const {
1310     return (getGeneration() >= EVERGREEN);
1311   }
1312 
1313   bool hasBCNT(unsigned Size) const {
1314     if (Size == 32)
1315       return (getGeneration() >= EVERGREEN);
1316 
1317     return false;
1318   }
1319 
1320   bool hasBORROW() const {
1321     return (getGeneration() >= EVERGREEN);
1322   }
1323 
1324   bool hasCARRY() const {
1325     return (getGeneration() >= EVERGREEN);
1326   }
1327 
1328   bool hasCaymanISA() const {
1329     return CaymanISA;
1330   }
1331 
1332   bool hasFFBL() const {
1333     return (getGeneration() >= EVERGREEN);
1334   }
1335 
1336   bool hasFFBH() const {
1337     return (getGeneration() >= EVERGREEN);
1338   }
1339 
1340   bool hasFMA() const { return FMA; }
1341 
1342   bool hasCFAluBug() const { return CFALUBug; }
1343 
1344   bool hasVertexCache() const { return HasVertexCache; }
1345 
1346   short getTexVTXClauseSize() const { return TexVTXClauseSize; }
1347 
1348   bool enableMachineScheduler() const override {
1349     return true;
1350   }
1351 
1352   bool enableSubRegLiveness() const override {
1353     return true;
1354   }
1355 
1356   /// \returns Maximum number of work groups per compute unit supported by the
1357   /// subtarget and limited by given \p FlatWorkGroupSize.
1358   unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override {
1359     return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(this, FlatWorkGroupSize);
1360   }
1361 
1362   /// \returns Minimum flat work group size supported by the subtarget.
1363   unsigned getMinFlatWorkGroupSize() const override {
1364     return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(this);
1365   }
1366 
1367   /// \returns Maximum flat work group size supported by the subtarget.
1368   unsigned getMaxFlatWorkGroupSize() const override {
1369     return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this);
1370   }
1371 
1372   /// \returns Number of waves per execution unit required to support the given
1373   /// \p FlatWorkGroupSize.
1374   unsigned
1375   getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const override {
1376     return AMDGPU::IsaInfo::getWavesPerEUForWorkGroup(this, FlatWorkGroupSize);
1377   }
1378 
1379   /// \returns Minimum number of waves per execution unit supported by the
1380   /// subtarget.
1381   unsigned getMinWavesPerEU() const override {
1382     return AMDGPU::IsaInfo::getMinWavesPerEU(this);
1383   }
1384 };
1385 
1386 } // end namespace llvm
1387 
1388 #endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
1389