1 //=====-- AMDGPUSubtarget.h - Define Subtarget for AMDGPU ------*- C++ -*-====//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //==-----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// AMDGPU specific subclass of TargetSubtarget.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
15 #define LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
16 
17 #include "AMDGPU.h"
18 #include "AMDGPUCallLowering.h"
19 #include "R600FrameLowering.h"
20 #include "R600ISelLowering.h"
21 #include "R600InstrInfo.h"
22 #include "SIFrameLowering.h"
23 #include "SIISelLowering.h"
24 #include "SIInstrInfo.h"
25 #include "Utils/AMDGPUBaseInfo.h"
26 #include "llvm/ADT/Triple.h"
27 #include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
28 #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
29 #include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
30 #include "llvm/CodeGen/MachineFunction.h"
31 #include "llvm/CodeGen/SelectionDAGTargetInfo.h"
32 #include "llvm/MC/MCInstrItineraries.h"
33 #include "llvm/Support/MathExtras.h"
34 #include <cassert>
35 #include <cstdint>
36 #include <memory>
37 #include <utility>
38 
39 #define GET_SUBTARGETINFO_HEADER
40 #include "AMDGPUGenSubtargetInfo.inc"
41 #define GET_SUBTARGETINFO_HEADER
42 #include "R600GenSubtargetInfo.inc"
43 
44 namespace llvm {
45 
46 class StringRef;
47 
48 class AMDGPUSubtarget {
49 public:
50   enum Generation {
51     R600 = 0,
52     R700 = 1,
53     EVERGREEN = 2,
54     NORTHERN_ISLANDS = 3,
55     SOUTHERN_ISLANDS = 4,
56     SEA_ISLANDS = 5,
57     VOLCANIC_ISLANDS = 6,
58     GFX9 = 7,
59     GFX10 = 8
60   };
61 
62 private:
63   Triple TargetTriple;
64 
65 protected:
66   bool Has16BitInsts;
67   bool HasMadMixInsts;
68   bool FP32Denormals;
69   bool FPExceptions;
70   bool HasSDWA;
71   bool HasVOP3PInsts;
72   bool HasMulI24;
73   bool HasMulU24;
74   bool HasInv2PiInlineImm;
75   bool HasFminFmaxLegacy;
76   bool EnablePromoteAlloca;
77   bool HasTrigReducedRange;
78   unsigned MaxWavesPerEU;
79   int LocalMemorySize;
80   unsigned WavefrontSize;
81 
82 public:
83   AMDGPUSubtarget(const Triple &TT);
84 
85   static const AMDGPUSubtarget &get(const MachineFunction &MF);
86   static const AMDGPUSubtarget &get(const TargetMachine &TM,
87                                     const Function &F);
88 
89   /// \returns Default range flat work group size for a calling convention.
90   std::pair<unsigned, unsigned> getDefaultFlatWorkGroupSize(CallingConv::ID CC) const;
91 
92   /// \returns Subtarget's default pair of minimum/maximum flat work group sizes
93   /// for function \p F, or minimum/maximum flat work group sizes explicitly
94   /// requested using "amdgpu-flat-work-group-size" attribute attached to
95   /// function \p F.
96   ///
97   /// \returns Subtarget's default values if explicitly requested values cannot
98   /// be converted to integer, or violate subtarget's specifications.
99   std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) const;
100 
101   /// \returns Subtarget's default pair of minimum/maximum number of waves per
102   /// execution unit for function \p F, or minimum/maximum number of waves per
103   /// execution unit explicitly requested using "amdgpu-waves-per-eu" attribute
104   /// attached to function \p F.
105   ///
106   /// \returns Subtarget's default values if explicitly requested values cannot
107   /// be converted to integer, violate subtarget's specifications, or are not
108   /// compatible with minimum/maximum number of waves limited by flat work group
109   /// size, register usage, and/or lds usage.
110   std::pair<unsigned, unsigned> getWavesPerEU(const Function &F) const;
111 
112   /// Return the amount of LDS that can be used that will not restrict the
113   /// occupancy lower than WaveCount.
114   unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
115                                            const Function &) const;
116 
117   /// Inverse of getMaxLocalMemWithWaveCount. Return the maximum wavecount if
118   /// the given LDS memory size is the only constraint.
119   unsigned getOccupancyWithLocalMemSize(uint32_t Bytes, const Function &) const;
120 
121   unsigned getOccupancyWithLocalMemSize(const MachineFunction &MF) const;
122 
123   bool isAmdHsaOS() const {
124     return TargetTriple.getOS() == Triple::AMDHSA;
125   }
126 
127   bool isAmdPalOS() const {
128     return TargetTriple.getOS() == Triple::AMDPAL;
129   }
130 
131   bool isMesa3DOS() const {
132     return TargetTriple.getOS() == Triple::Mesa3D;
133   }
134 
135   bool isMesaKernel(const Function &F) const {
136     return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
137   }
138 
139   bool isAmdHsaOrMesa(const Function &F) const {
140     return isAmdHsaOS() || isMesaKernel(F);
141   }
142 
143   bool has16BitInsts() const {
144     return Has16BitInsts;
145   }
146 
147   bool hasMadMixInsts() const {
148     return HasMadMixInsts;
149   }
150 
151   bool hasFP32Denormals(const Function &F) const {
152     // FIXME: This should not be a property of the subtarget. This should be a
153     // property with a default set by the calling convention which can be
154     // overridden by attributes. For now, use the subtarget feature as a
155     // placeholder attribute. The function arguments only purpose is to
156     // discourage use without a function context until this is removed.
157     return FP32Denormals;
158   }
159 
160   bool hasFPExceptions() const {
161     return FPExceptions;
162   }
163 
164   bool hasSDWA() const {
165     return HasSDWA;
166   }
167 
168   bool hasVOP3PInsts() const {
169     return HasVOP3PInsts;
170   }
171 
172   bool hasMulI24() const {
173     return HasMulI24;
174   }
175 
176   bool hasMulU24() const {
177     return HasMulU24;
178   }
179 
180   bool hasInv2PiInlineImm() const {
181     return HasInv2PiInlineImm;
182   }
183 
184   bool hasFminFmaxLegacy() const {
185     return HasFminFmaxLegacy;
186   }
187 
188   bool hasTrigReducedRange() const {
189     return HasTrigReducedRange;
190   }
191 
192   bool isPromoteAllocaEnabled() const {
193     return EnablePromoteAlloca;
194   }
195 
196   unsigned getWavefrontSize() const {
197     return WavefrontSize;
198   }
199 
200   int getLocalMemorySize() const {
201     return LocalMemorySize;
202   }
203 
204   Align getAlignmentForImplicitArgPtr() const {
205     return isAmdHsaOS() ? Align(8) : Align(4);
206   }
207 
208   /// Returns the offset in bytes from the start of the input buffer
209   ///        of the first explicit kernel argument.
210   unsigned getExplicitKernelArgOffset(const Function &F) const {
211     return isAmdHsaOrMesa(F) ? 0 : 36;
212   }
213 
214   /// \returns Maximum number of work groups per compute unit supported by the
215   /// subtarget and limited by given \p FlatWorkGroupSize.
216   virtual unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const = 0;
217 
218   /// \returns Minimum flat work group size supported by the subtarget.
219   virtual unsigned getMinFlatWorkGroupSize() const = 0;
220 
221   /// \returns Maximum flat work group size supported by the subtarget.
222   virtual unsigned getMaxFlatWorkGroupSize() const = 0;
223 
224   /// \returns Maximum number of waves per execution unit supported by the
225   /// subtarget and limited by given \p FlatWorkGroupSize.
226   virtual unsigned getMaxWavesPerEU(unsigned FlatWorkGroupSize) const  = 0;
227 
228   /// \returns Minimum number of waves per execution unit supported by the
229   /// subtarget.
230   virtual unsigned getMinWavesPerEU() const = 0;
231 
232   /// \returns Maximum number of waves per execution unit supported by the
233   /// subtarget without any kind of limitation.
234   unsigned getMaxWavesPerEU() const { return MaxWavesPerEU; }
235 
236   /// Creates value range metadata on an workitemid.* inrinsic call or load.
237   bool makeLIDRangeMetadata(Instruction *I) const;
238 
239   /// \returns Number of bytes of arguments that are passed to a shader or
240   /// kernel in addition to the explicit ones declared for the function.
241   unsigned getImplicitArgNumBytes(const Function &F) const {
242     if (isMesaKernel(F))
243       return 16;
244     return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 0);
245   }
246   uint64_t getExplicitKernArgSize(const Function &F, Align &MaxAlign) const;
247   unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const;
248 
249   virtual ~AMDGPUSubtarget() {}
250 };
251 
252 class GCNSubtarget : public AMDGPUGenSubtargetInfo,
253                      public AMDGPUSubtarget {
254 
255   using AMDGPUSubtarget::getMaxWavesPerEU;
256 
257 public:
258   enum TrapHandlerAbi {
259     TrapHandlerAbiNone = 0,
260     TrapHandlerAbiHsa = 1
261   };
262 
263   enum TrapID {
264     TrapIDHardwareReserved = 0,
265     TrapIDHSADebugTrap = 1,
266     TrapIDLLVMTrap = 2,
267     TrapIDLLVMDebugTrap = 3,
268     TrapIDDebugBreakpoint = 7,
269     TrapIDDebugReserved8 = 8,
270     TrapIDDebugReservedFE = 0xfe,
271     TrapIDDebugReservedFF = 0xff
272   };
273 
274   enum TrapRegValues {
275     LLVMTrapHandlerRegValue = 1
276   };
277 
278 private:
279   /// GlobalISel related APIs.
280   std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
281   std::unique_ptr<InstructionSelector> InstSelector;
282   std::unique_ptr<LegalizerInfo> Legalizer;
283   std::unique_ptr<RegisterBankInfo> RegBankInfo;
284 
285 protected:
286   // Basic subtarget description.
287   Triple TargetTriple;
288   unsigned Gen;
289   InstrItineraryData InstrItins;
290   int LDSBankCount;
291   unsigned MaxPrivateElementSize;
292 
293   // Possibly statically set by tablegen, but may want to be overridden.
294   bool FastFMAF32;
295   bool HalfRate64Ops;
296 
  // Dynamically set bits that enable features.
298   bool FP64FP16Denormals;
299   bool FlatForGlobal;
300   bool AutoWaitcntBeforeBarrier;
301   bool CodeObjectV3;
302   bool UnalignedScratchAccess;
303   bool UnalignedBufferAccess;
304   bool HasApertureRegs;
305   bool EnableXNACK;
306   bool DoesNotSupportXNACK;
307   bool EnableCuMode;
308   bool TrapHandler;
309 
310   // Used as options.
311   bool EnableLoadStoreOpt;
312   bool EnableUnsafeDSOffsetFolding;
313   bool EnableSIScheduler;
314   bool EnableDS128;
315   bool EnablePRTStrictNull;
316   bool DumpCode;
317 
  // Subtarget properties statically set by tablegen.
319   bool FP64;
320   bool FMA;
321   bool MIMG_R128;
322   bool IsGCN;
323   bool GCN3Encoding;
324   bool CIInsts;
325   bool GFX8Insts;
326   bool GFX9Insts;
327   bool GFX10Insts;
328   bool GFX7GFX8GFX9Insts;
329   bool SGPRInitBug;
330   bool HasSMemRealTime;
331   bool HasIntClamp;
332   bool HasFmaMixInsts;
333   bool HasMovrel;
334   bool HasVGPRIndexMode;
335   bool HasScalarStores;
336   bool HasScalarAtomics;
337   bool HasSDWAOmod;
338   bool HasSDWAScalar;
339   bool HasSDWASdst;
340   bool HasSDWAMac;
341   bool HasSDWAOutModsVOPC;
342   bool HasDPP;
343   bool HasDPP8;
344   bool HasR128A16;
345   bool HasNSAEncoding;
346   bool HasDLInsts;
347   bool HasDot1Insts;
348   bool HasDot2Insts;
349   bool HasDot3Insts;
350   bool HasDot4Insts;
351   bool HasDot5Insts;
352   bool HasDot6Insts;
353   bool HasMAIInsts;
354   bool HasPkFmacF16Inst;
355   bool HasAtomicFaddInsts;
356   bool EnableSRAMECC;
357   bool DoesNotSupportSRAMECC;
358   bool HasNoSdstCMPX;
359   bool HasVscnt;
360   bool HasRegisterBanking;
361   bool HasVOP3Literal;
362   bool HasNoDataDepHazard;
363   bool FlatAddressSpace;
364   bool FlatInstOffsets;
365   bool FlatGlobalInsts;
366   bool FlatScratchInsts;
367   bool ScalarFlatScratchInsts;
368   bool AddNoCarryInsts;
369   bool HasUnpackedD16VMem;
370   bool R600ALUInst;
371   bool CaymanISA;
372   bool CFALUBug;
373   bool LDSMisalignedBug;
374   bool HasMFMAInlineLiteralBug;
375   bool HasVertexCache;
376   short TexVTXClauseSize;
377   bool ScalarizeGlobal;
378 
379   bool HasVcmpxPermlaneHazard;
380   bool HasVMEMtoScalarWriteHazard;
381   bool HasSMEMtoVectorWriteHazard;
382   bool HasInstFwdPrefetchBug;
383   bool HasVcmpxExecWARHazard;
384   bool HasLdsBranchVmemWARHazard;
385   bool HasNSAtoVMEMBug;
386   bool HasOffset3fBug;
387   bool HasFlatSegmentOffsetBug;
388 
389   // Dummy feature to use for assembler in tablegen.
390   bool FeatureDisable;
391 
392   SelectionDAGTargetInfo TSInfo;
393 private:
394   SIInstrInfo InstrInfo;
395   SITargetLowering TLInfo;
396   SIFrameLowering FrameLowering;
397 
398   // See COMPUTE_TMPRING_SIZE.WAVESIZE, 13-bit field in units of 256-dword.
399   static const unsigned MaxWaveScratchSize = (256 * 4) * ((1 << 13) - 1);
400 
401 public:
402   GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
403                const GCNTargetMachine &TM);
404   ~GCNSubtarget() override;
405 
406   GCNSubtarget &initializeSubtargetDependencies(const Triple &TT,
407                                                    StringRef GPU, StringRef FS);
408 
409   const SIInstrInfo *getInstrInfo() const override {
410     return &InstrInfo;
411   }
412 
413   const SIFrameLowering *getFrameLowering() const override {
414     return &FrameLowering;
415   }
416 
417   const SITargetLowering *getTargetLowering() const override {
418     return &TLInfo;
419   }
420 
421   const SIRegisterInfo *getRegisterInfo() const override {
422     return &InstrInfo.getRegisterInfo();
423   }
424 
425   const CallLowering *getCallLowering() const override {
426     return CallLoweringInfo.get();
427   }
428 
429   InstructionSelector *getInstructionSelector() const override {
430     return InstSelector.get();
431   }
432 
433   const LegalizerInfo *getLegalizerInfo() const override {
434     return Legalizer.get();
435   }
436 
437   const RegisterBankInfo *getRegBankInfo() const override {
438     return RegBankInfo.get();
439   }
440 
441   // Nothing implemented, just prevent crashes on use.
442   const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
443     return &TSInfo;
444   }
445 
446   const InstrItineraryData *getInstrItineraryData() const override {
447     return &InstrItins;
448   }
449 
450   void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
451 
452   Generation getGeneration() const {
453     return (Generation)Gen;
454   }
455 
456   unsigned getWavefrontSizeLog2() const {
457     return Log2_32(WavefrontSize);
458   }
459 
460   /// Return the number of high bits known to be zero fror a frame index.
461   unsigned getKnownHighZeroBitsForFrameIndex() const {
462     return countLeadingZeros(MaxWaveScratchSize) + getWavefrontSizeLog2();
463   }
464 
465   int getLDSBankCount() const {
466     return LDSBankCount;
467   }
468 
469   unsigned getMaxPrivateElementSize() const {
470     return MaxPrivateElementSize;
471   }
472 
473   unsigned getConstantBusLimit(unsigned Opcode) const;
474 
475   bool hasIntClamp() const {
476     return HasIntClamp;
477   }
478 
479   bool hasFP64() const {
480     return FP64;
481   }
482 
483   bool hasMIMG_R128() const {
484     return MIMG_R128;
485   }
486 
487   bool hasHWFP64() const {
488     return FP64;
489   }
490 
491   bool hasFastFMAF32() const {
492     return FastFMAF32;
493   }
494 
495   bool hasHalfRate64Ops() const {
496     return HalfRate64Ops;
497   }
498 
499   bool hasAddr64() const {
500     return (getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS);
501   }
502 
503   // Return true if the target only has the reverse operand versions of VALU
504   // shift instructions (e.g. v_lshrrev_b32, and no v_lshr_b32).
505   bool hasOnlyRevVALUShifts() const {
506     return getGeneration() >= VOLCANIC_ISLANDS;
507   }
508 
509   bool hasBFE() const {
510     return true;
511   }
512 
513   bool hasBFI() const {
514     return true;
515   }
516 
517   bool hasBFM() const {
518     return hasBFE();
519   }
520 
521   bool hasBCNT(unsigned Size) const {
522     return true;
523   }
524 
525   bool hasFFBL() const {
526     return true;
527   }
528 
529   bool hasFFBH() const {
530     return true;
531   }
532 
533   bool hasMed3_16() const {
534     return getGeneration() >= AMDGPUSubtarget::GFX9;
535   }
536 
537   bool hasMin3Max3_16() const {
538     return getGeneration() >= AMDGPUSubtarget::GFX9;
539   }
540 
541   bool hasFmaMixInsts() const {
542     return HasFmaMixInsts;
543   }
544 
545   bool hasCARRY() const {
546     return true;
547   }
548 
549   bool hasFMA() const {
550     return FMA;
551   }
552 
553   bool hasSwap() const {
554     return GFX9Insts;
555   }
556 
557   bool hasScalarPackInsts() const {
558     return GFX9Insts;
559   }
560 
561   bool hasScalarMulHiInsts() const {
562     return GFX9Insts;
563   }
564 
565   TrapHandlerAbi getTrapHandlerAbi() const {
566     return isAmdHsaOS() ? TrapHandlerAbiHsa : TrapHandlerAbiNone;
567   }
568 
569   /// True if the offset field of DS instructions works as expected. On SI, the
570   /// offset uses a 16-bit adder and does not always wrap properly.
571   bool hasUsableDSOffset() const {
572     return getGeneration() >= SEA_ISLANDS;
573   }
574 
575   bool unsafeDSOffsetFoldingEnabled() const {
576     return EnableUnsafeDSOffsetFolding;
577   }
578 
579   /// Condition output from div_scale is usable.
580   bool hasUsableDivScaleConditionOutput() const {
581     return getGeneration() != SOUTHERN_ISLANDS;
582   }
583 
584   /// Extra wait hazard is needed in some cases before
585   /// s_cbranch_vccnz/s_cbranch_vccz.
586   bool hasReadVCCZBug() const {
587     return getGeneration() <= SEA_ISLANDS;
588   }
589 
590   /// A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR
591   /// was written by a VALU instruction.
592   bool hasSMRDReadVALUDefHazard() const {
593     return getGeneration() == SOUTHERN_ISLANDS;
594   }
595 
596   /// A read of an SGPR by a VMEM instruction requires 5 wait states when the
597   /// SGPR was written by a VALU Instruction.
598   bool hasVMEMReadSGPRVALUDefHazard() const {
599     return getGeneration() >= VOLCANIC_ISLANDS;
600   }
601 
602   bool hasRFEHazards() const {
603     return getGeneration() >= VOLCANIC_ISLANDS;
604   }
605 
606   /// Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32.
607   unsigned getSetRegWaitStates() const {
608     return getGeneration() <= SEA_ISLANDS ? 1 : 2;
609   }
610 
611   bool dumpCode() const {
612     return DumpCode;
613   }
614 
615   /// Return the amount of LDS that can be used that will not restrict the
616   /// occupancy lower than WaveCount.
617   unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
618                                            const Function &) const;
619 
620   /// Alias for hasFP64FP16Denormals
621   bool hasFP16Denormals(const Function &F) const {
622     return FP64FP16Denormals;
623   }
624 
625   /// Alias for hasFP64FP16Denormals
626   bool hasFP64Denormals(const Function &F) const {
627     return FP64FP16Denormals;
628   }
629 
630   bool hasFP64FP16Denormals(const Function &F) const {
631     return FP64FP16Denormals;
632   }
633 
634   bool supportsMinMaxDenormModes() const {
635     return getGeneration() >= AMDGPUSubtarget::GFX9;
636   }
637 
638   /// \returns If target supports S_DENORM_MODE.
639   bool hasDenormModeInst() const {
640     return getGeneration() >= AMDGPUSubtarget::GFX10;
641   }
642 
643   bool useFlatForGlobal() const {
644     return FlatForGlobal;
645   }
646 
647   /// \returns If target supports ds_read/write_b128 and user enables generation
648   /// of ds_read/write_b128.
649   bool useDS128() const {
650     return CIInsts && EnableDS128;
651   }
652 
653   /// Have v_trunc_f64, v_ceil_f64, v_rndne_f64
654   bool haveRoundOpsF64() const {
655     return CIInsts;
656   }
657 
658   /// \returns If MUBUF instructions always perform range checking, even for
659   /// buffer resources used for private memory access.
660   bool privateMemoryResourceIsRangeChecked() const {
661     return getGeneration() < AMDGPUSubtarget::GFX9;
662   }
663 
664   /// \returns If target requires PRT Struct NULL support (zero result registers
665   /// for sparse texture support).
666   bool usePRTStrictNull() const {
667     return EnablePRTStrictNull;
668   }
669 
670   bool hasAutoWaitcntBeforeBarrier() const {
671     return AutoWaitcntBeforeBarrier;
672   }
673 
674   bool hasCodeObjectV3() const {
675     // FIXME: Need to add code object v3 support for mesa and pal.
676     return isAmdHsaOS() ? CodeObjectV3 : false;
677   }
678 
679   bool hasUnalignedBufferAccess() const {
680     return UnalignedBufferAccess;
681   }
682 
683   bool hasUnalignedScratchAccess() const {
684     return UnalignedScratchAccess;
685   }
686 
687   bool hasApertureRegs() const {
688     return HasApertureRegs;
689   }
690 
691   bool isTrapHandlerEnabled() const {
692     return TrapHandler;
693   }
694 
695   bool isXNACKEnabled() const {
696     return EnableXNACK;
697   }
698 
699   bool isCuModeEnabled() const {
700     return EnableCuMode;
701   }
702 
703   bool hasFlatAddressSpace() const {
704     return FlatAddressSpace;
705   }
706 
707   bool hasFlatScrRegister() const {
708     return hasFlatAddressSpace();
709   }
710 
711   bool hasFlatInstOffsets() const {
712     return FlatInstOffsets;
713   }
714 
715   bool hasFlatGlobalInsts() const {
716     return FlatGlobalInsts;
717   }
718 
719   bool hasFlatScratchInsts() const {
720     return FlatScratchInsts;
721   }
722 
723   bool hasScalarFlatScratchInsts() const {
724     return ScalarFlatScratchInsts;
725   }
726 
727   bool hasFlatSegmentOffsetBug() const {
728     return HasFlatSegmentOffsetBug;
729   }
730 
731   bool hasFlatLgkmVMemCountInOrder() const {
732     return getGeneration() > GFX9;
733   }
734 
735   bool hasD16LoadStore() const {
736     return getGeneration() >= GFX9;
737   }
738 
739   bool d16PreservesUnusedBits() const {
740     return hasD16LoadStore() && !isSRAMECCEnabled();
741   }
742 
743   bool hasD16Images() const {
744     return getGeneration() >= VOLCANIC_ISLANDS;
745   }
746 
747   /// Return if most LDS instructions have an m0 use that require m0 to be
748   /// iniitalized.
749   bool ldsRequiresM0Init() const {
750     return getGeneration() < GFX9;
751   }
752 
753   // True if the hardware rewinds and replays GWS operations if a wave is
754   // preempted.
755   //
756   // If this is false, a GWS operation requires testing if a nack set the
757   // MEM_VIOL bit, and repeating if so.
758   bool hasGWSAutoReplay() const {
759     return getGeneration() >= GFX9;
760   }
761 
762   /// \returns if target has ds_gws_sema_release_all instruction.
763   bool hasGWSSemaReleaseAll() const {
764     return CIInsts;
765   }
766 
767   bool hasAddNoCarry() const {
768     return AddNoCarryInsts;
769   }
770 
771   bool hasUnpackedD16VMem() const {
772     return HasUnpackedD16VMem;
773   }
774 
775   // Covers VS/PS/CS graphics shaders
776   bool isMesaGfxShader(const Function &F) const {
777     return isMesa3DOS() && AMDGPU::isShader(F.getCallingConv());
778   }
779 
780   bool hasMad64_32() const {
781     return getGeneration() >= SEA_ISLANDS;
782   }
783 
784   bool hasSDWAOmod() const {
785     return HasSDWAOmod;
786   }
787 
788   bool hasSDWAScalar() const {
789     return HasSDWAScalar;
790   }
791 
792   bool hasSDWASdst() const {
793     return HasSDWASdst;
794   }
795 
796   bool hasSDWAMac() const {
797     return HasSDWAMac;
798   }
799 
800   bool hasSDWAOutModsVOPC() const {
801     return HasSDWAOutModsVOPC;
802   }
803 
804   bool hasDLInsts() const {
805     return HasDLInsts;
806   }
807 
808   bool hasDot1Insts() const {
809     return HasDot1Insts;
810   }
811 
812   bool hasDot2Insts() const {
813     return HasDot2Insts;
814   }
815 
816   bool hasDot3Insts() const {
817     return HasDot3Insts;
818   }
819 
820   bool hasDot4Insts() const {
821     return HasDot4Insts;
822   }
823 
824   bool hasDot5Insts() const {
825     return HasDot5Insts;
826   }
827 
828   bool hasDot6Insts() const {
829     return HasDot6Insts;
830   }
831 
832   bool hasMAIInsts() const {
833     return HasMAIInsts;
834   }
835 
836   bool hasPkFmacF16Inst() const {
837     return HasPkFmacF16Inst;
838   }
839 
840   bool hasAtomicFaddInsts() const {
841     return HasAtomicFaddInsts;
842   }
843 
844   bool isSRAMECCEnabled() const {
845     return EnableSRAMECC;
846   }
847 
848   bool hasNoSdstCMPX() const {
849     return HasNoSdstCMPX;
850   }
851 
852   bool hasVscnt() const {
853     return HasVscnt;
854   }
855 
856   bool hasRegisterBanking() const {
857     return HasRegisterBanking;
858   }
859 
860   bool hasVOP3Literal() const {
861     return HasVOP3Literal;
862   }
863 
864   bool hasNoDataDepHazard() const {
865     return HasNoDataDepHazard;
866   }
867 
868   bool vmemWriteNeedsExpWaitcnt() const {
869     return getGeneration() < SEA_ISLANDS;
870   }
871 
872   // Scratch is allocated in 256 dword per wave blocks for the entire
873   // wavefront. When viewed from the perspecive of an arbitrary workitem, this
874   // is 4-byte aligned.
875   //
876   // Only 4-byte alignment is really needed to access anything. Transformations
877   // on the pointer value itself may rely on the alignment / known low bits of
878   // the pointer. Set this to something above the minimum to avoid needing
879   // dynamic realignment in common cases.
880   Align getStackAlignment() const { return Align(16); }
881 
882   bool enableMachineScheduler() const override {
883     return true;
884   }
885 
886   bool enableSubRegLiveness() const override {
887     return true;
888   }
889 
890   void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b; }
891   bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal; }
892 
893   /// \returns Number of execution units per compute unit supported by the
894   /// subtarget.
895   unsigned getEUsPerCU() const {
896     return AMDGPU::IsaInfo::getEUsPerCU(this);
897   }
898 
899   /// \returns Maximum number of waves per compute unit supported by the
900   /// subtarget without any kind of limitation.
901   unsigned getMaxWavesPerCU() const {
902     return AMDGPU::IsaInfo::getMaxWavesPerCU(this);
903   }
904 
905   /// \returns Maximum number of waves per compute unit supported by the
906   /// subtarget and limited by given \p FlatWorkGroupSize.
907   unsigned getMaxWavesPerCU(unsigned FlatWorkGroupSize) const {
908     return AMDGPU::IsaInfo::getMaxWavesPerCU(this, FlatWorkGroupSize);
909   }
910 
911   /// \returns Number of waves per work group supported by the subtarget and
912   /// limited by given \p FlatWorkGroupSize.
913   unsigned getWavesPerWorkGroup(unsigned FlatWorkGroupSize) const {
914     return AMDGPU::IsaInfo::getWavesPerWorkGroup(this, FlatWorkGroupSize);
915   }
916 
917   // static wrappers
918   static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI);
919 
920   // XXX - Why is this here if it isn't in the default pass set?
921   bool enableEarlyIfConversion() const override {
922     return true;
923   }
924 
925   void overrideSchedPolicy(MachineSchedPolicy &Policy,
926                            unsigned NumRegionInstrs) const override;
927 
928   unsigned getMaxNumUserSGPRs() const {
929     return 16;
930   }
931 
932   bool hasSMemRealTime() const {
933     return HasSMemRealTime;
934   }
935 
936   bool hasMovrel() const {
937     return HasMovrel;
938   }
939 
940   bool hasVGPRIndexMode() const {
941     return HasVGPRIndexMode;
942   }
943 
944   bool useVGPRIndexMode() const;
945 
946   bool hasScalarCompareEq64() const {
947     return getGeneration() >= VOLCANIC_ISLANDS;
948   }
949 
950   bool hasScalarStores() const {
951     return HasScalarStores;
952   }
953 
954   bool hasScalarAtomics() const {
955     return HasScalarAtomics;
956   }
957 
958   bool hasLDSFPAtomics() const {
959     return GFX8Insts;
960   }
961 
962   bool hasDPP() const {
963     return HasDPP;
964   }
965 
966   bool hasDPPBroadcasts() const {
967     return HasDPP && getGeneration() < GFX10;
968   }
969 
970   bool hasDPPWavefrontShifts() const {
971     return HasDPP && getGeneration() < GFX10;
972   }
973 
974   bool hasDPP8() const {
975     return HasDPP8;
976   }
977 
978   bool hasR128A16() const {
979     return HasR128A16;
980   }
981 
982   bool hasOffset3fBug() const {
983     return HasOffset3fBug;
984   }
985 
986   bool hasNSAEncoding() const {
987     return HasNSAEncoding;
988   }
989 
990   bool hasMadF16() const;
991 
992   bool enableSIScheduler() const {
993     return EnableSIScheduler;
994   }
995 
996   bool loadStoreOptEnabled() const {
997     return EnableLoadStoreOpt;
998   }
999 
1000   bool hasSGPRInitBug() const {
1001     return SGPRInitBug;
1002   }
1003 
1004   bool hasMFMAInlineLiteralBug() const {
1005     return HasMFMAInlineLiteralBug;
1006   }
1007 
1008   bool has12DWordStoreHazard() const {
1009     return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS;
1010   }
1011 
1012   // \returns true if the subtarget supports DWORDX3 load/store instructions.
1013   bool hasDwordx3LoadStores() const {
1014     return CIInsts;
1015   }
1016 
1017   bool hasSMovFedHazard() const {
1018     return getGeneration() == AMDGPUSubtarget::GFX9;
1019   }
1020 
1021   bool hasReadM0MovRelInterpHazard() const {
1022     return getGeneration() == AMDGPUSubtarget::GFX9;
1023   }
1024 
1025   bool hasReadM0SendMsgHazard() const {
1026     return getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
1027            getGeneration() <= AMDGPUSubtarget::GFX9;
1028   }
1029 
1030   bool hasVcmpxPermlaneHazard() const {
1031     return HasVcmpxPermlaneHazard;
1032   }
1033 
1034   bool hasVMEMtoScalarWriteHazard() const {
1035     return HasVMEMtoScalarWriteHazard;
1036   }
1037 
1038   bool hasSMEMtoVectorWriteHazard() const {
1039     return HasSMEMtoVectorWriteHazard;
1040   }
1041 
1042   bool hasLDSMisalignedBug() const {
1043     return LDSMisalignedBug && !EnableCuMode;
1044   }
1045 
1046   bool hasInstFwdPrefetchBug() const {
1047     return HasInstFwdPrefetchBug;
1048   }
1049 
1050   bool hasVcmpxExecWARHazard() const {
1051     return HasVcmpxExecWARHazard;
1052   }
1053 
1054   bool hasLdsBranchVmemWARHazard() const {
1055     return HasLdsBranchVmemWARHazard;
1056   }
1057 
1058   bool hasNSAtoVMEMBug() const {
1059     return HasNSAtoVMEMBug;
1060   }
1061 
1062   /// Return the maximum number of waves per SIMD for kernels using \p SGPRs
1063   /// SGPRs
1064   unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
1065 
1066   /// Return the maximum number of waves per SIMD for kernels using \p VGPRs
1067   /// VGPRs
1068   unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const;
1069 
1070   /// Return occupancy for the given function. Used LDS and a number of
1071   /// registers if provided.
1072   /// Note, occupancy can be affected by the scratch allocation as well, but
1073   /// we do not have enough information to compute it.
1074   unsigned computeOccupancy(const MachineFunction &MF, unsigned LDSSize = 0,
1075                             unsigned NumSGPRs = 0, unsigned NumVGPRs = 0) const;
1076 
1077   /// \returns true if the flat_scratch register should be initialized with the
1078   /// pointer to the wave's scratch memory rather than a size and offset.
1079   bool flatScratchIsPointer() const {
1080     return getGeneration() >= AMDGPUSubtarget::GFX9;
1081   }
1082 
1083   /// \returns true if the machine has merged shaders in which s0-s7 are
1084   /// reserved by the hardware and user SGPRs start at s8
1085   bool hasMergedShaders() const {
1086     return getGeneration() >= GFX9;
1087   }
1088 
1089   /// \returns SGPR allocation granularity supported by the subtarget.
1090   unsigned getSGPRAllocGranule() const {
1091     return AMDGPU::IsaInfo::getSGPRAllocGranule(this);
1092   }
1093 
1094   /// \returns SGPR encoding granularity supported by the subtarget.
1095   unsigned getSGPREncodingGranule() const {
1096     return AMDGPU::IsaInfo::getSGPREncodingGranule(this);
1097   }
1098 
1099   /// \returns Total number of SGPRs supported by the subtarget.
1100   unsigned getTotalNumSGPRs() const {
1101     return AMDGPU::IsaInfo::getTotalNumSGPRs(this);
1102   }
1103 
1104   /// \returns Addressable number of SGPRs supported by the subtarget.
1105   unsigned getAddressableNumSGPRs() const {
1106     return AMDGPU::IsaInfo::getAddressableNumSGPRs(this);
1107   }
1108 
1109   /// \returns Minimum number of SGPRs that meets the given number of waves per
1110   /// execution unit requirement supported by the subtarget.
1111   unsigned getMinNumSGPRs(unsigned WavesPerEU) const {
1112     return AMDGPU::IsaInfo::getMinNumSGPRs(this, WavesPerEU);
1113   }
1114 
1115   /// \returns Maximum number of SGPRs that meets the given number of waves per
1116   /// execution unit requirement supported by the subtarget.
1117   unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const {
1118     return AMDGPU::IsaInfo::getMaxNumSGPRs(this, WavesPerEU, Addressable);
1119   }
1120 
  /// \returns the number of SGPRs reserved for the given function \p MF.
  unsigned getReservedNumSGPRs(const MachineFunction &MF) const;
1123 
  /// \returns Maximum number of SGPRs that meets the number of waves per
  /// execution unit requirement for function \p MF, or the number of SGPRs
  /// explicitly requested using the "amdgpu-num-sgpr" attribute attached to
  /// function \p MF.
  ///
  /// The value that meets the waves per execution unit requirement is returned
  /// instead if the explicitly requested value cannot be converted to integer,
  /// violates the subtarget's specifications, or does not meet the number of
  /// waves per execution unit requirement.
  unsigned getMaxNumSGPRs(const MachineFunction &MF) const;
1133 
1134   /// \returns VGPR allocation granularity supported by the subtarget.
1135   unsigned getVGPRAllocGranule() const {
1136     return AMDGPU::IsaInfo::getVGPRAllocGranule(this);
1137   }
1138 
1139   /// \returns VGPR encoding granularity supported by the subtarget.
1140   unsigned getVGPREncodingGranule() const {
1141     return AMDGPU::IsaInfo::getVGPREncodingGranule(this);
1142   }
1143 
1144   /// \returns Total number of VGPRs supported by the subtarget.
1145   unsigned getTotalNumVGPRs() const {
1146     return AMDGPU::IsaInfo::getTotalNumVGPRs(this);
1147   }
1148 
1149   /// \returns Addressable number of VGPRs supported by the subtarget.
1150   unsigned getAddressableNumVGPRs() const {
1151     return AMDGPU::IsaInfo::getAddressableNumVGPRs(this);
1152   }
1153 
1154   /// \returns Minimum number of VGPRs that meets given number of waves per
1155   /// execution unit requirement supported by the subtarget.
1156   unsigned getMinNumVGPRs(unsigned WavesPerEU) const {
1157     return AMDGPU::IsaInfo::getMinNumVGPRs(this, WavesPerEU);
1158   }
1159 
1160   /// \returns Maximum number of VGPRs that meets given number of waves per
1161   /// execution unit requirement supported by the subtarget.
1162   unsigned getMaxNumVGPRs(unsigned WavesPerEU) const {
1163     return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU);
1164   }
1165 
  /// \returns Maximum number of VGPRs that meets the number of waves per
  /// execution unit requirement for function \p MF, or the number of VGPRs
  /// explicitly requested using the "amdgpu-num-vgpr" attribute attached to
  /// function \p MF.
  ///
  /// The value that meets the waves per execution unit requirement is returned
  /// instead if the explicitly requested value cannot be converted to integer,
  /// violates the subtarget's specifications, or does not meet the number of
  /// waves per execution unit requirement.
  unsigned getMaxNumVGPRs(const MachineFunction &MF) const;
1175 
  /// Override of the base-class getPostRAMutations hook. \p Mutations is the
  /// list of scheduling DAG mutations, passed by mutable reference. Only
  /// declared here -- presumably the out-of-line definition appends the
  /// subtarget-specific mutations; confirm against the implementation.
  void getPostRAMutations(
      std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations)
      const override;
1179 
1180   bool isWave32() const {
1181     return WavefrontSize == 32;
1182   }
1183 
1184   const TargetRegisterClass *getBoolRC() const {
1185     return getRegisterInfo()->getBoolRC();
1186   }
1187 
1188   /// \returns Maximum number of work groups per compute unit supported by the
1189   /// subtarget and limited by given \p FlatWorkGroupSize.
1190   unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override {
1191     return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(this, FlatWorkGroupSize);
1192   }
1193 
1194   /// \returns Minimum flat work group size supported by the subtarget.
1195   unsigned getMinFlatWorkGroupSize() const override {
1196     return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(this);
1197   }
1198 
1199   /// \returns Maximum flat work group size supported by the subtarget.
1200   unsigned getMaxFlatWorkGroupSize() const override {
1201     return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this);
1202   }
1203 
1204   /// \returns Maximum number of waves per execution unit supported by the
1205   /// subtarget and limited by given \p FlatWorkGroupSize.
1206   unsigned getMaxWavesPerEU(unsigned FlatWorkGroupSize) const override {
1207     return AMDGPU::IsaInfo::getMaxWavesPerEU(this, FlatWorkGroupSize);
1208   }
1209 
1210   /// \returns Minimum number of waves per execution unit supported by the
1211   /// subtarget.
1212   unsigned getMinWavesPerEU() const override {
1213     return AMDGPU::IsaInfo::getMinWavesPerEU(this);
1214   }
1215 
  /// Override of the base-class adjustSchedDependency hook for the dependency
  /// \p Dep between the scheduling units \p Src and \p Dst; \p Dep is passed
  /// by mutable reference. Only declared here -- presumably the out-of-line
  /// definition tweaks \p Dep for this subtarget; confirm against the
  /// implementation.
  void adjustSchedDependency(SUnit *Src, SUnit *Dst, SDep &Dep) const override;
1217 };
1218 
1219 class R600Subtarget final : public R600GenSubtargetInfo,
1220                             public AMDGPUSubtarget {
1221 private:
1222   R600InstrInfo InstrInfo;
1223   R600FrameLowering FrameLowering;
1224   bool FMA;
1225   bool CaymanISA;
1226   bool CFALUBug;
1227   bool HasVertexCache;
1228   bool R600ALUInst;
1229   bool FP64;
1230   short TexVTXClauseSize;
1231   Generation Gen;
1232   R600TargetLowering TLInfo;
1233   InstrItineraryData InstrItins;
1234   SelectionDAGTargetInfo TSInfo;
1235 
1236 public:
1237   R600Subtarget(const Triple &TT, StringRef CPU, StringRef FS,
1238                 const TargetMachine &TM);
1239 
1240   const R600InstrInfo *getInstrInfo() const override { return &InstrInfo; }
1241 
1242   const R600FrameLowering *getFrameLowering() const override {
1243     return &FrameLowering;
1244   }
1245 
1246   const R600TargetLowering *getTargetLowering() const override {
1247     return &TLInfo;
1248   }
1249 
1250   const R600RegisterInfo *getRegisterInfo() const override {
1251     return &InstrInfo.getRegisterInfo();
1252   }
1253 
1254   const InstrItineraryData *getInstrItineraryData() const override {
1255     return &InstrItins;
1256   }
1257 
1258   // Nothing implemented, just prevent crashes on use.
1259   const SelectionDAGTargetInfo *getSelectionDAGInfo() const override {
1260     return &TSInfo;
1261   }
1262 
1263   void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
1264 
1265   Generation getGeneration() const {
1266     return Gen;
1267   }
1268 
1269   Align getStackAlignment() const { return Align(4); }
1270 
1271   R600Subtarget &initializeSubtargetDependencies(const Triple &TT,
1272                                                  StringRef GPU, StringRef FS);
1273 
1274   bool hasBFE() const {
1275     return (getGeneration() >= EVERGREEN);
1276   }
1277 
1278   bool hasBFI() const {
1279     return (getGeneration() >= EVERGREEN);
1280   }
1281 
1282   bool hasBCNT(unsigned Size) const {
1283     if (Size == 32)
1284       return (getGeneration() >= EVERGREEN);
1285 
1286     return false;
1287   }
1288 
1289   bool hasBORROW() const {
1290     return (getGeneration() >= EVERGREEN);
1291   }
1292 
1293   bool hasCARRY() const {
1294     return (getGeneration() >= EVERGREEN);
1295   }
1296 
1297   bool hasCaymanISA() const {
1298     return CaymanISA;
1299   }
1300 
1301   bool hasFFBL() const {
1302     return (getGeneration() >= EVERGREEN);
1303   }
1304 
1305   bool hasFFBH() const {
1306     return (getGeneration() >= EVERGREEN);
1307   }
1308 
1309   bool hasFMA() const { return FMA; }
1310 
1311   bool hasCFAluBug() const { return CFALUBug; }
1312 
1313   bool hasVertexCache() const { return HasVertexCache; }
1314 
1315   short getTexVTXClauseSize() const { return TexVTXClauseSize; }
1316 
1317   bool enableMachineScheduler() const override {
1318     return true;
1319   }
1320 
1321   bool enableSubRegLiveness() const override {
1322     return true;
1323   }
1324 
1325   /// \returns Maximum number of work groups per compute unit supported by the
1326   /// subtarget and limited by given \p FlatWorkGroupSize.
1327   unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override {
1328     return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(this, FlatWorkGroupSize);
1329   }
1330 
1331   /// \returns Minimum flat work group size supported by the subtarget.
1332   unsigned getMinFlatWorkGroupSize() const override {
1333     return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(this);
1334   }
1335 
1336   /// \returns Maximum flat work group size supported by the subtarget.
1337   unsigned getMaxFlatWorkGroupSize() const override {
1338     return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this);
1339   }
1340 
1341   /// \returns Maximum number of waves per execution unit supported by the
1342   /// subtarget and limited by given \p FlatWorkGroupSize.
1343   unsigned getMaxWavesPerEU(unsigned FlatWorkGroupSize) const override {
1344     return AMDGPU::IsaInfo::getMaxWavesPerEU(this, FlatWorkGroupSize);
1345   }
1346 
1347   /// \returns Minimum number of waves per execution unit supported by the
1348   /// subtarget.
1349   unsigned getMinWavesPerEU() const override {
1350     return AMDGPU::IsaInfo::getMinWavesPerEU(this);
1351   }
1352 };
1353 
1354 } // end namespace llvm
1355 
1356 #endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
1357