1 //=====-- AMDGPUSubtarget.h - Define Subtarget for AMDGPU -------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //==-----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Base class for AMDGPU specific classes of TargetSubtarget.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
15 #define LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
16 
17 #include "llvm/IR/CallingConv.h"
18 #include "llvm/Support/Alignment.h"
19 #include "llvm/TargetParser/Triple.h"
20 
21 namespace llvm {
22 
23 enum AMDGPUDwarfFlavour : unsigned;
24 class Function;
25 class Instruction;
26 class MachineFunction;
27 class TargetMachine;
28 
29 class AMDGPUSubtarget {
30 public:
31   enum Generation {
32     INVALID = 0,
33     R600 = 1,
34     R700 = 2,
35     EVERGREEN = 3,
36     NORTHERN_ISLANDS = 4,
37     SOUTHERN_ISLANDS = 5,
38     SEA_ISLANDS = 6,
39     VOLCANIC_ISLANDS = 7,
40     GFX9 = 8,
41     GFX10 = 9,
42     GFX11 = 10,
43     GFX12 = 11,
44   };
45 
46 private:
47   Triple TargetTriple;
48 
49 protected:
50   bool GCN3Encoding = false;
51   bool Has16BitInsts = false;
52   bool HasTrue16BitInsts = false;
53   bool EnableRealTrue16Insts = false;
54   bool HasMadMixInsts = false;
55   bool HasMadMacF32Insts = false;
56   bool HasDsSrc2Insts = false;
57   bool HasSDWA = false;
58   bool HasVOP3PInsts = false;
59   bool HasMulI24 = true;
60   bool HasMulU24 = true;
61   bool HasSMulHi = false;
62   bool HasInv2PiInlineImm = false;
63   bool HasFminFmaxLegacy = true;
64   bool EnablePromoteAlloca = false;
65   bool HasTrigReducedRange = false;
66   bool FastFMAF32 = false;
67   unsigned EUsPerCU = 4;
68   unsigned MaxWavesPerEU = 10;
69   unsigned LocalMemorySize = 0;
70   unsigned AddressableLocalMemorySize = 0;
71   char WavefrontSizeLog2 = 0;
72 
73 public:
74   AMDGPUSubtarget(const Triple &TT);
75 
76   static const AMDGPUSubtarget &get(const MachineFunction &MF);
77   static const AMDGPUSubtarget &get(const TargetMachine &TM,
78                                     const Function &F);
79 
80   /// \returns Default range flat work group size for a calling convention.
81   std::pair<unsigned, unsigned> getDefaultFlatWorkGroupSize(CallingConv::ID CC) const;
82 
83   /// \returns Subtarget's default pair of minimum/maximum flat work group sizes
84   /// for function \p F, or minimum/maximum flat work group sizes explicitly
85   /// requested using "amdgpu-flat-work-group-size" attribute attached to
86   /// function \p F.
87   ///
88   /// \returns Subtarget's default values if explicitly requested values cannot
89   /// be converted to integer, or violate subtarget's specifications.
90   std::pair<unsigned, unsigned> getFlatWorkGroupSizes(const Function &F) const;
91 
92   /// \returns Subtarget's default pair of minimum/maximum number of waves per
93   /// execution unit for function \p F, or minimum/maximum number of waves per
94   /// execution unit explicitly requested using "amdgpu-waves-per-eu" attribute
95   /// attached to function \p F.
96   ///
97   /// \returns Subtarget's default values if explicitly requested values cannot
98   /// be converted to integer, violate subtarget's specifications, or are not
99   /// compatible with minimum/maximum number of waves limited by flat work group
100   /// size, register usage, and/or lds usage.
101   std::pair<unsigned, unsigned> getWavesPerEU(const Function &F) const {
102     // Default/requested minimum/maximum flat work group sizes.
103     std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);
104     return getWavesPerEU(F, FlatWorkGroupSizes);
105   }
106 
107   /// Overload which uses the specified values for the flat work group sizes,
108   /// rather than querying the function itself. \p FlatWorkGroupSizes Should
109   /// correspond to the function's value for getFlatWorkGroupSizes.
110   std::pair<unsigned, unsigned>
111   getWavesPerEU(const Function &F,
112                 std::pair<unsigned, unsigned> FlatWorkGroupSizes) const;
113   std::pair<unsigned, unsigned> getEffectiveWavesPerEU(
114       std::pair<unsigned, unsigned> WavesPerEU,
115       std::pair<unsigned, unsigned> FlatWorkGroupSizes) const;
116 
117   /// Return the amount of LDS that can be used that will not restrict the
118   /// occupancy lower than WaveCount.
119   unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
120                                            const Function &) const;
121 
122   /// Inverse of getMaxLocalMemWithWaveCount. Return the maximum wavecount if
123   /// the given LDS memory size is the only constraint.
124   unsigned getOccupancyWithLocalMemSize(uint32_t Bytes, const Function &) const;
125 
126   unsigned getOccupancyWithLocalMemSize(const MachineFunction &MF) const;
127 
128   bool isAmdHsaOS() const {
129     return TargetTriple.getOS() == Triple::AMDHSA;
130   }
131 
132   bool isAmdPalOS() const {
133     return TargetTriple.getOS() == Triple::AMDPAL;
134   }
135 
136   bool isMesa3DOS() const {
137     return TargetTriple.getOS() == Triple::Mesa3D;
138   }
139 
140   bool isMesaKernel(const Function &F) const;
141 
142   bool isAmdHsaOrMesa(const Function &F) const {
143     return isAmdHsaOS() || isMesaKernel(F);
144   }
145 
146   bool isGCN() const {
147     return TargetTriple.getArch() == Triple::amdgcn;
148   }
149 
150   bool isGCN3Encoding() const {
151     return GCN3Encoding;
152   }
153 
154   bool has16BitInsts() const {
155     return Has16BitInsts;
156   }
157 
158   /// Return true if the subtarget supports True16 instructions.
159   bool hasTrue16BitInsts() const { return HasTrue16BitInsts; }
160 
161   /// Return true if real (non-fake) variants of True16 instructions using
162   /// 16-bit registers should be code-generated. Fake True16 instructions are
163   /// identical to non-fake ones except that they take 32-bit registers as
164   /// operands and always use their low halves.
165   // TODO: Remove and use hasTrue16BitInsts() instead once True16 is fully
166   // supported and the support for fake True16 instructions is removed.
167   bool useRealTrue16Insts() const;
168 
169   bool hasMadMixInsts() const {
170     return HasMadMixInsts;
171   }
172 
173   bool hasMadMacF32Insts() const {
174     return HasMadMacF32Insts || !isGCN();
175   }
176 
177   bool hasDsSrc2Insts() const {
178     return HasDsSrc2Insts;
179   }
180 
181   bool hasSDWA() const {
182     return HasSDWA;
183   }
184 
185   bool hasVOP3PInsts() const {
186     return HasVOP3PInsts;
187   }
188 
189   bool hasMulI24() const {
190     return HasMulI24;
191   }
192 
193   bool hasMulU24() const {
194     return HasMulU24;
195   }
196 
197   bool hasSMulHi() const {
198     return HasSMulHi;
199   }
200 
201   bool hasInv2PiInlineImm() const {
202     return HasInv2PiInlineImm;
203   }
204 
205   bool hasFminFmaxLegacy() const {
206     return HasFminFmaxLegacy;
207   }
208 
209   bool hasTrigReducedRange() const {
210     return HasTrigReducedRange;
211   }
212 
213   bool hasFastFMAF32() const {
214     return FastFMAF32;
215   }
216 
217   bool isPromoteAllocaEnabled() const {
218     return EnablePromoteAlloca;
219   }
220 
221   unsigned getWavefrontSize() const {
222     return 1 << WavefrontSizeLog2;
223   }
224 
225   unsigned getWavefrontSizeLog2() const {
226     return WavefrontSizeLog2;
227   }
228 
229   unsigned getLocalMemorySize() const {
230     return LocalMemorySize;
231   }
232 
233   unsigned getAddressableLocalMemorySize() const {
234     return AddressableLocalMemorySize;
235   }
236 
237   /// Number of SIMDs/EUs (execution units) per "CU" ("compute unit"), where the
238   /// "CU" is the unit onto which workgroups are mapped. This takes WGP mode vs.
239   /// CU mode into account.
240   unsigned getEUsPerCU() const { return EUsPerCU; }
241 
242   Align getAlignmentForImplicitArgPtr() const {
243     return isAmdHsaOS() ? Align(8) : Align(4);
244   }
245 
246   /// Returns the offset in bytes from the start of the input buffer
247   ///        of the first explicit kernel argument.
248   unsigned getExplicitKernelArgOffset() const {
249     switch (TargetTriple.getOS()) {
250     case Triple::AMDHSA:
251     case Triple::AMDPAL:
252     case Triple::Mesa3D:
253       return 0;
254     case Triple::UnknownOS:
255     default:
256       // For legacy reasons unknown/other is treated as a different version of
257       // mesa.
258       return 36;
259     }
260 
261     llvm_unreachable("invalid triple OS");
262   }
263 
264   /// \returns Maximum number of work groups per compute unit supported by the
265   /// subtarget and limited by given \p FlatWorkGroupSize.
266   virtual unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const = 0;
267 
268   /// \returns Minimum flat work group size supported by the subtarget.
269   virtual unsigned getMinFlatWorkGroupSize() const = 0;
270 
271   /// \returns Maximum flat work group size supported by the subtarget.
272   virtual unsigned getMaxFlatWorkGroupSize() const = 0;
273 
274   /// \returns Number of waves per execution unit required to support the given
275   /// \p FlatWorkGroupSize.
276   virtual unsigned
277   getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const = 0;
278 
279   /// \returns Minimum number of waves per execution unit supported by the
280   /// subtarget.
281   virtual unsigned getMinWavesPerEU() const = 0;
282 
283   /// \returns Maximum number of waves per execution unit supported by the
284   /// subtarget without any kind of limitation.
285   unsigned getMaxWavesPerEU() const { return MaxWavesPerEU; }
286 
287   /// Return the maximum workitem ID value in the function, for the given (0, 1,
288   /// 2) dimension.
289   unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const;
290 
291   /// Return true if only a single workitem can be active in a wave.
292   bool isSingleLaneExecution(const Function &Kernel) const;
293 
294   /// Creates value range metadata on an workitemid.* intrinsic call or load.
295   bool makeLIDRangeMetadata(Instruction *I) const;
296 
297   /// \returns Number of bytes of arguments that are passed to a shader or
298   /// kernel in addition to the explicit ones declared for the function.
299   unsigned getImplicitArgNumBytes(const Function &F) const;
300   uint64_t getExplicitKernArgSize(const Function &F, Align &MaxAlign) const;
301   unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const;
302 
303   /// \returns Corresponding DWARF register number mapping flavour for the
304   /// \p WavefrontSize.
305   AMDGPUDwarfFlavour getAMDGPUDwarfFlavour() const;
306 
307   virtual ~AMDGPUSubtarget() = default;
308 };
309 
310 } // end namespace llvm
311 
312 #endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
313