//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
#include "GCNSubtarget.h"
#include "R600Subtarget.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#undef AMDGPUSubtarget

static cl::opt<bool> EnablePowerSched(
    "amdgpu-enable-power-sched",
    cl::desc("Enable scheduling to minimize mAI power bursts"),
    cl::init(false));

static cl::opt<bool> EnableVGPRIndexMode(
    "amdgpu-vgpr-index-mode",
    cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
    cl::init(false));

static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
                           cl::desc("Enable the use of AA during codegen."),
                           cl::init(true));

static cl::opt<unsigned>
    NSAThreshold("amdgpu-nsa-threshold",
                 cl::desc("Number of addresses from which to enable MIMG NSA."),
                 cl::init(3), cl::Hidden);

GCNSubtarget::~GCNSubtarget() = default;

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,");

  // Turn on features that the HSA ABI requires. Also turn on FlatForGlobal by
  // default.
  if (isAmdHsaOS())
    FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
  if (FS.contains_insensitive("+wavefrontsize")) {
    if (!FS.contains_insensitive("wavefrontsize16"))
      FullFS += "-wavefrontsize16,";
    if (!FS.contains_insensitive("wavefrontsize32"))
      FullFS += "-wavefrontsize32,";
    if (!FS.contains_insensitive("wavefrontsize64"))
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  // Implement the "generic" processors, which act as the default when no
  // generation features are enabled (e.g. for -mcpu=''). HSA OS defaults to
  // the first amdgcn target that supports flat addressing. Other OSes default
  // to the first amdgcn target.
  if (Gen == AMDGPUSubtarget::INVALID) {
    Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
                                       : AMDGPUSubtarget::SOUTHERN_ISLANDS;
  }

  // We don't support FP64 for EG/NI at the moment.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Targets must support 64-bit offsets for MUBUF instructions and/or flat
  // operations; otherwise they cannot access a 64-bit global address space.
  assert(hasAddr64() || hasFlat());

  // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
  // that do not support ADDR64 variants of MUBUF instructions. Such targets
  // cannot use a 64-bit offset with a MUBUF instruction to access the global
  // address space.
  if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = true;
  }
  // Unless +-flat-for-global is specified, use MUBUF instructions for global
  // address space access if flat operations are not available.
  if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  AddressableLocalMemorySize = LocalMemorySize;

  if (AMDGPU::isGFX10Plus(*this) &&
      !getFeatureBits().test(AMDGPU::FeatureCuMode))
    LocalMemorySize *= 2;

  // Don't crash on invalid devices.
  if (WavefrontSizeLog2 == 0)
    WavefrontSizeLog2 = 5;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
  HasSMulHi = getGeneration() >= AMDGPUSubtarget::GFX9;

  TargetID.setTargetIDFromFeaturesString(FS);

  LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
                    << TargetID.getXnackSetting() << '\n');
  LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
                    << TargetID.getSramEccSetting() << '\n');

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) : TargetTriple(TT) {}

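// Real True16 instructions are selected only when the subtarget has True16
// support and the feature enabling their real (rather than "fake", 32-bit
// register) encodings is set.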
bool AMDGPUSubtarget::useRealTrue16Insts() const {
  return hasTrue16BitInsts() && EnableRealTrue16Insts;
}

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM)
    : // clang-format off
    AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    TargetID(*this),
    InstrItins(getInstrItineraryForCPU(GPU)),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  // clang-format on
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  EUsPerCU = AMDGPU::IsaInfo::getEUsPerCU(this);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

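// Number of scalar operands (SGPRs and literal constants) a single VALU
// instruction may read through the constant bus. Pre-GFX10 hardware allows
// one; GFX10+ allows two, except for the 64-bit shifts listed below, which
// are still limited to one.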
unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHLREV_B64_e64_gfx11:
  case AMDGPU::V_LSHLREV_B64_e32_gfx12:
  case AMDGPU::V_LSHLREV_B64_e64_gfx12:
  case AMDGPU::V_LSHL_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHRREV_B64_e64_gfx11:
  case AMDGPU::V_LSHRREV_B64_e64_gfx12:
  case AMDGPU::V_LSHR_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHRREV_I64_e64_gfx11:
  case AMDGPU::V_ASHRREV_I64_e64_gfx12:
  case AMDGPU::V_ASHR_I64_e64:
    return 1;
  }

  return 2;
}

/// This list was mostly derived from experimentation.
bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const {
  switch (Opcode) {
  case AMDGPU::V_CVT_F16_F32_e32:
  case AMDGPU::V_CVT_F16_F32_e64:
  case AMDGPU::V_CVT_F16_U16_e32:
  case AMDGPU::V_CVT_F16_U16_e64:
  case AMDGPU::V_CVT_F16_I16_e32:
  case AMDGPU::V_CVT_F16_I16_e64:
  case AMDGPU::V_RCP_F16_e64:
  case AMDGPU::V_RCP_F16_e32:
  case AMDGPU::V_RSQ_F16_e64:
  case AMDGPU::V_RSQ_F16_e32:
  case AMDGPU::V_SQRT_F16_e64:
  case AMDGPU::V_SQRT_F16_e32:
  case AMDGPU::V_LOG_F16_e64:
  case AMDGPU::V_LOG_F16_e32:
  case AMDGPU::V_EXP_F16_e64:
  case AMDGPU::V_EXP_F16_e32:
  case AMDGPU::V_SIN_F16_e64:
  case AMDGPU::V_SIN_F16_e32:
  case AMDGPU::V_COS_F16_e64:
  case AMDGPU::V_COS_F16_e32:
  case AMDGPU::V_FLOOR_F16_e64:
  case AMDGPU::V_FLOOR_F16_e32:
  case AMDGPU::V_CEIL_F16_e64:
  case AMDGPU::V_CEIL_F16_e32:
  case AMDGPU::V_TRUNC_F16_e64:
  case AMDGPU::V_TRUNC_F16_e32:
  case AMDGPU::V_RNDNE_F16_e64:
  case AMDGPU::V_RNDNE_F16_e32:
  case AMDGPU::V_FRACT_F16_e64:
  case AMDGPU::V_FRACT_F16_e32:
  case AMDGPU::V_FREXP_MANT_F16_e64:
  case AMDGPU::V_FREXP_MANT_F16_e32:
  case AMDGPU::V_FREXP_EXP_I16_F16_e64:
  case AMDGPU::V_FREXP_EXP_I16_F16_e32:
  case AMDGPU::V_LDEXP_F16_e64:
  case AMDGPU::V_LDEXP_F16_e32:
  case AMDGPU::V_LSHLREV_B16_e64:
  case AMDGPU::V_LSHLREV_B16_e32:
  case AMDGPU::V_LSHRREV_B16_e64:
  case AMDGPU::V_LSHRREV_B16_e32:
  case AMDGPU::V_ASHRREV_I16_e64:
  case AMDGPU::V_ASHRREV_I16_e32:
  case AMDGPU::V_ADD_U16_e64:
  case AMDGPU::V_ADD_U16_e32:
  case AMDGPU::V_SUB_U16_e64:
  case AMDGPU::V_SUB_U16_e32:
  case AMDGPU::V_SUBREV_U16_e64:
  case AMDGPU::V_SUBREV_U16_e32:
  case AMDGPU::V_MUL_LO_U16_e64:
  case AMDGPU::V_MUL_LO_U16_e32:
  case AMDGPU::V_ADD_F16_e64:
  case AMDGPU::V_ADD_F16_e32:
  case AMDGPU::V_SUB_F16_e64:
  case AMDGPU::V_SUB_F16_e32:
  case AMDGPU::V_SUBREV_F16_e64:
  case AMDGPU::V_SUBREV_F16_e32:
  case AMDGPU::V_MUL_F16_e64:
  case AMDGPU::V_MUL_F16_e32:
  case AMDGPU::V_MAX_F16_e64:
  case AMDGPU::V_MAX_F16_e32:
  case AMDGPU::V_MIN_F16_e64:
  case AMDGPU::V_MIN_F16_e32:
  case AMDGPU::V_MAX_U16_e64:
  case AMDGPU::V_MAX_U16_e32:
  case AMDGPU::V_MIN_U16_e64:
  case AMDGPU::V_MIN_U16_e32:
  case AMDGPU::V_MAX_I16_e64:
  case AMDGPU::V_MAX_I16_e32:
  case AMDGPU::V_MIN_I16_e64:
  case AMDGPU::V_MIN_I16_e32:
  case AMDGPU::V_MAD_F16_e64:
  case AMDGPU::V_MAD_U16_e64:
  case AMDGPU::V_MAD_I16_e64:
  case AMDGPU::V_FMA_F16_e64:
  case AMDGPU::V_DIV_FIXUP_F16_e64:
    // On gfx10, all 16-bit instructions preserve the high bits.
    return getGeneration() <= AMDGPUSubtarget::GFX9;
  case AMDGPU::V_MADAK_F16:
  case AMDGPU::V_MADMK_F16:
  case AMDGPU::V_MAC_F16_e64:
  case AMDGPU::V_MAC_F16_e32:
  case AMDGPU::V_FMAMK_F16:
  case AMDGPU::V_FMAAK_F16:
  case AMDGPU::V_FMAC_F16_e64:
  case AMDGPU::V_FMAC_F16_e32:
    // In gfx9, the preferred handling of the unused high 16-bits changed. Most
    // instructions maintain the legacy behavior of zeroing. Some instructions
    // changed to preserving the high bits.
    return getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
  case AMDGPU::V_MAD_MIXLO_F16:
  case AMDGPU::V_MAD_MIXHI_F16:
  default:
    return false;
  }
}

// Returns the maximum per-workgroup LDS allocation size (in bytes) that still
// allows the given function to achieve an occupancy of NWaves waves per
// SIMD / EU, taking into account only the function's *maximum* workgroup size.
unsigned
AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
                                                 const Function &F) const {
  const unsigned WaveSize = getWavefrontSize();
  const unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned WavesPerWorkgroup =
      std::max(1u, (WorkGroupSize + WaveSize - 1) / WaveSize);

  const unsigned WorkGroupsPerCU =
      std::max(1u, (NWaves * getEUsPerCU()) / WavesPerWorkgroup);

  return getLocalMemorySize() / WorkGroupsPerCU;
}

// FIXME: Should return min,max range.
//
// Returns the maximum occupancy, in number of waves per SIMD / EU, that can
// be achieved when only the given function is running on the machine, taking
// into account the overall number of wave slots, the (maximum) workgroup
// size, and the per-workgroup LDS allocation size.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
  if (!MaxWorkGroupsPerCu)
    return 0;

  const unsigned WaveSize = getWavefrontSize();

  // FIXME: Do we need to account for the LDS alignment requirement rounding
  // the allocation size up?
  // Compute the restriction based on LDS usage.
  unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);

  // This can be queried with more LDS than is possible, so just assume the
  // worst.
  if (NumGroups == 0)
    return 1;

  NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);

  // Round to the number of waves per CU.
  const unsigned MaxGroupNumWaves = divideCeil(MaxWorkGroupSize, WaveSize);
  unsigned MaxWaves = NumGroups * MaxGroupNumWaves;

  // Number of waves per EU (SIMD).
  MaxWaves = divideCeil(MaxWaves, getEUsPerCU());

  // Clamp to the maximum possible number of waves.
  MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());

  // FIXME: Needs to be a multiple of the group size?
  // MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);

  assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
         "computed invalid occupancy");
  return MaxWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::pair(1, getWavefrontSize());
  default:
    return std::pair(1u, getMaxFlatWorkGroupSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
    const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
      getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
      F, "amdgpu-flat-work-group-size", Default);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure the requested values do not violate the subtarget's
  // specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getEffectiveWavesPerEU(
    std::pair<unsigned, unsigned> Requested,
    std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // If minimum/maximum flat work group sizes were explicitly requested using
  // the "amdgpu-flat-work-group-size" attribute, then set the default minimum/
  // maximum number of waves per execution unit to the values implied by the
  // requested minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
      getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
  Default.first = MinImpliedByFlatWorkGroupSize;

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure the requested values do not violate the subtarget's
  // specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure the requested values are compatible with the values implied by
  // the requested minimum/maximum flat work group sizes.
  if (Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
    const Function &F, std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested =
      AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", Default, true);
  return getEffectiveWavesPerEU(Requested, FlatWorkGroupSizes);
}

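// Return the reqd_work_group_size value for dimension Dim if the kernel
// carries that metadata, or UINT_MAX if it is absent or malformed.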
static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
  auto Node = Kernel.getMetadata("reqd_work_group_size");
  if (Node && Node->getNumOperands() == 3)
    return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
  return std::numeric_limits<unsigned>::max();
}

bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
  return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
}

unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
                                           unsigned Dimension) const {
  unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
  if (ReqdSize != std::numeric_limits<unsigned>::max())
    return ReqdSize - 1;
  return getFlatWorkGroupSizes(Kernel).second - 1;
}

bool AMDGPUSubtarget::isSingleLaneExecution(const Function &Func) const {
  for (int I = 0; I < 3; ++I) {
    if (getMaxWorkitemID(Func, I) > 0)
      return false;
  }

  return true;
}

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows the value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        [[fallthrough]];
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        [[fallthrough]];
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        [[fallthrough]];
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }

      if (Dim <= 3) {
        unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
        if (ReqdSize != std::numeric_limits<unsigned>::max())
          MinSize = MaxSize = ReqdSize;
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For an ID query we need to pass the max size
  // as Hi. For a size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
  assert(AMDGPU::isKernel(F.getCallingConv()));

  // We don't allocate the segment if we know the implicit arguments weren't
  // used, even if the ABI implies we need them.
  if (F.hasFnAttribute("amdgpu-no-implicitarg-ptr"))
    return 0;

  if (isMesaKernel(F))
    return 16;

  // Assume all implicit inputs are used by default.
  const Module *M = F.getParent();
  unsigned NBytes =
      AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5 ? 256 : 56;
  return F.getFnAttributeAsParsedInteger("amdgpu-implicitarg-num-bytes",
                                         NBytes);
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align(1);

  for (const Argument &Arg : F.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    Align Alignment = DL.getValueOrABITypeAlignment(
        IsByRef ? Arg.getParamAlign() : std::nullopt, ArgTy);
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = std::max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  if (F.getCallingConv() != CallingConv::AMDGPU_KERNEL &&
      F.getCallingConv() != CallingConv::SPIR_KERNEL)
    return 0;

  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset();

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
    MaxAlign = std::max(MaxAlign, Alignment);
  }

  // Being able to dereference past the end is useful for emitting scalar
  // loads.
  return alignTo(TotalSize, 4);
}

AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
  return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32
                                  : AMDGPUDwarfFlavour::Wave64;
}

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

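// V_MAD_F16 is selectable only if its pseudo maps to a real MC opcode on this
// subtarget.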
bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
}

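// Use VGPR indexing mode when movrel is unavailable, or when the hardware
// supports it and -amdgpu-vgpr-index-mode was explicitly requested.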
bool GCNSubtarget::useVGPRIndexMode() const {
  return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
}

bool GCNSubtarget::useAA() const { return UseAA; }

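// Occupancy (waves per SIMD/EU) as limited by SGPR usage. On GFX10+ SGPRs no
// longer restrict occupancy, so the subtarget maximum is returned.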
unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return getMaxWavesPerEU();

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned NumVGPRs) const {
  return AMDGPU::IsaInfo::getNumWavesPerEUWithNumVGPRs(this, NumVGPRs);
}

unsigned
GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratch) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (HasFlatScratch || HasArchitectedFlatScratch) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseReservedNumSGPRs(MFI.getUserSGPRInfo().hasFlatScratchInit());
}

unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
  // In principle we do not need to reserve the SGPR pair used for flat_scratch
  // if we know flat instructions do not access the stack anywhere in the
  // program. For now assume it's needed if we have flat instructions.
  const bool KernelUsesFlatScratch = hasFlatAddressSpace();
  return getBaseReservedNumSGPRs(KernelUsesFlatScratch);
}

unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
      std::min(getMaxWavesPerEU(), getOccupancyWithLocalMemSize(LDSSize, F));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}

unsigned GCNSubtarget::getBaseMaxNumSGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU,
    unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const {
  // Compute the maximum number of SGPRs the function can use, based on the
  // default/requested minimum number of waves per execution unit.
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if the maximum number of SGPRs was explicitly requested using the
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested =
        F.getFnAttributeAsParsedInteger("amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure the requested value does not violate the subtarget's
    // specifications.
    if (Requested && (Requested <= ReservedNumSGPRs))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = PreloadedSGPRs;
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure the requested value is compatible with the values implied by
    // the default/requested minimum/maximum number of waves per execution
    // unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second && Requested &&
        Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - ReservedNumSGPRs, MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseMaxNumSGPRs(F, MFI.getWavesPerEU(), MFI.getNumPreloadedSGPRs(),
                            getReservedNumSGPRs(MF));
}

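// Conservative upper bound on the number of preloaded SGPRs when the actual
// input configuration is unknown: assume every user, system, and synthetic
// SGPR input is present.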
static unsigned getMaxNumPreloadedSGPRs() {
  using USI = GCNUserSGPRUsageInfo;
  // Max number of user SGPRs
  const unsigned MaxUserSGPRs =
      USI::getNumUserSGPRForField(USI::PrivateSegmentBufferID) +
      USI::getNumUserSGPRForField(USI::DispatchPtrID) +
      USI::getNumUserSGPRForField(USI::QueuePtrID) +
      USI::getNumUserSGPRForField(USI::KernargSegmentPtrID) +
      USI::getNumUserSGPRForField(USI::DispatchIdID) +
      USI::getNumUserSGPRForField(USI::FlatScratchInitID) +
      USI::getNumUserSGPRForField(USI::ImplicitBufferPtrID);

  // Max number of system SGPRs
  const unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX
                                  1 + // WorkGroupIDY
                                  1 + // WorkGroupIDZ
                                  1 + // WorkGroupInfo
                                  1;  // private segment wave byte offset

  // Max number of synthetic SGPRs
  const unsigned SyntheticSGPRs = 1; // LDSKernelId

  return MaxUserSGPRs + MaxSystemSGPRs + SyntheticSGPRs;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
  return getBaseMaxNumSGPRs(F, getWavesPerEU(F), getMaxNumPreloadedSGPRs(),
                            getReservedNumSGPRs(F));
}

unsigned GCNSubtarget::getBaseMaxNumVGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU) const {
  // Compute the maximum number of VGPRs the function can use, based on the
  // default/requested minimum number of waves per execution unit.
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if the maximum number of VGPRs was explicitly requested using the
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested =
        F.getFnAttributeAsParsedInteger("amdgpu-num-vgpr", MaxNumVGPRs);

    if (hasGFX90AInsts())
      Requested *= 2;

    // Make sure the requested value is compatible with the values implied by
    // the default/requested minimum/maximum number of waves per execution
    // unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second && Requested &&
        Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
  return getBaseMaxNumVGPRs(F, getWavesPerEU(F));
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseMaxNumVGPRs(F, MFI.getWavesPerEU());
}

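// Adjust the latency of a data dependency when either endpoint is a bundle.
// Scan the bundle for the instruction that actually defines or consumes the
// register and derive the latency from its position within the bundle.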
void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
                                         int UseOpIdx, SDep &Dep) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
      !Def->isInstr() || !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

  if (DefI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  } else if (Dep.getLatency() == 0 && Dep.getReg() == AMDGPU::VCC_LO) {
    // Work around the fact that SIInstrInfo::fixImplicitOperands modifies
    // implicit operands which come from the MCInstrDesc, which can fool
    // ScheduleDAGInstrs::addPhysRegDataDeps into treating them as implicit
    // pseudo operands.
    Dep.setLatency(InstrInfo.getSchedModel().computeOperandLatency(
        DefI, DefOpIdx, UseI, UseOpIdx));
  }
}

namespace {
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }

  // Link as many SALU instructions in chain as possible. Return the size
  // of the chain. Links up to MaxChain instructions.
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n"; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU != From && From != &DAG->ExitSU && DAG->canAddEdge(SU, From))
        if (DAG->addEdge(SU, SDep(From, SDep::Artificial)))
          ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && SU != &DAG->ExitSU && isVALU(SUv) &&
            DAG->canAddEdge(SUv, SU))
          DAG->addEdge(SUv, SDep(SU, SDep::Artificial));
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts())
      return;
    DAG = static_cast<ScheduleDAGMI *>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for MFMA long latency instructions and try to add a dependency
    // of available SALU instructions to give them a chance to fill MFMA
    // shadow. That is desirable to fill MFMA shadow with SALU instructions
    // rather than VALU to prevent power consumption bursts and throttle.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit *, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for (; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (&SU == &DAG->ExitSU || &SU == &*LastSALU || !isSALU(&*LastSALU) ||
            !DAG->canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

std::unique_ptr<ScheduleDAGMutation>
GCNSubtarget::createFillMFMAShadowMutation(const TargetInstrInfo *TII) const {
  return EnablePowerSched ? std::make_unique<FillMFMAShadowMutation>(&InstrInfo)
                          : nullptr;
}

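// Minimum number of address operands at which the MIMG NSA encoding is used.
// Explicit overrides (command line or function attribute) are clamped to at
// least 2; GFX12 has no MIMG encoding, so NSA never applies there.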
unsigned GCNSubtarget::getNSAThreshold(const MachineFunction &MF) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX12)
    return 0; // Not MIMG encoding.

  if (NSAThreshold.getNumOccurrences() > 0)
    return std::max(NSAThreshold.getValue(), 2u);

  int Value = MF.getFunction().getFnAttributeAsParsedInteger(
      "amdgpu-nsa-threshold", -1);
  if (Value > 0)
    return std::max(Value, 2);

  return 3;
}

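// Dispatch to the concrete subtarget: GCNSubtarget for amdgcn triples,
// R600Subtarget otherwise.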
const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM,
                                            const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}

GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F,
                                           const GCNSubtarget &ST)
    : ST(ST) {
  const CallingConv::ID CC = F.getCallingConv();
  const bool IsKernel =
      CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL;
  // FIXME: Should have analysis or something rather than attribute to detect
  // calls.
  const bool HasCalls = F.hasFnAttribute("amdgpu-calls");
  // FIXME: This attribute is a hack, we just need an analysis on the function
  // to look for allocas.
  const bool HasStackObjects = F.hasFnAttribute("amdgpu-stack-objects");

  if (IsKernel && (!F.arg_empty() || ST.getImplicitArgNumBytes(F) != 0))
    KernargSegmentPtr = true;

  bool IsAmdHsaOrMesa = ST.isAmdHsaOrMesa(F);
  if (IsAmdHsaOrMesa && !ST.enableFlatScratch())
    PrivateSegmentBuffer = true;
  else if (ST.isMesaGfxShader(F))
    ImplicitBufferPtr = true;

  if (!AMDGPU::isGraphics(CC)) {
    if (!F.hasFnAttribute("amdgpu-no-dispatch-ptr"))
      DispatchPtr = true;

    // FIXME: Can this always be disabled with < COv5?
    if (!F.hasFnAttribute("amdgpu-no-queue-ptr"))
      QueuePtr = true;

    if (!F.hasFnAttribute("amdgpu-no-dispatch-id"))
      DispatchID = true;
  }

  // TODO: This could be refined a lot. The attribute is a poor way of
  // detecting calls or stack objects that may require it before argument
  // lowering.
  if (ST.hasFlatAddressSpace() && AMDGPU::isEntryFunctionCC(CC) &&
      (IsAmdHsaOrMesa || ST.enableFlatScratch()) &&
      (HasCalls || HasStackObjects || ST.enableFlatScratch()) &&
      !ST.flatScratchIsArchitected()) {
    FlatScratchInit = true;
  }

  if (hasImplicitBufferPtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(ImplicitBufferPtrID);

  if (hasPrivateSegmentBuffer())
    NumUsedUserSGPRs += getNumUserSGPRForField(PrivateSegmentBufferID);

  if (hasDispatchPtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(DispatchPtrID);

  if (hasQueuePtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(QueuePtrID);

  if (hasKernargSegmentPtr())
    NumUsedUserSGPRs += getNumUserSGPRForField(KernargSegmentPtrID);

  if (hasDispatchID())
    NumUsedUserSGPRs += getNumUserSGPRForField(DispatchIdID);

  if (hasFlatScratchInit())
    NumUsedUserSGPRs += getNumUserSGPRForField(FlatScratchInitID);
}

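// Reserve NumSGPRs additional user SGPRs for kernel argument preloading; the
// total must stay within the subtarget's user SGPR budget.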
void GCNUserSGPRUsageInfo::allocKernargPreloadSGPRs(unsigned NumSGPRs) {
  assert(NumKernargPreloadSGPRs + NumSGPRs <= AMDGPU::getMaxNumUserSGPRs(ST));
  NumKernargPreloadSGPRs += NumSGPRs;
  NumUsedUserSGPRs += NumSGPRs;
}

unsigned GCNUserSGPRUsageInfo::getNumFreeUserSGPRs() {
  return AMDGPU::getMaxNumUserSGPRs(ST) - NumUsedUserSGPRs;
}
