1 //===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Implements the AMDGPU specific subclass of TargetSubtarget.
11 //
12 //===----------------------------------------------------------------------===//
13
14 #include "AMDGPUSubtarget.h"
15 #include "AMDGPU.h"
16 #include "AMDGPUCallLowering.h"
17 #include "AMDGPUInstructionSelector.h"
18 #include "AMDGPULegalizerInfo.h"
19 #include "AMDGPURegisterBankInfo.h"
20 #include "AMDGPUTargetMachine.h"
21 #include "SIMachineFunctionInfo.h"
22 #include "Utils/AMDGPUBaseInfo.h"
23 #include "llvm/ADT/SmallString.h"
24 #include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
25 #include "llvm/CodeGen/MachineScheduler.h"
26 #include "llvm/CodeGen/TargetFrameLowering.h"
27 #include "llvm/IR/IntrinsicsAMDGPU.h"
28 #include "llvm/IR/IntrinsicsR600.h"
29 #include "llvm/IR/MDBuilder.h"
30 #include "llvm/MC/MCSubtargetInfo.h"
31 #include <algorithm>
32
33 using namespace llvm;
34
35 #define DEBUG_TYPE "amdgpu-subtarget"
36
37 #define GET_SUBTARGETINFO_TARGET_DESC
38 #define GET_SUBTARGETINFO_CTOR
39 #define AMDGPUSubtarget GCNSubtarget
40 #include "AMDGPUGenSubtargetInfo.inc"
41 #define GET_SUBTARGETINFO_TARGET_DESC
42 #define GET_SUBTARGETINFO_CTOR
43 #undef AMDGPUSubtarget
44 #include "R600GenSubtargetInfo.inc"
45
// Debug/tuning knobs for the AMDGPU backend. These are command-line options,
// not subtarget features, so they apply uniformly regardless of -mcpu.
static cl::opt<bool> DisablePowerSched(
  "amdgpu-disable-power-sched",
  cl::desc("Disable scheduling to minimize mAI power bursts"),
  cl::init(false));

// Off by default; see GCNSubtarget::useVGPRIndexMode() for how this interacts
// with movrel availability.
static cl::opt<bool> EnableVGPRIndexMode(
  "amdgpu-vgpr-index-mode",
  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
  cl::init(false));

// Off by default; see GCNSubtarget::enableFlatScratch() for how this interacts
// with architected flat scratch.
static cl::opt<bool> EnableFlatScratch(
  "amdgpu-enable-flat-scratch",
  cl::desc("Use flat scratch instructions"),
  cl::init(false));

// On by default; queried via GCNSubtarget::useAA().
static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
                           cl::desc("Enable the use of AA during codegen."),
                           cl::init(true));

// Out-of-line so the vtable is emitted in this translation unit.
GCNSubtarget::~GCNSubtarget() = default;
66
/// Parse the feature string and derive feature bits that depend on the
/// selected generation. Returns *this so it can be used in the constructor's
/// member-initializer list (see the R600Subtarget constructor).
R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  // promote-alloca is on by default; a "-promote-alloca" in FS (appended
  // after) still overrides it.
  SmallString<256> FullFS("+promote-alloca,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  // These depend on the generation parsed above.
  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}
79
/// Parse the feature string, apply OS/target defaults, resolve mutually
/// exclusive features, and patch up inconsistent combinations. Returns *this
/// so it can be used in the constructor's member-initializer list.
GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled

  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,");

  // Turn on features that HSA ABI requires. Also turn on FlatForGlobal by default
  if (isAmdHsaOS())
    FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits: if the user requested any wavefront size
  // explicitly, disable the sizes they did NOT mention so only one wins.
  if (FS.find_insensitive("+wavefrontsize") != StringRef::npos) {
    if (FS.find_insensitive("wavefrontsize16") == StringRef::npos)
      FullFS += "-wavefrontsize16,";
    if (FS.find_insensitive("wavefrontsize32") == StringRef::npos)
      FullFS += "-wavefrontsize32,";
    if (FS.find_insensitive("wavefrontsize64") == StringRef::npos)
      FullFS += "-wavefrontsize64,";
  }

  // User features go last so they override the defaults above.
  FullFS += FS;

  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  // Implement the "generic" processors, which acts as the default when no
  // generation features are enabled (e.g for -mcpu=''). HSA OS defaults to
  // the first amdgcn target that supports flat addressing. Other OSes defaults
  // to the first amdgcn target.
  if (Gen == AMDGPUSubtarget::INVALID) {
    Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
                                       : AMDGPUSubtarget::SOUTHERN_ISLANDS;
  }

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Targets must either support 64-bit offsets for MUBUF instructions, and/or
  // support flat operations, otherwise they cannot access a 64-bit global
  // address space
  assert(hasAddr64() || hasFlat());
  // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
  // that do not support ADDR64 variants of MUBUF instructions. Such targets
  // cannot use a 64 bit offset with a MUBUF instruction to access the global
  // address space
  if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = true;
  }
  // Unless +-flat-for-global is specified, use MUBUF instructions for global
  // address space access if flat operations are not available.
  if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices. Default to wave64 (log2 == 5).
  if (WavefrontSizeLog2 == 0)
    WavefrontSizeLog2 = 5;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
  HasSMulHi = getGeneration() >= AMDGPUSubtarget::GFX9;

  // Derive the xnack/sramecc target-ID settings from the raw feature string.
  TargetID.setTargetIDFromFeaturesString(FS);

  LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
                    << TargetID.getXnackSetting() << '\n');
  LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
                    << TargetID.getSramEccSetting() << '\n');

  return *this;
}
177
/// Base-class constructor: establish conservative defaults shared by the R600
/// and GCN subtargets. Derived constructors and feature parsing overwrite
/// these with target-specific values.
AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  GCN3Encoding(false),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  HasMadMacF32Insts(false),
  HasDsSrc2Insts(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasSMulHi(false),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  MaxWavesPerEU(10),
  LocalMemorySize(0),
  // 0 marks "unknown"; initializeSubtargetDependencies defaults it to wave64.
  WavefrontSizeLog2(0)
  { }
198
/// GCN subtarget constructor. All feature flags start false and are set by
/// ParseSubtargetFeatures inside initializeSubtargetDependencies, which is
/// deliberately invoked from the InstrInfo member initializer below — so it
/// must run before any member that depends on parsed features. Do not reorder
/// the initializer list.
GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM)
    : // clang-format off
    AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    TargetID(*this),
    Gen(INVALID),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    FastDenormalF32(false),
    HalfRate64Ops(false),
    FullRate64Ops(false),

    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    UnalignedScratchAccess(false),
    UnalignedAccessMode(false),

    HasApertureRegs(false),
    SupportsXNACK(false),
    EnableXNACK(false),
    EnableTgSplit(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX90AInsts(false),
    GFX10Insts(false),
    GFX10_3Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    NegativeScratchOffsetBug(false),
    NegativeUnalignedScratchOffsetBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    Has64BitDPP(false),
    HasPackedFP32Ops(false),
    HasExtendedImageInsts(false),
    HasR128A16(false),
    HasGFX10A16(false),
    HasG16(false),
    HasNSAEncoding(false),
    NSAMaxSize(0),
    GFX10_AEncoding(false),
    GFX10_BEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot3Insts(false),
    HasDot4Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    HasDot7Insts(false),
    HasMAIInsts(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    SupportsSRAMECC(false),
    EnableSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasGetWaveIdInst(false),
    HasSMemTimeInst(false),
    HasShaderCyclesRegister(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    HasArchitectedFlatScratch(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),
    HasMFMAInlineLiteralBug(false),
    UnalignedBufferAccess(false),
    UnalignedDSAccess(false),
    HasPackedTID(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasNSAClauseBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),
    HasImageStoreD16Bug(false),
    HasImageGather4D16Bug(false),

    FeatureDisable(false),
    // Side effect: parses features and fixes up the subtarget state.
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  // clang-format on
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  // GlobalISel support objects; the instruction selector needs the concrete
  // register-bank info, hence the cast.
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
  *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}
333
enableFlatScratch() const334 bool GCNSubtarget::enableFlatScratch() const {
335 return flatScratchIsArchitected() ||
336 (EnableFlatScratch && hasFlatScratchInsts());
337 }
338
getConstantBusLimit(unsigned Opcode) const339 unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
340 if (getGeneration() < GFX10)
341 return 1;
342
343 switch (Opcode) {
344 case AMDGPU::V_LSHLREV_B64_e64:
345 case AMDGPU::V_LSHLREV_B64_gfx10:
346 case AMDGPU::V_LSHL_B64_e64:
347 case AMDGPU::V_LSHRREV_B64_e64:
348 case AMDGPU::V_LSHRREV_B64_gfx10:
349 case AMDGPU::V_LSHR_B64_e64:
350 case AMDGPU::V_ASHRREV_I64_e64:
351 case AMDGPU::V_ASHRREV_I64_gfx10:
352 case AMDGPU::V_ASHR_I64_e64:
353 return 1;
354 }
355
356 return 2;
357 }
358
/// This list was mostly derived from experimentation.
/// Returns true if, on this subtarget, \p Opcode writes 0 to the high 16 bits
/// of its 32-bit destination register (rather than preserving them).
bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const {
  switch (Opcode) {
  case AMDGPU::V_CVT_F16_F32_e32:
  case AMDGPU::V_CVT_F16_F32_e64:
  case AMDGPU::V_CVT_F16_U16_e32:
  case AMDGPU::V_CVT_F16_U16_e64:
  case AMDGPU::V_CVT_F16_I16_e32:
  case AMDGPU::V_CVT_F16_I16_e64:
  case AMDGPU::V_RCP_F16_e64:
  case AMDGPU::V_RCP_F16_e32:
  case AMDGPU::V_RSQ_F16_e64:
  case AMDGPU::V_RSQ_F16_e32:
  case AMDGPU::V_SQRT_F16_e64:
  case AMDGPU::V_SQRT_F16_e32:
  case AMDGPU::V_LOG_F16_e64:
  case AMDGPU::V_LOG_F16_e32:
  case AMDGPU::V_EXP_F16_e64:
  case AMDGPU::V_EXP_F16_e32:
  case AMDGPU::V_SIN_F16_e64:
  case AMDGPU::V_SIN_F16_e32:
  case AMDGPU::V_COS_F16_e64:
  case AMDGPU::V_COS_F16_e32:
  case AMDGPU::V_FLOOR_F16_e64:
  case AMDGPU::V_FLOOR_F16_e32:
  case AMDGPU::V_CEIL_F16_e64:
  case AMDGPU::V_CEIL_F16_e32:
  case AMDGPU::V_TRUNC_F16_e64:
  case AMDGPU::V_TRUNC_F16_e32:
  case AMDGPU::V_RNDNE_F16_e64:
  case AMDGPU::V_RNDNE_F16_e32:
  case AMDGPU::V_FRACT_F16_e64:
  case AMDGPU::V_FRACT_F16_e32:
  case AMDGPU::V_FREXP_MANT_F16_e64:
  case AMDGPU::V_FREXP_MANT_F16_e32:
  case AMDGPU::V_FREXP_EXP_I16_F16_e64:
  case AMDGPU::V_FREXP_EXP_I16_F16_e32:
  case AMDGPU::V_LDEXP_F16_e64:
  case AMDGPU::V_LDEXP_F16_e32:
  case AMDGPU::V_LSHLREV_B16_e64:
  case AMDGPU::V_LSHLREV_B16_e32:
  case AMDGPU::V_LSHRREV_B16_e64:
  case AMDGPU::V_LSHRREV_B16_e32:
  case AMDGPU::V_ASHRREV_I16_e64:
  case AMDGPU::V_ASHRREV_I16_e32:
  case AMDGPU::V_ADD_U16_e64:
  case AMDGPU::V_ADD_U16_e32:
  case AMDGPU::V_SUB_U16_e64:
  case AMDGPU::V_SUB_U16_e32:
  case AMDGPU::V_SUBREV_U16_e64:
  case AMDGPU::V_SUBREV_U16_e32:
  case AMDGPU::V_MUL_LO_U16_e64:
  case AMDGPU::V_MUL_LO_U16_e32:
  case AMDGPU::V_ADD_F16_e64:
  case AMDGPU::V_ADD_F16_e32:
  case AMDGPU::V_SUB_F16_e64:
  case AMDGPU::V_SUB_F16_e32:
  case AMDGPU::V_SUBREV_F16_e64:
  case AMDGPU::V_SUBREV_F16_e32:
  case AMDGPU::V_MUL_F16_e64:
  case AMDGPU::V_MUL_F16_e32:
  case AMDGPU::V_MAX_F16_e64:
  case AMDGPU::V_MAX_F16_e32:
  case AMDGPU::V_MIN_F16_e64:
  case AMDGPU::V_MIN_F16_e32:
  case AMDGPU::V_MAX_U16_e64:
  case AMDGPU::V_MAX_U16_e32:
  case AMDGPU::V_MIN_U16_e64:
  case AMDGPU::V_MIN_U16_e32:
  case AMDGPU::V_MAX_I16_e64:
  case AMDGPU::V_MAX_I16_e32:
  case AMDGPU::V_MIN_I16_e64:
  case AMDGPU::V_MIN_I16_e32:
    // On gfx10, all 16-bit instructions preserve the high bits.
    return getGeneration() <= AMDGPUSubtarget::GFX9;
  case AMDGPU::V_MAD_F16_e64:
  case AMDGPU::V_MADAK_F16:
  case AMDGPU::V_MADMK_F16:
  case AMDGPU::V_MAC_F16_e64:
  case AMDGPU::V_MAC_F16_e32:
  case AMDGPU::V_FMAMK_F16:
  case AMDGPU::V_FMAAK_F16:
  case AMDGPU::V_MAD_U16_e64:
  case AMDGPU::V_MAD_I16_e64:
  case AMDGPU::V_FMA_F16_e64:
  case AMDGPU::V_FMAC_F16_e64:
  case AMDGPU::V_FMAC_F16_e32:
  case AMDGPU::V_DIV_FIXUP_F16_e64:
    // In gfx9, the preferred handling of the unused high 16-bits changed. Most
    // instructions maintain the legacy behavior of 0ing. Some instructions
    // changed to preserving the high bits.
    return getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
  case AMDGPU::V_MAD_MIXLO_F16:
  case AMDGPU::V_MAD_MIXHI_F16:
  default:
    return false;
  }
}
457
getMaxLocalMemSizeWithWaveCount(unsigned NWaves,const Function & F) const458 unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
459 const Function &F) const {
460 if (NWaves == 1)
461 return getLocalMemorySize();
462 unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
463 unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
464 if (!WorkGroupsPerCu)
465 return 0;
466 unsigned MaxWaves = getMaxWavesPerEU();
467 return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
468 }
469
470 // FIXME: Should return min,max range.
getOccupancyWithLocalMemSize(uint32_t Bytes,const Function & F) const471 unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
472 const Function &F) const {
473 const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
474 const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
475 if (!MaxWorkGroupsPerCu)
476 return 0;
477
478 const unsigned WaveSize = getWavefrontSize();
479
480 // FIXME: Do we need to account for alignment requirement of LDS rounding the
481 // size up?
482 // Compute restriction based on LDS usage
483 unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);
484
485 // This can be queried with more LDS than is possible, so just assume the
486 // worst.
487 if (NumGroups == 0)
488 return 1;
489
490 NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);
491
492 // Round to the number of waves.
493 const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
494 unsigned MaxWaves = NumGroups * MaxGroupNumWaves;
495
496 // Clamp to the maximum possible number of waves.
497 MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());
498
499 // FIXME: Needs to be a multiple of the group size?
500 //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);
501
502 assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
503 "computed invalid occupancy");
504 return MaxWaves;
505 }
506
507 unsigned
getOccupancyWithLocalMemSize(const MachineFunction & MF) const508 AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
509 const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
510 return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
511 }
512
513 std::pair<unsigned, unsigned>
getDefaultFlatWorkGroupSize(CallingConv::ID CC) const514 AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
515 switch (CC) {
516 case CallingConv::AMDGPU_VS:
517 case CallingConv::AMDGPU_LS:
518 case CallingConv::AMDGPU_HS:
519 case CallingConv::AMDGPU_ES:
520 case CallingConv::AMDGPU_GS:
521 case CallingConv::AMDGPU_PS:
522 return std::make_pair(1, getWavefrontSize());
523 default:
524 return std::make_pair(1u, getMaxFlatWorkGroupSize());
525 }
526 }
527
getFlatWorkGroupSizes(const Function & F) const528 std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
529 const Function &F) const {
530 // Default minimum/maximum flat work group sizes.
531 std::pair<unsigned, unsigned> Default =
532 getDefaultFlatWorkGroupSize(F.getCallingConv());
533
534 // Requested minimum/maximum flat work group sizes.
535 std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
536 F, "amdgpu-flat-work-group-size", Default);
537
538 // Make sure requested minimum is less than requested maximum.
539 if (Requested.first > Requested.second)
540 return Default;
541
542 // Make sure requested values do not violate subtarget's specifications.
543 if (Requested.first < getMinFlatWorkGroupSize())
544 return Default;
545 if (Requested.second > getMaxFlatWorkGroupSize())
546 return Default;
547
548 return Requested;
549 }
550
getWavesPerEU(const Function & F) const551 std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
552 const Function &F) const {
553 // Default minimum/maximum number of waves per execution unit.
554 std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());
555
556 // Default/requested minimum/maximum flat work group sizes.
557 std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);
558
559 // If minimum/maximum flat work group sizes were explicitly requested using
560 // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
561 // number of waves per execution unit to values implied by requested
562 // minimum/maximum flat work group sizes.
563 unsigned MinImpliedByFlatWorkGroupSize =
564 getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
565 Default.first = MinImpliedByFlatWorkGroupSize;
566 bool RequestedFlatWorkGroupSize =
567 F.hasFnAttribute("amdgpu-flat-work-group-size");
568
569 // Requested minimum/maximum number of waves per execution unit.
570 std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
571 F, "amdgpu-waves-per-eu", Default, true);
572
573 // Make sure requested minimum is less than requested maximum.
574 if (Requested.second && Requested.first > Requested.second)
575 return Default;
576
577 // Make sure requested values do not violate subtarget's specifications.
578 if (Requested.first < getMinWavesPerEU() ||
579 Requested.second > getMaxWavesPerEU())
580 return Default;
581
582 // Make sure requested values are compatible with values implied by requested
583 // minimum/maximum flat work group sizes.
584 if (RequestedFlatWorkGroupSize &&
585 Requested.first < MinImpliedByFlatWorkGroupSize)
586 return Default;
587
588 return Requested;
589 }
590
getReqdWorkGroupSize(const Function & Kernel,unsigned Dim)591 static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
592 auto Node = Kernel.getMetadata("reqd_work_group_size");
593 if (Node && Node->getNumOperands() == 3)
594 return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
595 return std::numeric_limits<unsigned>::max();
596 }
597
isMesaKernel(const Function & F) const598 bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
599 return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
600 }
601
getMaxWorkitemID(const Function & Kernel,unsigned Dimension) const602 unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
603 unsigned Dimension) const {
604 unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
605 if (ReqdSize != std::numeric_limits<unsigned>::max())
606 return ReqdSize - 1;
607 return getFlatWorkGroupSizes(Kernel).second - 1;
608 }
609
/// Attach !range metadata to \p I when it is a local-ID or local-size
/// intrinsic call, bounding it by the kernel's work group size. Returns true
/// if metadata was added.
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  // IdQuery: true for workitem-ID intrinsics (range is [0, size)), false for
  // local-size intrinsics (range is [0, size]).
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      // The ID cases fall through into the matching size case to set Dim.
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }

      // Dim is 0-2 when matched above, UINT_MAX otherwise.
      if (Dim <= 3) {
        unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
        if (ReqdSize != std::numeric_limits<unsigned>::max())
          MinSize = MaxSize = ReqdSize;
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}
671
getImplicitArgNumBytes(const Function & F) const672 unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
673 if (isMesaKernel(F))
674 return 16;
675 return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 0);
676 }
677
getExplicitKernArgSize(const Function & F,Align & MaxAlign) const678 uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
679 Align &MaxAlign) const {
680 assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
681 F.getCallingConv() == CallingConv::SPIR_KERNEL);
682
683 const DataLayout &DL = F.getParent()->getDataLayout();
684 uint64_t ExplicitArgBytes = 0;
685 MaxAlign = Align(1);
686
687 for (const Argument &Arg : F.args()) {
688 const bool IsByRef = Arg.hasByRefAttr();
689 Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
690 MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None;
691 if (!Alignment)
692 Alignment = DL.getABITypeAlign(ArgTy);
693
694 uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
695 ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
696 MaxAlign = max(MaxAlign, Alignment);
697 }
698
699 return ExplicitArgBytes;
700 }
701
getKernArgSegmentSize(const Function & F,Align & MaxAlign) const702 unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
703 Align &MaxAlign) const {
704 uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);
705
706 unsigned ExplicitOffset = getExplicitKernelArgOffset(F);
707
708 uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
709 unsigned ImplicitBytes = getImplicitArgNumBytes(F);
710 if (ImplicitBytes != 0) {
711 const Align Alignment = getAlignmentForImplicitArgPtr();
712 TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
713 }
714
715 // Being able to dereference past the end is useful for emitting scalar loads.
716 return alignTo(TotalSize, 4);
717 }
718
getAMDGPUDwarfFlavour() const719 AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
720 return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32
721 : AMDGPUDwarfFlavour::Wave64;
722 }
723
/// R600 subtarget constructor. Feature flags start false/zero; the TLInfo
/// member initializer deliberately invokes initializeSubtargetDependencies,
/// which parses the feature string — do not reorder the initializer list.
R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, /*TuneCPU*/GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  // Side effect: parses features and fixes up the subtarget state.
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }
740
overrideSchedPolicy(MachineSchedPolicy & Policy,unsigned NumRegionInstrs) const741 void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
742 unsigned NumRegionInstrs) const {
743 // Track register pressure so the scheduler can try to decrease
744 // pressure once register usage is above the threshold defined by
745 // SIRegisterInfo::getRegPressureSetLimit()
746 Policy.ShouldTrackPressure = true;
747
748 // Enabling both top down and bottom up scheduling seems to give us less
749 // register spills than just using one of these approaches on its own.
750 Policy.OnlyTopDown = false;
751 Policy.OnlyBottomUp = false;
752
753 // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
754 if (!enableSIScheduler())
755 Policy.ShouldTrackLaneMasks = true;
756 }
757
hasMadF16() const758 bool GCNSubtarget::hasMadF16() const {
759 return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
760 }
761
useVGPRIndexMode() const762 bool GCNSubtarget::useVGPRIndexMode() const {
763 return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
764 }
765
useAA() const766 bool GCNSubtarget::useAA() const { return UseAA; }
767
getOccupancyWithNumSGPRs(unsigned SGPRs) const768 unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
769 if (getGeneration() >= AMDGPUSubtarget::GFX10)
770 return getMaxWavesPerEU();
771
772 if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
773 if (SGPRs <= 80)
774 return 10;
775 if (SGPRs <= 88)
776 return 9;
777 if (SGPRs <= 100)
778 return 8;
779 return 7;
780 }
781 if (SGPRs <= 48)
782 return 10;
783 if (SGPRs <= 56)
784 return 9;
785 if (SGPRs <= 64)
786 return 8;
787 if (SGPRs <= 72)
788 return 7;
789 if (SGPRs <= 80)
790 return 6;
791 return 5;
792 }
793
getOccupancyWithNumVGPRs(unsigned VGPRs) const794 unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
795 unsigned MaxWaves = getMaxWavesPerEU();
796 unsigned Granule = getVGPRAllocGranule();
797 if (VGPRs < Granule)
798 return MaxWaves;
799 unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
800 return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
801 }
802
803 unsigned
getBaseReservedNumSGPRs(const bool HasFlatScratchInit) const804 GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratchInit) const {
805 if (getGeneration() >= AMDGPUSubtarget::GFX10)
806 return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.
807
808 if (HasFlatScratchInit) {
809 if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
810 return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
811 if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
812 return 4; // FLAT_SCRATCH, VCC (in that order).
813 }
814
815 if (isXNACKEnabled())
816 return 4; // XNACK, VCC (in that order).
817 return 2; // VCC.
818 }
819
getReservedNumSGPRs(const MachineFunction & MF) const820 unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
821 const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
822 return getBaseReservedNumSGPRs(MFI.hasFlatScratchInit());
823 }
824
/// Reserved SGPR count estimated from an IR Function alone (no
/// MachineFunction available).
unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
  // The logic to detect if the function has
  // flat scratch init is slightly different than how
  // SIMachineFunctionInfo constructor derives.
  // We don't use amdgpu-calls, amdgpu-stack-objects
  // attributes and isAmdHsaOrMesa here as it doesn't really matter.
  // TODO: Outline this derivation logic and have just
  // one common function in the backend to avoid duplication.
  bool isEntry = AMDGPU::isEntryFunctionCC(F.getCallingConv());
  bool FunctionHasFlatScratchInit = false;
  // Flat scratch init applies only to entry functions on flat-capable targets
  // whose flat scratch is software-initialized (not architected).
  if (hasFlatAddressSpace() && isEntry && !flatScratchIsArchitected() &&
      enableFlatScratch()) {
    FunctionHasFlatScratchInit = true;
  }
  return getBaseReservedNumSGPRs(FunctionHasFlatScratchInit);
}
841
computeOccupancy(const Function & F,unsigned LDSSize,unsigned NumSGPRs,unsigned NumVGPRs) const842 unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
843 unsigned NumSGPRs,
844 unsigned NumVGPRs) const {
845 unsigned Occupancy =
846 std::min(getMaxWavesPerEU(),
847 getOccupancyWithLocalMemSize(LDSSize, F));
848 if (NumSGPRs)
849 Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
850 if (NumVGPRs)
851 Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
852 return Occupancy;
853 }
854
getBaseMaxNumSGPRs(const Function & F,std::pair<unsigned,unsigned> WavesPerEU,unsigned PreloadedSGPRs,unsigned ReservedNumSGPRs) const855 unsigned GCNSubtarget::getBaseMaxNumSGPRs(
856 const Function &F, std::pair<unsigned, unsigned> WavesPerEU,
857 unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const {
858 // Compute maximum number of SGPRs function can use using default/requested
859 // minimum number of waves per execution unit.
860 unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
861 unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);
862
863 // Check if maximum number of SGPRs was explicitly requested using
864 // "amdgpu-num-sgpr" attribute.
865 if (F.hasFnAttribute("amdgpu-num-sgpr")) {
866 unsigned Requested = AMDGPU::getIntegerAttribute(
867 F, "amdgpu-num-sgpr", MaxNumSGPRs);
868
869 // Make sure requested value does not violate subtarget's specifications.
870 if (Requested && (Requested <= ReservedNumSGPRs))
871 Requested = 0;
872
873 // If more SGPRs are required to support the input user/system SGPRs,
874 // increase to accommodate them.
875 //
876 // FIXME: This really ends up using the requested number of SGPRs + number
877 // of reserved special registers in total. Theoretically you could re-use
878 // the last input registers for these special registers, but this would
879 // require a lot of complexity to deal with the weird aliasing.
880 unsigned InputNumSGPRs = PreloadedSGPRs;
881 if (Requested && Requested < InputNumSGPRs)
882 Requested = InputNumSGPRs;
883
884 // Make sure requested value is compatible with values implied by
885 // default/requested minimum/maximum number of waves per execution unit.
886 if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
887 Requested = 0;
888 if (WavesPerEU.second &&
889 Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
890 Requested = 0;
891
892 if (Requested)
893 MaxNumSGPRs = Requested;
894 }
895
896 if (hasSGPRInitBug())
897 MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
898
899 return std::min(MaxNumSGPRs - ReservedNumSGPRs, MaxAddressableNumSGPRs);
900 }
901
getMaxNumSGPRs(const MachineFunction & MF) const902 unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
903 const Function &F = MF.getFunction();
904 const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
905 return getBaseMaxNumSGPRs(F, MFI.getWavesPerEU(), MFI.getNumPreloadedSGPRs(),
906 getReservedNumSGPRs(MF));
907 }
908
// Worst-case number of SGPRs that can be preloaded for a kernel, used when
// no machine function is available to give an exact count.
static unsigned getMaxNumPreloadedSGPRs() {
  // User SGPRs: private segment buffer (4), dispatch ptr (2), queue ptr (2),
  // kernarg segment ptr (2), dispatch ID (2), flat scratch init (2), and
  // implicit buffer ptr (2).
  const unsigned MaxUserSGPRs = 4 + 2 + 2 + 2 + 2 + 2 + 2;

  // System SGPRs: workgroup IDs X/Y/Z, workgroup info, and the private
  // segment wave byte offset (1 each).
  const unsigned MaxSystemSGPRs = 1 + 1 + 1 + 1 + 1;

  return MaxUserSGPRs + MaxSystemSGPRs;
}
926
getMaxNumSGPRs(const Function & F) const927 unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
928 return getBaseMaxNumSGPRs(F, getWavesPerEU(F), getMaxNumPreloadedSGPRs(),
929 getReservedNumSGPRs(F));
930 }
931
getBaseMaxNumVGPRs(const Function & F,std::pair<unsigned,unsigned> WavesPerEU) const932 unsigned GCNSubtarget::getBaseMaxNumVGPRs(
933 const Function &F, std::pair<unsigned, unsigned> WavesPerEU) const {
934 // Compute maximum number of VGPRs function can use using default/requested
935 // minimum number of waves per execution unit.
936 unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);
937
938 // Check if maximum number of VGPRs was explicitly requested using
939 // "amdgpu-num-vgpr" attribute.
940 if (F.hasFnAttribute("amdgpu-num-vgpr")) {
941 unsigned Requested = AMDGPU::getIntegerAttribute(
942 F, "amdgpu-num-vgpr", MaxNumVGPRs);
943
944 if (hasGFX90AInsts())
945 Requested *= 2;
946
947 // Make sure requested value is compatible with values implied by
948 // default/requested minimum/maximum number of waves per execution unit.
949 if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
950 Requested = 0;
951 if (WavesPerEU.second &&
952 Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
953 Requested = 0;
954
955 if (Requested)
956 MaxNumVGPRs = Requested;
957 }
958
959 return MaxNumVGPRs;
960 }
961
getMaxNumVGPRs(const Function & F) const962 unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
963 return getBaseMaxNumVGPRs(F, getWavesPerEU(F));
964 }
965
getMaxNumVGPRs(const MachineFunction & MF) const966 unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
967 const Function &F = MF.getFunction();
968 const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
969 return getBaseMaxNumVGPRs(F, MFI.getWavesPerEU());
970 }
971
/// Adjust the latency of a data dependency when one endpoint is a bundle
/// header, so the scheduler charges the latency of the instruction inside
/// the bundle that actually defines/reads the register instead of the
/// bundle header's latency.
void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
                                         int UseOpIdx, SDep &Dep) const {
  // Only register data dependencies between two real instructions are
  // adjusted here.
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
      !Def->isInstr() || !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

  if (DefI->isBundle()) {
    // Def is a bundle header: scan the bundled instructions, taking the
    // latency of the last writer of the register and decrementing it for
    // each subsequent bundled instruction (each one covers a cycle of the
    // writer's latency).
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    // Use is a bundle header: start from the def's latency and subtract one
    // cycle for every bundled instruction issued before the first reader of
    // the register (stopping early once the latency is fully covered).
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  }
}
1008
namespace {
/// DAG mutation that adds artificial edges so independent SALU instructions
/// are scheduled into the latency shadow of long-latency MFMA instructions,
/// instead of VALU instructions — intended to avoid power bursts (see the
/// comment in apply()).
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  // True if SU is a non-terminator SALU instruction.
  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  // True if SU is a VALU instruction.
  bool isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }

  /// Conservatively check that adding an edge Pred -> Succ cannot create a
  /// cycle: collect Succ's transitive successors, then walk Pred's
  /// transitive predecessors and fail if any of them is in that set.
  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
    // Node numbering follows the original order; a lower-numbered Pred can
    // never be reached from Succ, so the edge is trivially safe.
    if (Pred->NodeNum < Succ->NodeNum)
      return true;

    SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});

    // Expand Succs to the full transitive successor set of Succ.
    for (unsigned I = 0; I < Succs.size(); ++I) {
      for (const SDep &SI : Succs[I]->Succs) {
        const SUnit *SU = SI.getSUnit();
        if (SU != Succs[I] && !llvm::is_contained(Succs, SU))
          Succs.push_back(SU);
      }
    }

    // Walk Pred's transitive predecessors; hitting a member of Succs means
    // the new edge would close a cycle.
    SmallPtrSet<const SUnit*, 32> Visited;
    while (!Preds.empty()) {
      const SUnit *SU = Preds.pop_back_val();
      if (llvm::is_contained(Succs, SU))
        return false;
      Visited.insert(SU);
      for (const SDep &SI : SU->Preds)
        if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
          Preds.push_back(SI.getSUnit());
    }

    return true;
  }

  // Link as much SALU instructions in chain as possible. Return the size
  // of the chain. Links up to MaxChain instructions.
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n" ; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      // Artificial edge From(MFMA) -> SU(SALU) forces SU after the MFMA.
      if (SU->addPred(SDep(From, SDep::Artificial), false))
        ++Linked;

      // Also push the MFMA's VALU successors after SU, so VALU work does
      // not land in the shadow we are trying to fill with SALU.
      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
          SUv->addPred(SDep(SU, SDep::Artificial), false);
      }

      // Continue the chain through SU's SALU successors.
      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    // Only relevant on subtargets with MAI instructions, and can be turned
    // off with -amdgpu-disable-power-sched.
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts() || DisablePowerSched)
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for MFMA long latency instructions and try to add a dependency
    // of available SALU instructions to give them a chance to fill MFMA
    // shadow. That is desirable to fill MFMA shadow with SALU instructions
    // rather than VALU to prevent power consumption bursts and throttle.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      // Skip non-MAI instructions and the cheap accvgpr copies.
      if (!TII->isMAI(MAI) ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
          MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace
1133
getPostRAMutations(std::vector<std::unique_ptr<ScheduleDAGMutation>> & Mutations) const1134 void GCNSubtarget::getPostRAMutations(
1135 std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
1136 Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
1137 }
1138
get(const MachineFunction & MF)1139 const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
1140 if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
1141 return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
1142 else
1143 return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
1144 }
1145
get(const TargetMachine & TM,const Function & F)1146 const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
1147 if (TM.getTargetTriple().getArch() == Triple::amdgcn)
1148 return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
1149 else
1150 return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
1151 }
1152