//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

static cl::opt<bool> DisablePowerSched(
  "amdgpu-disable-power-sched",
  cl::desc("Disable scheduling to minimize mAI power bursts"),
  cl::init(false));

static cl::opt<bool> EnableVGPRIndexMode(
  "amdgpu-vgpr-index-mode",
  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
  cl::init(false));

static cl::opt<bool> EnableFlatScratch(
  "amdgpu-enable-flat-scratch",
  cl::desc("Use flat scratch instructions"),
  cl::init(false));

static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
                           cl::desc("Enable the use of AA during codegen."),
                           cl::init(true));

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly, we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,");

  // Turn on features that the HSA ABI requires. Also turn on FlatForGlobal by
  // default.
  if (isAmdHsaOS())
    FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS.

  // Disable mutually exclusive bits.
  if (FS.find_insensitive("+wavefrontsize") != StringRef::npos) {
    if (FS.find_insensitive("wavefrontsize16") == StringRef::npos)
      FullFS += "-wavefrontsize16,";
    if (FS.find_insensitive("wavefrontsize32") == StringRef::npos)
      FullFS += "-wavefrontsize32,";
    if (FS.find_insensitive("wavefrontsize64") == StringRef::npos)
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);

  // Implement the "generic" processors, which act as the default when no
  // generation features are enabled (e.g. for -mcpu=''). HSA OS defaults to
  // the first amdgcn target that supports flat addressing. Other OSes default
  // to the first amdgcn target.
  if (Gen == AMDGPUSubtarget::INVALID) {
    Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS
                                       : AMDGPUSubtarget::SOUTHERN_ISLANDS;
  }

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Targets must support 64-bit offsets for MUBUF instructions and/or flat
  // operations; otherwise they cannot access a 64-bit global address space.
  assert(hasAddr64() || hasFlat());
  // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
  // that do not support ADDR64 variants of MUBUF instructions. Such targets
  // cannot use a 64-bit offset with a MUBUF instruction to access the global
  // address space.
  if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = true;
  }
  // Unless +-flat-for-global is specified, use MUBUF instructions for global
  // address space access if flat operations are not available.
  if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) {
    ToggleFeature(AMDGPU::FeatureFlatForGlobal);
    FlatForGlobal = false;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for an unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSizeLog2 == 0)
    WavefrontSizeLog2 = 5;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
  HasSMulHi = getGeneration() >= AMDGPUSubtarget::GFX9;

  TargetID.setTargetIDFromFeaturesString(FS);

  LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
                    << TargetID.getXnackSetting() << '\n');
  LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
                    << TargetID.getSramEccSetting() << '\n');

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  GCN3Encoding(false),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  HasMadMacF32Insts(false),
  HasDsSrc2Insts(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasSMulHi(false),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  MaxWavesPerEU(10),
  LocalMemorySize(0),
  WavefrontSizeLog2(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM)
    : // clang-format off
    AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    TargetID(*this),
    Gen(INVALID),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    FastDenormalF32(false),
    HalfRate64Ops(false),
    FullRate64Ops(false),

    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    UnalignedScratchAccess(false),
    UnalignedAccessMode(false),

    HasApertureRegs(false),
    SupportsXNACK(false),
    EnableXNACK(false),
    EnableTgSplit(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX90AInsts(false),
    GFX10Insts(false),
    GFX10_3Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    NegativeScratchOffsetBug(false),
    NegativeUnalignedScratchOffsetBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    Has64BitDPP(false),
    HasPackedFP32Ops(false),
    HasExtendedImageInsts(false),
    HasR128A16(false),
    HasGFX10A16(false),
    HasG16(false),
    HasNSAEncoding(false),
    NSAMaxSize(0),
    GFX10_AEncoding(false),
    GFX10_BEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot3Insts(false),
    HasDot4Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    HasDot7Insts(false),
    HasMAIInsts(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    SupportsSRAMECC(false),
    EnableSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasGetWaveIdInst(false),
    HasSMemTimeInst(false),
    HasShaderCyclesRegister(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    HasArchitectedFlatScratch(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),
    HasMFMAInlineLiteralBug(false),
    UnalignedBufferAccess(false),
    UnalignedDSAccess(false),
    HasPackedTID(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasNSAClauseBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),
    HasImageStoreD16Bug(false),
    HasImageGather4D16Bug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  // clang-format on
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
  *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

bool GCNSubtarget::enableFlatScratch() const {
  return flatScratchIsArchitected() ||
         (EnableFlatScratch && hasFlatScratchInsts());
}

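// Prior to GFX10, a VALU instruction may read at most one SGPR or literal
// (constant bus) operand. GFX10 raises the limit to two, except for the
// 64-bit shift instructions listed below, which are still limited to one.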
unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64_e64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64_e64:
  case AMDGPU::V_LSHRREV_B64_e64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64_e64:
  case AMDGPU::V_ASHRREV_I64_e64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64_e64:
    return 1;
  }

  return 2;
}

/// This list was mostly derived from experimentation.
bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const {
  switch (Opcode) {
  case AMDGPU::V_CVT_F16_F32_e32:
  case AMDGPU::V_CVT_F16_F32_e64:
  case AMDGPU::V_CVT_F16_U16_e32:
  case AMDGPU::V_CVT_F16_U16_e64:
  case AMDGPU::V_CVT_F16_I16_e32:
  case AMDGPU::V_CVT_F16_I16_e64:
  case AMDGPU::V_RCP_F16_e64:
  case AMDGPU::V_RCP_F16_e32:
  case AMDGPU::V_RSQ_F16_e64:
  case AMDGPU::V_RSQ_F16_e32:
  case AMDGPU::V_SQRT_F16_e64:
  case AMDGPU::V_SQRT_F16_e32:
  case AMDGPU::V_LOG_F16_e64:
  case AMDGPU::V_LOG_F16_e32:
  case AMDGPU::V_EXP_F16_e64:
  case AMDGPU::V_EXP_F16_e32:
  case AMDGPU::V_SIN_F16_e64:
  case AMDGPU::V_SIN_F16_e32:
  case AMDGPU::V_COS_F16_e64:
  case AMDGPU::V_COS_F16_e32:
  case AMDGPU::V_FLOOR_F16_e64:
  case AMDGPU::V_FLOOR_F16_e32:
  case AMDGPU::V_CEIL_F16_e64:
  case AMDGPU::V_CEIL_F16_e32:
  case AMDGPU::V_TRUNC_F16_e64:
  case AMDGPU::V_TRUNC_F16_e32:
  case AMDGPU::V_RNDNE_F16_e64:
  case AMDGPU::V_RNDNE_F16_e32:
  case AMDGPU::V_FRACT_F16_e64:
  case AMDGPU::V_FRACT_F16_e32:
  case AMDGPU::V_FREXP_MANT_F16_e64:
  case AMDGPU::V_FREXP_MANT_F16_e32:
  case AMDGPU::V_FREXP_EXP_I16_F16_e64:
  case AMDGPU::V_FREXP_EXP_I16_F16_e32:
  case AMDGPU::V_LDEXP_F16_e64:
  case AMDGPU::V_LDEXP_F16_e32:
  case AMDGPU::V_LSHLREV_B16_e64:
  case AMDGPU::V_LSHLREV_B16_e32:
  case AMDGPU::V_LSHRREV_B16_e64:
  case AMDGPU::V_LSHRREV_B16_e32:
  case AMDGPU::V_ASHRREV_I16_e64:
  case AMDGPU::V_ASHRREV_I16_e32:
  case AMDGPU::V_ADD_U16_e64:
  case AMDGPU::V_ADD_U16_e32:
  case AMDGPU::V_SUB_U16_e64:
  case AMDGPU::V_SUB_U16_e32:
  case AMDGPU::V_SUBREV_U16_e64:
  case AMDGPU::V_SUBREV_U16_e32:
  case AMDGPU::V_MUL_LO_U16_e64:
  case AMDGPU::V_MUL_LO_U16_e32:
  case AMDGPU::V_ADD_F16_e64:
  case AMDGPU::V_ADD_F16_e32:
  case AMDGPU::V_SUB_F16_e64:
  case AMDGPU::V_SUB_F16_e32:
  case AMDGPU::V_SUBREV_F16_e64:
  case AMDGPU::V_SUBREV_F16_e32:
  case AMDGPU::V_MUL_F16_e64:
  case AMDGPU::V_MUL_F16_e32:
  case AMDGPU::V_MAX_F16_e64:
  case AMDGPU::V_MAX_F16_e32:
  case AMDGPU::V_MIN_F16_e64:
  case AMDGPU::V_MIN_F16_e32:
  case AMDGPU::V_MAX_U16_e64:
  case AMDGPU::V_MAX_U16_e32:
  case AMDGPU::V_MIN_U16_e64:
  case AMDGPU::V_MIN_U16_e32:
  case AMDGPU::V_MAX_I16_e64:
  case AMDGPU::V_MAX_I16_e32:
  case AMDGPU::V_MIN_I16_e64:
  case AMDGPU::V_MIN_I16_e32:
    // On gfx10, all 16-bit instructions preserve the high bits.
    return getGeneration() <= AMDGPUSubtarget::GFX9;
  case AMDGPU::V_MAD_F16_e64:
  case AMDGPU::V_MADAK_F16:
  case AMDGPU::V_MADMK_F16:
  case AMDGPU::V_MAC_F16_e64:
  case AMDGPU::V_MAC_F16_e32:
  case AMDGPU::V_FMAMK_F16:
  case AMDGPU::V_FMAAK_F16:
  case AMDGPU::V_MAD_U16_e64:
  case AMDGPU::V_MAD_I16_e64:
  case AMDGPU::V_FMA_F16_e64:
  case AMDGPU::V_FMAC_F16_e64:
  case AMDGPU::V_FMAC_F16_e32:
  case AMDGPU::V_DIV_FIXUP_F16_e64:
    // In gfx9, the preferred handling of the unused high 16 bits changed. Most
    // instructions maintain the legacy behavior of zeroing. Some instructions
    // changed to preserving the high bits.
    return getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS;
  case AMDGPU::V_MAD_MIXLO_F16:
  case AMDGPU::V_MAD_MIXHI_F16:
  default:
    return false;
  }
}

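// Return the amount of LDS that can be used without restricting occupancy
// below NWaves waves per execution unit, given F's flat workgroup size.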
unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

// FIXME: Should return min,max range.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
  const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
  if (!MaxWorkGroupsPerCu)
    return 0;

  const unsigned WaveSize = getWavefrontSize();

  // FIXME: Do we need to account for the alignment requirement of LDS by
  // rounding the size up?
  // Compute the restriction based on LDS usage.
  unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);

  // This can be queried with more LDS than is possible, so just assume the
  // worst.
  if (NumGroups == 0)
    return 1;

  NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);

  // Round to the number of waves.
  const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
  unsigned MaxWaves = NumGroups * MaxGroupNumWaves;

  // Clamp to the maximum possible number of waves.
  MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());

  // FIXME: Needs to be a multiple of the group size?
  //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);

  assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
         "computed invalid occupancy");
  return MaxWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

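// Graphics shader calling conventions default to a single wave per workgroup;
// compute kernels may use the subtarget's full flat workgroup size range.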
std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1u, getMaxFlatWorkGroupSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure the requested minimum is not greater than the requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure the requested values do not violate the subtarget's
  // specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // the "amdgpu-flat-work-group-size" attribute, then set the default minimum/
  // maximum number of waves per execution unit to the values implied by the
  // requested minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
  Default.first = MinImpliedByFlatWorkGroupSize;
  bool RequestedFlatWorkGroupSize =
      F.hasFnAttribute("amdgpu-flat-work-group-size");

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure the requested minimum is not greater than the requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure the requested values do not violate the subtarget's
  // specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure the requested values are compatible with the values implied by
  // the requested minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

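// Return the size of dimension Dim from the kernel's reqd_work_group_size
// metadata, or UINT_MAX if the metadata is absent or malformed.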
static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
  auto Node = Kernel.getMetadata("reqd_work_group_size");
  if (Node && Node->getNumOperands() == 3)
    return mdconst::extract<ConstantInt>(Node->getOperand(Dim))->getZExtValue();
  return std::numeric_limits<unsigned>::max();
}

bool AMDGPUSubtarget::isMesaKernel(const Function &F) const {
  return isMesa3DOS() && !AMDGPU::isShader(F.getCallingConv());
}

unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
                                           unsigned Dimension) const {
  unsigned ReqdSize = getReqdWorkGroupSize(Kernel, Dimension);
  if (ReqdSize != std::numeric_limits<unsigned>::max())
    return ReqdSize - 1;
  return getFlatWorkGroupSizes(Kernel).second - 1;
}

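// Attach !range metadata to a workitem-id or local-size intrinsic call, based
// on the kernel's known workgroup size bounds. Returns true if metadata was
// added.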
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present, it narrows the value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }

      if (Dim <= 3) {
        unsigned ReqdSize = getReqdWorkGroupSize(*Kernel, Dim);
        if (ReqdSize != std::numeric_limits<unsigned>::max())
          MinSize = MaxSize = ReqdSize;
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For an ID query we need to pass the max size
  // as Hi. For a size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
  if (isMesaKernel(F))
    return 16;
  return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 0);
}

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align(1);

  for (const Argument &Arg : F.args()) {
    const bool IsByRef = Arg.hasByRefAttr();
    Type *ArgTy = IsByRef ? Arg.getParamByRefType() : Arg.getType();
    MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None;
    if (!Alignment)
      Alignment = DL.getABITypeAlign(ArgTy);

    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}

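// Compute the kernarg segment size: the explicit kernel arguments plus any
// implicit argument area, rounded up to a 4-byte multiple.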
unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const {
  return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32
                                  : AMDGPUDwarfFlavour::Wave64;
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, /*TuneCPU*/GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top-down and bottom-up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
}

bool GCNSubtarget::useVGPRIndexMode() const {
  return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
}

bool GCNSubtarget::useAA() const { return UseAA; }

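// Map an SGPR count to the number of waves per EU it permits. On GFX10+,
// occupancy is never limited by SGPR usage; the thresholds below reflect the
// per-generation SGPR budgets of earlier targets.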
unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return getMaxWavesPerEU();

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Granule = getVGPRAllocGranule();
  if (VGPRs < Granule)
    return MaxWaves;
  unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
  return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
}

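// Number of SGPRs that must be reserved for VCC and, when present,
// FLAT_SCRATCH and XNACK.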
unsigned
GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratchInit) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (HasFlatScratchInit) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseReservedNumSGPRs(MFI.hasFlatScratchInit());
}

unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
  // The logic for detecting whether the function has flat scratch init is
  // slightly different from how the SIMachineFunctionInfo constructor derives
  // it. We do not use the amdgpu-calls and amdgpu-stack-objects attributes or
  // isAmdHsaOrMesa here, as it does not really matter.
  // TODO: Outline this derivation logic into one common function in the
  // backend to avoid duplication.
  bool isEntry = AMDGPU::isEntryFunctionCC(F.getCallingConv());
  bool FunctionHasFlatScratchInit = false;
  if (hasFlatAddressSpace() && isEntry && !flatScratchIsArchitected() &&
      enableFlatScratch()) {
    FunctionHasFlatScratchInit = true;
  }
  return getBaseReservedNumSGPRs(FunctionHasFlatScratchInit);
}

unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
    std::min(getMaxWavesPerEU(),
             getOccupancyWithLocalMemSize(LDSSize, F));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}

unsigned GCNSubtarget::getBaseMaxNumSGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU,
    unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const {
  // Compute the maximum number of SGPRs the function can use, using the
  // default/requested minimum number of waves per execution unit.
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if the maximum number of SGPRs was explicitly requested using the
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure the requested value does not violate the subtarget's
    // specifications.
    if (Requested && (Requested <= ReservedNumSGPRs))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = PreloadedSGPRs;
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure the requested value is compatible with the values implied by
    // the default/requested minimum/maximum number of waves per execution
    // unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - ReservedNumSGPRs, MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseMaxNumSGPRs(F, MFI.getWavesPerEU(), MFI.getNumPreloadedSGPRs(),
                            getReservedNumSGPRs(MF));
}

static unsigned getMaxNumPreloadedSGPRs() {
  // Max number of user SGPRs.
  unsigned MaxUserSGPRs = 4 + // private segment buffer
                          2 + // dispatch ptr
                          2 + // queue ptr
                          2 + // kernarg segment ptr
                          2 + // dispatch ID
                          2 + // flat scratch init
                          2;  // implicit buffer ptr
  // Max number of system SGPRs.
  unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX
                            1 + // WorkGroupIDY
                            1 + // WorkGroupIDZ
                            1 + // WorkGroupInfo
                            1;  // private segment wave byte offset
  return MaxUserSGPRs + MaxSystemSGPRs;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const {
  return getBaseMaxNumSGPRs(F, getWavesPerEU(F), getMaxNumPreloadedSGPRs(),
                            getReservedNumSGPRs(F));
}

unsigned GCNSubtarget::getBaseMaxNumVGPRs(
    const Function &F, std::pair<unsigned, unsigned> WavesPerEU) const {
  // Compute the maximum number of VGPRs the function can use, using the
  // default/requested minimum number of waves per execution unit.
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if the maximum number of VGPRs was explicitly requested using the
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    if (hasGFX90AInsts())
      Requested *= 2;

    // Make sure the requested value is compatible with the values implied by
    // the default/requested minimum/maximum number of waves per execution
    // unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const {
  return getBaseMaxNumVGPRs(F, getWavesPerEU(F));
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  return getBaseMaxNumVGPRs(F, MFI.getWavesPerEU());
}

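// Fix up data-dependence latencies that involve instruction bundles: the
// scheduler treats a bundle as a single node, so walk the bundled instructions
// to find where the register is actually defined or first read and adjust the
// latency accordingly.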
void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
                                         int UseOpIdx, SDep &Dep) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
      !Def->isInstr() || !Use->isInstr())
    return;

  MachineInstr *DefI = Def->getInstr();
  MachineInstr *UseI = Use->getInstr();

  if (DefI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (UseI->isBundle()) {
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
    MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  }
}

namespace {
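// DAG mutation that links independent SALU instructions to long-latency MFMA
// instructions so they can be scheduled into the MFMA's shadow; see apply()
// below.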
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }

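  // Return true if Pred can be made a predecessor of Succ without creating a
  // cycle, i.e. there is no existing path from Succ back to Pred.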
  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
    if (Pred->NodeNum < Succ->NodeNum)
      return true;

    SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});

    for (unsigned I = 0; I < Succs.size(); ++I) {
      for (const SDep &SI : Succs[I]->Succs) {
        const SUnit *SU = SI.getSUnit();
        if (SU != Succs[I] && !llvm::is_contained(Succs, SU))
          Succs.push_back(SU);
      }
    }

    SmallPtrSet<const SUnit*, 32> Visited;
    while (!Preds.empty()) {
      const SUnit *SU = Preds.pop_back_val();
      if (llvm::is_contained(Succs, SU))
        return false;
      Visited.insert(SU);
      for (const SDep &SI : SU->Preds)
        if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
          Preds.push_back(SI.getSUnit());
    }

    return true;
  }

  // Link as many SALU instructions in a chain as possible. Return the size
  // of the chain. Links up to MaxChain instructions.
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n" ; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU->addPred(SDep(From, SDep::Artificial), false))
        ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
          SUv->addPred(SDep(SU, SDep::Artificial), false);
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts() || DisablePowerSched)
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for long-latency MFMA instructions and try to add a dependency
    // on available SALU instructions to give them a chance to fill the MFMA
    // shadow. Filling the MFMA shadow with SALU rather than VALU instructions
    // is desirable to prevent power consumption bursts and throttling.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
           MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
           MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}