//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU specific subclass of TargetSubtarget.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

static cl::opt<bool> DisablePowerSched(
  "amdgpu-disable-power-sched",
  cl::desc("Disable scheduling to minimize mAI power bursts"),
  cl::init(false));

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, FullFS);

  // FIXME: I don't think Evergreen has any useful support for denormals, but
  // this should be checked. Should we issue a warning somewhere if someone
  // tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP32Denormals = false;
  }

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  // Assuming ECC is enabled is the conservative default.
  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+sram-ecc,+xnack,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  // FIXME: I don't think Evergreen has any useful support for denormals, but
  // this should be checked. Should we issue a warning somewhere if someone
  // tries to enable these?
  if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    FullFS += "+fp64-fp16-denormals,";
  } else {
    FullFS += "-fp32-denormals,";
  }

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
  if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
    if (FS.find_lower("wavefrontsize16") == StringRef::npos)
      FullFS += "-wavefrontsize16,";
    if (FS.find_lower("wavefrontsize32") == StringRef::npos)
      FullFS += "-wavefrontsize32,";
    if (FS.find_lower("wavefrontsize64") == StringRef::npos)
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI atm.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +/-flat-for-global is explicitly specified, turn on FlatForGlobal
  // for all OSes on VI and newer hardware to avoid assertion failures due to
  // missing ADDR64 variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified target.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSize == 0)
    WavefrontSize = 64;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  if (DoesNotSupportXNACK && EnableXNACK) {
    ToggleFeature(AMDGPU::FeatureXNACK);
    EnableXNACK = false;
  }

  // ECC is on by default, but turn it off if the hardware doesn't support it
  // anyway. This matters for gfx9 targets that have d16 loads but don't
  // support ECC.
  if (DoesNotSupportSRAMECC && EnableSRAMECC) {
    ToggleFeature(AMDGPU::FeatureSRAMECC);
    EnableSRAMECC = false;
  }

  return *this;
}

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  FP32Denormals(false),
  FPExceptions(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  LocalMemorySize(0),
  WavefrontSize(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    Gen(TT.getOS() == Triple::AMDHSA ? SEA_ISLANDS : SOUTHERN_ISLANDS),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP64FP16Denormals(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    DoesNotSupportXNACK(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX10Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    HasR128A16(false),
    HasNSAEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot3Insts(false),
    HasDot4Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    HasMAIInsts(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    EnableSRAMECC(false),
    DoesNotSupportSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

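// On GFX10, most VALU instructions may read two scalar (constant bus) sources;
// the 64-bit shift opcodes listed below are still limited to one, as are all
// instructions on earlier generations.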
unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64:
  case AMDGPU::V_LSHRREV_B64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64:
  case AMDGPU::V_ASHRREV_I64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64:
    return 1;
  }

  return 2;
}

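// Approximate upper bound on the LDS a workgroup may use if NWaves waves per
// EU are to remain achievable, derived from the LDS available on a CU and the
// number of workgroups that fit on it for this function's workgroup size.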
unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

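// Inverse of the above: given a workgroup's LDS usage in bytes, return how
// many waves per EU that usage still permits, clamped to
// [1, getMaxWavesPerEU()].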
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

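// Default flat workgroup size range by calling convention: compute kernels
// get a multiple of the wavefront size, graphics shaders a single wavefront,
// and everything else up to 16 wavefronts.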
std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_CS:
  case CallingConv::AMDGPU_KERNEL:
  case CallingConv::SPIR_KERNEL:
    return std::make_pair(getWavefrontSize() * 2,
                          std::max(getWavefrontSize() * 4, 256u));
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1, getWavefrontSize());
  default:
    return std::make_pair(1, 16 * getWavefrontSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // FIXME: 1024 if function.
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // "amdgpu-flat-work-group-size" attribute, then set default minimum/maximum
  // number of waves per execution unit to values implied by requested
  // minimum/maximum flat work group sizes.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  if (F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure requested minimum is less than requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure requested values do not violate subtarget's specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure requested values are compatible with values implied by requested
  // minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}

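// Attach !range metadata to a call to one of the workitem-id / local-size
// intrinsics, bounded by the kernel's flat workgroup size and narrowed by
// reqd_work_group_size metadata when present.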
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For ID query we need to pass max size
  // as Hi. For size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}

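// Sum the in-memory sizes of the kernel's explicit arguments, honoring each
// argument's ABI alignment, and report the largest alignment seen via MaxAlign.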
uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 unsigned &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = 1;

  for (const Argument &Arg : F.args()) {
    Type *ArgTy = Arg.getType();

    unsigned Align = DL.getABITypeAlignment(ArgTy);
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize;
    MaxAlign = std::max(MaxAlign, Align);
  }

  return ExplicitArgBytes;
}

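// Total kernarg segment size: the explicit arguments plus any target implicit
// arguments, rounded up to 4 bytes so scalar loads may safely dereference a
// little past the end.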
unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                unsigned &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    unsigned Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar loads.
  return alignTo(TotalSize, 4);
}

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top down and bottom up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16) != -1;
}

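// Waves per EU achievable for a given SGPR count. On GFX10 SGPR usage no
// longer limits occupancy, so the maximum of 10 is always returned.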
unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 10;

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

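// Waves per EU achievable for a given VGPR count; the thresholds correspond
// to dividing a 256-entry VGPR file among the waves at allocation granularity.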
unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  if (VGPRs <= 24)
    return 10;
  if (VGPRs <= 28)
    return 9;
  if (VGPRs <= 32)
    return 8;
  if (VGPRs <= 36)
    return 7;
  if (VGPRs <= 40)
    return 6;
  if (VGPRs <= 48)
    return 5;
  if (VGPRs <= 64)
    return 4;
  if (VGPRs <= 84)
    return 3;
  if (VGPRs <= 128)
    return 2;
  return 1;
}

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of SGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if maximum number of SGPRs was explicitly requested using
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure requested value does not violate subtarget's specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute maximum number of VGPRs function can use using default/requested
  // minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if maximum number of VGPRs was explicitly requested using
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure requested value is compatible with values implied by
    // default/requested minimum/maximum number of waves per execution unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

namespace {
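// DAG mutation that adds artificial edges between consecutive memory
// operations of the same kind (VMEM, FLAT, SMRD or DS) so the scheduler does
// not pull them apart.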
struct MemOpClusterMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

  void apply(ScheduleDAGInstrs *DAG) override {
    SUnit *SUa = nullptr;
    // Search for two consecutive memory operations and link them
    // to prevent the scheduler from moving them apart.
    // During DAG pre-processing, SUnits are still in the original order of
    // the instructions before scheduling.
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MI2 = *SU.getInstr();
      if (!MI2.mayLoad() && !MI2.mayStore()) {
        SUa = nullptr;
        continue;
      }
      if (!SUa) {
        SUa = &SU;
        continue;
      }

      MachineInstr &MI1 = *SUa->getInstr();
      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
          (TII->isDS(MI1)   && TII->isDS(MI2))) {
        SU.addPredBarrier(SUa);

        for (const SDep &SI : SU.Preds) {
          if (SI.getSUnit() != SUa)
            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
        }

        if (&SU != &DAG->ExitSU) {
          for (const SDep &SI : SUa->Succs) {
            if (SI.getSUnit() != &SU)
              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
          }
        }
      }

      SUa = &SU;
    }
  }
};

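// DAG mutation that links independent SALU instructions into the shadow of
// long-latency MFMA instructions, so their latency is covered without issuing
// more VALU work (see the power-scheduling rationale in apply() below).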
struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
    if (Pred->NodeNum < Succ->NodeNum)
      return true;

    SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});

    for (unsigned I = 0; I < Succs.size(); ++I) {
      for (const SDep &SI : Succs[I]->Succs) {
        const SUnit *SU = SI.getSUnit();
        if (SU != Succs[I] && llvm::find(Succs, SU) == Succs.end())
          Succs.push_back(SU);
      }
    }

    SmallPtrSet<const SUnit*, 32> Visited;
    while (!Preds.empty()) {
      const SUnit *SU = Preds.pop_back_val();
      if (llvm::find(Succs, SU) != Succs.end())
        return false;
      Visited.insert(SU);
      for (const SDep &SI : SU->Preds)
        if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
          Preds.push_back(SI.getSUnit());
    }

    return true;
  }

  // Link as many SALU instructions in a chain as possible. Returns the
  // number of instructions linked. Links up to MaxChain instructions.
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n"; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU->addPred(SDep(From, SDep::Artificial), false))
        ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && TII->isVALU(*SUv->getInstr()) && canAddEdge(SUv, SU))
          SUv->addPred(SDep(SU, SDep::Artificial), false);
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts() || DisablePowerSched)
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for long-latency MFMA instructions and try to add dependencies on
    // available SALU instructions to give them a chance to fill the MFMA
    // shadow. Filling the MFMA shadow with SALU instructions rather than VALU
    // is desirable to prevent power consumption bursts and throttling.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
           MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32 ||
           MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

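// Register the two post-RA DAG mutations defined above.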
void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
  Mutations.push_back(llvm::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}