//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Implements the AMDGPU-specific subclass of TargetSubtargetInfo.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUSubtarget.h"
#include "AMDGPU.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/Support/CommandLine.h"
#include <algorithm>

using namespace llvm;

#define DEBUG_TYPE "amdgpu-subtarget"

#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenSubtargetInfo.inc"
#define GET_SUBTARGETINFO_TARGET_DESC
#define GET_SUBTARGETINFO_CTOR
#undef AMDGPUSubtarget
#include "R600GenSubtargetInfo.inc"

static cl::opt<bool> DisablePowerSched(
  "amdgpu-disable-power-sched",
  cl::desc("Disable scheduling to minimize mAI power bursts"),
  cl::init(false));

static cl::opt<bool> EnableVGPRIndexMode(
  "amdgpu-vgpr-index-mode",
  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
  cl::init(false));

GCNSubtarget::~GCNSubtarget() = default;

R600Subtarget &
R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
                                               StringRef GPU, StringRef FS) {
  SmallString<256> FullFS("+promote-alloca,");
  FullFS += FS;
  ParseSubtargetFeatures(GPU, FullFS);

  // FIXME: I don't think Evergreen has any useful support for denormals, but
  // this should be checked. Should we issue a warning somewhere if someone
  // tries to enable these?
  if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
    FP32Denormals = false;
  }

  HasMulU24 = getGeneration() >= EVERGREEN;
  HasMulI24 = hasCaymanISA();

  return *this;
}

GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
                                              StringRef GPU, StringRef FS) {
  // Determine default and user-specified characteristics.
  //
  // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
  // enabled, but some instructions do not respect them and they run at the
  // double precision rate, so don't enable them by default.
  //
  // We want to be able to turn these off, but making this a subtarget feature
  // for SI has the unhelpful behavior that it unsets everything else if you
  // disable it.
  //
  // Similarly we want enable-prt-strict-null to be on by default and not to
  // unset everything else if it is disabled.

  // Assume ECC is enabled; that is the conservative default.
  SmallString<256> FullFS("+promote-alloca,+load-store-opt,+sram-ecc,+xnack,");

  if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
    FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";

  // FIXME: I don't think Evergreen has any useful support for denormals, but
  // this should be checked. Should we issue a warning somewhere if someone
  // tries to enable these?
  if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
    FullFS += "+fp64-fp16-denormals,";
  } else {
    FullFS += "-fp32-denormals,";
  }

  FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS

  // Disable mutually exclusive bits.
  if (FS.find_lower("+wavefrontsize") != StringRef::npos) {
    if (FS.find_lower("wavefrontsize16") == StringRef::npos)
      FullFS += "-wavefrontsize16,";
    if (FS.find_lower("wavefrontsize32") == StringRef::npos)
      FullFS += "-wavefrontsize32,";
    if (FS.find_lower("wavefrontsize64") == StringRef::npos)
      FullFS += "-wavefrontsize64,";
  }

  FullFS += FS;

  ParseSubtargetFeatures(GPU, FullFS);

  // We don't support FP64 for EG/NI at the moment.
  assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS));

  // Unless +flat-for-global or -flat-for-global is explicitly given, turn on
  // FlatForGlobal for all OSes on VI and newer hardware to avoid assertion
  // failures due to missing ADDR64 variants of MUBUF instructions.
  if (!hasAddr64() && !FS.contains("flat-for-global")) {
    FlatForGlobal = true;
  }

  // Set defaults if needed.
  if (MaxPrivateElementSize == 0)
    MaxPrivateElementSize = 4;

  if (LDSBankCount == 0)
    LDSBankCount = 32;

  if (TT.getArch() == Triple::amdgcn) {
    if (LocalMemorySize == 0)
      LocalMemorySize = 32768;

    // Do something sensible for unspecified targets.
    if (!HasMovrel && !HasVGPRIndexMode)
      HasMovrel = true;
  }

  // Don't crash on invalid devices.
  if (WavefrontSize == 0)
    WavefrontSize = 64;

  HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;

  if (DoesNotSupportXNACK && EnableXNACK) {
    ToggleFeature(AMDGPU::FeatureXNACK);
    EnableXNACK = false;
  }

  // ECC is on by default, but turn it off if the hardware doesn't support it
  // anyway. This matters for the gfx9 targets that have d16 loads but don't
  // support ECC.
  if (DoesNotSupportSRAMECC && EnableSRAMECC) {
    ToggleFeature(AMDGPU::FeatureSRAMECC);
    EnableSRAMECC = false;
  }

  return *this;
}
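
// Illustrative sketch of the assembled feature string (exact contents depend
// on the target and on the user-provided FS): for an amdhsa target with no
// extra features the prefix built above is roughly
//   "+promote-alloca,+load-store-opt,+sram-ecc,+xnack,+flat-for-global,
//    +unaligned-buffer-access,+trap-handler,+fp64-fp16-denormals,
//    +enable-prt-strict-null,"
// followed by the user FS, which can override any of these defaults.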

AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
  TargetTriple(TT),
  Has16BitInsts(false),
  HasMadMixInsts(false),
  FP32Denormals(false),
  FPExceptions(false),
  HasSDWA(false),
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  HasTrigReducedRange(false),
  MaxWavesPerEU(10),
  LocalMemorySize(0),
  WavefrontSize(0)
  { }

GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
                           const GCNTargetMachine &TM) :
    AMDGPUGenSubtargetInfo(TT, GPU, FS),
    AMDGPUSubtarget(TT),
    TargetTriple(TT),
    Gen(TT.getOS() == Triple::AMDHSA ? SEA_ISLANDS : SOUTHERN_ISLANDS),
    InstrItins(getInstrItineraryForCPU(GPU)),
    LDSBankCount(0),
    MaxPrivateElementSize(0),

    FastFMAF32(false),
    HalfRate64Ops(false),

    FP64FP16Denormals(false),
    FlatForGlobal(false),
    AutoWaitcntBeforeBarrier(false),
    CodeObjectV3(false),
    UnalignedScratchAccess(false),
    UnalignedBufferAccess(false),

    HasApertureRegs(false),
    EnableXNACK(false),
    DoesNotSupportXNACK(false),
    EnableCuMode(false),
    TrapHandler(false),

    EnableLoadStoreOpt(false),
    EnableUnsafeDSOffsetFolding(false),
    EnableSIScheduler(false),
    EnableDS128(false),
    EnablePRTStrictNull(false),
    DumpCode(false),

    FP64(false),
    GCN3Encoding(false),
    CIInsts(false),
    GFX8Insts(false),
    GFX9Insts(false),
    GFX10Insts(false),
    GFX7GFX8GFX9Insts(false),
    SGPRInitBug(false),
    HasSMemRealTime(false),
    HasIntClamp(false),
    HasFmaMixInsts(false),
    HasMovrel(false),
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
    HasSDWAMac(false),
    HasSDWAOutModsVOPC(false),
    HasDPP(false),
    HasDPP8(false),
    HasR128A16(false),
    HasNSAEncoding(false),
    HasDLInsts(false),
    HasDot1Insts(false),
    HasDot2Insts(false),
    HasDot3Insts(false),
    HasDot4Insts(false),
    HasDot5Insts(false),
    HasDot6Insts(false),
    HasMAIInsts(false),
    HasPkFmacF16Inst(false),
    HasAtomicFaddInsts(false),
    EnableSRAMECC(false),
    DoesNotSupportSRAMECC(false),
    HasNoSdstCMPX(false),
    HasVscnt(false),
    HasRegisterBanking(false),
    HasVOP3Literal(false),
    HasNoDataDepHazard(false),
    FlatAddressSpace(false),
    FlatInstOffsets(false),
    FlatGlobalInsts(false),
    FlatScratchInsts(false),
    ScalarFlatScratchInsts(false),
    AddNoCarryInsts(false),
    HasUnpackedD16VMem(false),
    LDSMisalignedBug(false),
    HasMFMAInlineLiteralBug(false),

    ScalarizeGlobal(false),

    HasVcmpxPermlaneHazard(false),
    HasVMEMtoScalarWriteHazard(false),
    HasSMEMtoVectorWriteHazard(false),
    HasInstFwdPrefetchBug(false),
    HasVcmpxExecWARHazard(false),
    HasLdsBranchVmemWARHazard(false),
    HasNSAtoVMEMBug(false),
    HasOffset3fBug(false),
    HasFlatSegmentOffsetBug(false),

    FeatureDisable(false),
    InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
    TLInfo(TM, *this),
    FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
  MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
  CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
  Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
  RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
  InstSelector.reset(new AMDGPUInstructionSelector(
      *this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}

unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
  if (getGeneration() < GFX10)
    return 1;

  // 64-bit shift instructions can use only one scalar value input.
  switch (Opcode) {
  case AMDGPU::V_LSHLREV_B64:
  case AMDGPU::V_LSHLREV_B64_gfx10:
  case AMDGPU::V_LSHL_B64:
  case AMDGPU::V_LSHRREV_B64:
  case AMDGPU::V_LSHRREV_B64_gfx10:
  case AMDGPU::V_LSHR_B64:
  case AMDGPU::V_ASHRREV_I64:
  case AMDGPU::V_ASHRREV_I64_gfx10:
  case AMDGPU::V_ASHR_I64:
    return 1;
  }

  return 2;
}
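
// Illustrative example (assembly operands are for exposition only): on GFX10
// "v_add_f32_e64 v0, s0, s1" may read two scalar operands through the
// constant bus, while the 64-bit shifts listed above remain limited to a
// single constant bus use, matching the pre-GFX10 rule.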

unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
  const Function &F) const {
  if (NWaves == 1)
    return getLocalMemorySize();
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}

unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
  const Function &F) const {
  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
  if (!WorkGroupsPerCu)
    return 0;
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
  NumWaves = std::min(NumWaves, MaxWaves);
  NumWaves = std::max(NumWaves, 1u);
  return NumWaves;
}
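
// Worked example (illustrative; the real numbers come from the subtarget):
// with 65536 bytes of LDS, 10 waves per EU and one work group per CU, a
// kernel using 16384 bytes of LDS gets NumWaves = (65536 * 10 / 1) / 16384
// = 40, which is then clamped to the 10-wave maximum.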

unsigned
AMDGPUSubtarget::getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
  const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return getOccupancyWithLocalMemSize(MFI->getLDSSize(), MF.getFunction());
}

std::pair<unsigned, unsigned>
AMDGPUSubtarget::getDefaultFlatWorkGroupSize(CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::AMDGPU_VS:
  case CallingConv::AMDGPU_LS:
  case CallingConv::AMDGPU_HS:
  case CallingConv::AMDGPU_ES:
  case CallingConv::AMDGPU_GS:
  case CallingConv::AMDGPU_PS:
    return std::make_pair(1u, getWavefrontSize());
  default:
    return std::make_pair(1u, getMaxFlatWorkGroupSize());
  }
}

std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
  const Function &F) const {
  // Default minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Default =
    getDefaultFlatWorkGroupSize(F.getCallingConv());

  // Requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-flat-work-group-size", Default);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.first > Requested.second)
    return Default;

  // Make sure the requested values do not violate the subtarget's
  // specifications.
  if (Requested.first < getMinFlatWorkGroupSize())
    return Default;
  if (Requested.second > getMaxFlatWorkGroupSize())
    return Default;

  return Requested;
}
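
// The attribute consumed here is typically produced by the frontend; e.g.
// (illustrative) a kernel declared with
//   __attribute__((amdgpu_flat_work_group_size(64, 256)))
// carries "amdgpu-flat-work-group-size"="64,256" in the IR, and the pair
// (64, 256) is what getIntegerPairAttribute() returns above.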

std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
  const Function &F) const {
  // Default minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());

  // Default/requested minimum/maximum flat work group sizes.
  std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);

  // If minimum/maximum flat work group sizes were explicitly requested using
  // the "amdgpu-flat-work-group-size" attribute, then set the default minimum
  // number of waves per execution unit to the value implied by the requested
  // maximum flat work group size.
  unsigned MinImpliedByFlatWorkGroupSize =
    getMaxWavesPerEU(FlatWorkGroupSizes.second);
  bool RequestedFlatWorkGroupSize = false;

  if (F.hasFnAttribute("amdgpu-flat-work-group-size")) {
    Default.first = MinImpliedByFlatWorkGroupSize;
    RequestedFlatWorkGroupSize = true;
  }

  // Requested minimum/maximum number of waves per execution unit.
  std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
    F, "amdgpu-waves-per-eu", Default, true);

  // Make sure the requested minimum does not exceed the requested maximum.
  if (Requested.second && Requested.first > Requested.second)
    return Default;

  // Make sure the requested values do not violate the subtarget's
  // specifications.
  if (Requested.first < getMinWavesPerEU() ||
      Requested.first > getMaxWavesPerEU())
    return Default;
  if (Requested.second > getMaxWavesPerEU())
    return Default;

  // Make sure the requested values are compatible with the values implied by
  // the requested minimum/maximum flat work group sizes.
  if (RequestedFlatWorkGroupSize &&
      Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;

  return Requested;
}
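
// Illustrative IR-level usage (the values are examples only):
//   define amdgpu_kernel void @k() #0 { ... }
//   attributes #0 = { "amdgpu-waves-per-eu"="2,4"
//                     "amdgpu-flat-work-group-size"="64,256" }
// The requested (2, 4) is honored only if it lies within the subtarget's
// limits and is compatible with the flat work group size, as checked above.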

bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows the value range down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    const Function *F = CI->getCalledFunction();
    if (F) {
      unsigned Dim = UINT_MAX;
      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        LLVM_FALLTHROUGH;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }
      if (Dim <= 3) {
        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
          if (Node->getNumOperands() == 3)
            MinSize = MaxSize = mdconst::extract<ConstantInt>(
                                  Node->getOperand(Dim))->getZExtValue();
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). For an ID query we need to pass the max size
  // as Hi. For a size query we need to pass Hi + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}
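
// Resulting IR, sketched for a kernel with reqd_work_group_size = {256, 1, 1}
// (the metadata numbering is illustrative):
//   %id = call i32 @llvm.amdgcn.workitem.id.x(), !range !0
//   !0 = !{i32 0, i32 256}        ; ID query: [0, MaxSize)
// whereas a local-size query would get !{i32 256, i32 257}, i.e. [Min, Max+1).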

uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
                                                 Align &MaxAlign) const {
  assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
         F.getCallingConv() == CallingConv::SPIR_KERNEL);

  const DataLayout &DL = F.getParent()->getDataLayout();
  uint64_t ExplicitArgBytes = 0;
  MaxAlign = Align::None();

  for (const Argument &Arg : F.args()) {
    Type *ArgTy = Arg.getType();

    const Align Alignment(DL.getABITypeAlignment(ArgTy));
    uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
    ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
    MaxAlign = std::max(MaxAlign, Alignment);
  }

  return ExplicitArgBytes;
}

unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
                                                Align &MaxAlign) const {
  uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);

  unsigned ExplicitOffset = getExplicitKernelArgOffset(F);

  uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
  unsigned ImplicitBytes = getImplicitArgNumBytes(F);
  if (ImplicitBytes != 0) {
    const Align Alignment = getAlignmentForImplicitArgPtr();
    TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
  }

  // Being able to dereference past the end is useful for emitting scalar
  // loads.
  return alignTo(TotalSize, 4);
}
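
// Worked example (a sketch; the base offset and implicit bytes are
// subtarget-dependent): ignoring ExplicitOffset, kernel arguments
// (i32, <4 x float>) are laid out as
//   i32   at offset 0,  size 4
//   v4f32 at offset 16, size 16   (realigned from 4 up to 16)
// so ExplicitArgBytes = 32 and MaxAlign = 16; implicit bytes, if any, are
// appended after realignment, and the total is rounded up to a multiple of 4.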

R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                             const TargetMachine &TM) :
  R600GenSubtargetInfo(TT, GPU, FS),
  AMDGPUSubtarget(TT),
  InstrInfo(*this),
  FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0),
  FMA(false),
  CaymanISA(false),
  CFALUBug(false),
  HasVertexCache(false),
  R600ALUInst(false),
  FP64(false),
  TexVTXClauseSize(0),
  Gen(R600),
  TLInfo(TM, initializeSubtargetDependencies(TT, GPU, FS)),
  InstrItins(getInstrItineraryForCPU(GPU)) { }

void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
                                       unsigned NumRegionInstrs) const {
  // Track register pressure so the scheduler can try to decrease
  // pressure once register usage is above the threshold defined by
  // SIRegisterInfo::getRegPressureSetLimit().
  Policy.ShouldTrackPressure = true;

  // Enabling both top-down and bottom-up scheduling seems to give us fewer
  // register spills than just using one of these approaches on its own.
  Policy.OnlyTopDown = false;
  Policy.OnlyBottomUp = false;

  // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
  if (!enableSIScheduler())
    Policy.ShouldTrackLaneMasks = true;
}

bool GCNSubtarget::hasMadF16() const {
  return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16) != -1;
}

bool GCNSubtarget::useVGPRIndexMode() const {
  return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
}

unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return getMaxWavesPerEU();

  if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
    if (SGPRs <= 80)
      return 10;
    if (SGPRs <= 88)
      return 9;
    if (SGPRs <= 100)
      return 8;
    return 7;
  }
  if (SGPRs <= 48)
    return 10;
  if (SGPRs <= 56)
    return 9;
  if (SGPRs <= 64)
    return 8;
  if (SGPRs <= 72)
    return 7;
  if (SGPRs <= 80)
    return 6;
  return 5;
}

unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
  unsigned MaxWaves = getMaxWavesPerEU();
  unsigned Granule = getVGPRAllocGranule();
  if (VGPRs < Granule)
    return MaxWaves;
  unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
  return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
}
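
// Worked example (granule and register counts are subtarget-dependent, so
// treat this as a sketch): with a 4-VGPR allocation granule and 256 total
// VGPRs, a function using 100 VGPRs rounds up to 100 (already a multiple of
// 4), giving std::min(std::max(256 / 100, 1u), MaxWaves) = 2 waves per EU.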

unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
  if (getGeneration() >= AMDGPUSubtarget::GFX10)
    return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.

  if (MFI.hasFlatScratchInit()) {
    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
      return 4; // FLAT_SCRATCH, VCC (in that order).
  }

  if (isXNACKEnabled())
    return 4; // XNACK, VCC (in that order).
  return 2; // VCC.
}

unsigned GCNSubtarget::computeOccupancy(const MachineFunction &MF,
                                        unsigned LDSSize,
                                        unsigned NumSGPRs,
                                        unsigned NumVGPRs) const {
  unsigned Occupancy =
    std::min(getMaxWavesPerEU(),
             getOccupancyWithLocalMemSize(LDSSize, MF.getFunction()));
  if (NumSGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
  if (NumVGPRs)
    Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
  return Occupancy;
}

unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of SGPRs the function can use, using the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);

  // Check if the maximum number of SGPRs was explicitly requested using the
  // "amdgpu-num-sgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-sgpr", MaxNumSGPRs);

    // Make sure the requested value does not violate the subtarget's
    // specifications.
    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
      Requested = 0;

    // If more SGPRs are required to support the input user/system SGPRs,
    // increase to accommodate them.
    //
    // FIXME: This really ends up using the requested number of SGPRs + number
    // of reserved special registers in total. Theoretically you could re-use
    // the last input registers for these special registers, but this would
    // require a lot of complexity to deal with the weird aliasing.
    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
    if (Requested && Requested < InputNumSGPRs)
      Requested = InputNumSGPRs;

    // Make sure the requested value is compatible with the values implied by
    // the default/requested minimum/maximum number of waves per execution
    // unit.
    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumSGPRs = Requested;
  }

  if (hasSGPRInitBug())
    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;

  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
                  MaxAddressableNumSGPRs);
}
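
// Illustrative attribute usage (the value is an example, not a
// recommendation):
//   attributes #0 = { "amdgpu-num-sgpr"="48" }
// Such a request is dropped if it does not exceed the reserved SGPR count or
// does not fit the waves-per-EU constraints checked above, and it is raised
// to cover the preloaded input SGPRs if necessary.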

unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
  const Function &F = MF.getFunction();
  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();

  // Compute the maximum number of VGPRs the function can use, using the
  // default/requested minimum number of waves per execution unit.
  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);

  // Check if the maximum number of VGPRs was explicitly requested using the
  // "amdgpu-num-vgpr" attribute.
  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
    unsigned Requested = AMDGPU::getIntegerAttribute(
      F, "amdgpu-num-vgpr", MaxNumVGPRs);

    // Make sure the requested value is compatible with the values implied by
    // the default/requested minimum/maximum number of waves per execution
    // unit.
    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
      Requested = 0;
    if (WavesPerEU.second &&
        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
      Requested = 0;

    if (Requested)
      MaxNumVGPRs = Requested;
  }

  return MaxNumVGPRs;
}

void GCNSubtarget::adjustSchedDependency(SUnit *Src, SUnit *Dst,
                                         SDep &Dep) const {
  if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
      !Src->isInstr() || !Dst->isInstr())
    return;

  MachineInstr *SrcI = Src->getInstr();
  MachineInstr *DstI = Dst->getInstr();

  if (SrcI->isBundle()) {
    // Src is a bundle: take the latency of the instruction inside the bundle
    // that defines the register and reduce it by the number of bundled
    // instructions issued after that def.
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(SrcI->getIterator());
    MachineBasicBlock::const_instr_iterator E(SrcI->getParent()->instr_end());
    unsigned Lat = 0;
    for (++I; I != E && I->isBundledWithPred(); ++I) {
      if (I->modifiesRegister(Reg, TRI))
        Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
      else if (Lat)
        --Lat;
    }
    Dep.setLatency(Lat);
  } else if (DstI->isBundle()) {
    // Dst is a bundle: reduce the source latency by the number of bundled
    // instructions issued before the first use of the register.
    const SIRegisterInfo *TRI = getRegisterInfo();
    auto Reg = Dep.getReg();
    MachineBasicBlock::const_instr_iterator I(DstI->getIterator());
    MachineBasicBlock::const_instr_iterator E(DstI->getParent()->instr_end());
    unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *SrcI);
    for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
      if (I->readsRegister(Reg, TRI))
        break;
      --Lat;
    }
    Dep.setLatency(Lat);
  }
}

namespace {
struct MemOpClusterMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}

  void apply(ScheduleDAGInstrs *DAG) override {
    SUnit *SUa = nullptr;
    // Search for two consecutive memory operations and link them
    // to prevent the scheduler from moving them apart.
    // In DAG pre-processing the SUnits are in the original order of
    // the instructions before scheduling.
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MI2 = *SU.getInstr();
      if (!MI2.mayLoad() && !MI2.mayStore()) {
        SUa = nullptr;
        continue;
      }
      if (!SUa) {
        SUa = &SU;
        continue;
      }

      MachineInstr &MI1 = *SUa->getInstr();
      if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
          (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
          (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
          (TII->isDS(MI1)   && TII->isDS(MI2))) {
        SU.addPredBarrier(SUa);

        for (const SDep &SI : SU.Preds) {
          if (SI.getSUnit() != SUa)
            SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
        }

        if (&SU != &DAG->ExitSU) {
          for (const SDep &SI : SUa->Succs) {
            if (SI.getSUnit() != &SU)
              SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
          }
        }
      }

      SUa = &SU;
    }
  }
};

struct FillMFMAShadowMutation : ScheduleDAGMutation {
  const SIInstrInfo *TII;

  ScheduleDAGMI *DAG;

  FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {}

  bool isSALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isSALU(*MI) && !MI->isTerminator();
  }

  bool isVALU(const SUnit *SU) const {
    const MachineInstr *MI = SU->getInstr();
    return MI && TII->isVALU(*MI);
  }

  // Return true if an artificial edge from Pred to Succ is safe to add, i.e.
  // there is no existing path from Succ back to Pred that would form a cycle.
  bool canAddEdge(const SUnit *Succ, const SUnit *Pred) const {
    if (Pred->NodeNum < Succ->NodeNum)
      return true;

    SmallVector<const SUnit*, 64> Succs({Succ}), Preds({Pred});

    for (unsigned I = 0; I < Succs.size(); ++I) {
      for (const SDep &SI : Succs[I]->Succs) {
        const SUnit *SU = SI.getSUnit();
        if (SU != Succs[I] && llvm::find(Succs, SU) == Succs.end())
          Succs.push_back(SU);
      }
    }

    SmallPtrSet<const SUnit*, 32> Visited;
    while (!Preds.empty()) {
      const SUnit *SU = Preds.pop_back_val();
      if (llvm::find(Succs, SU) != Succs.end())
        return false;
      Visited.insert(SU);
      for (const SDep &SI : SU->Preds)
        if (SI.getSUnit() != SU && !Visited.count(SI.getSUnit()))
          Preds.push_back(SI.getSUnit());
    }

    return true;
  }

  // Link as many SALU instructions in a chain as possible. Return the number
  // of linked instructions. Links up to MaxChain instructions.
  unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain,
                         SmallPtrSetImpl<SUnit *> &Visited) const {
    SmallVector<SUnit *, 8> Worklist({To});
    unsigned Linked = 0;

    while (!Worklist.empty() && MaxChain-- > 0) {
      SUnit *SU = Worklist.pop_back_val();
      if (!Visited.insert(SU).second)
        continue;

      LLVM_DEBUG(dbgs() << "Inserting edge from\n"; DAG->dumpNode(*From);
                 dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n');

      if (SU->addPred(SDep(From, SDep::Artificial), false))
        ++Linked;

      for (SDep &SI : From->Succs) {
        SUnit *SUv = SI.getSUnit();
        if (SUv != From && isVALU(SUv) && canAddEdge(SUv, SU))
          SUv->addPred(SDep(SU, SDep::Artificial), false);
      }

      for (SDep &SI : SU->Succs) {
        SUnit *Succ = SI.getSUnit();
        if (Succ != SU && isSALU(Succ) && canAddEdge(From, Succ))
          Worklist.push_back(Succ);
      }
    }

    return Linked;
  }

  void apply(ScheduleDAGInstrs *DAGInstrs) override {
    const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>();
    if (!ST.hasMAIInsts() || DisablePowerSched)
      return;
    DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
    const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel();
    if (!TSchedModel || DAG->SUnits.empty())
      return;

    // Scan for long-latency MFMA instructions and try to add dependencies on
    // available SALU instructions to give them a chance to fill the MFMA
    // shadow. Filling the shadow with SALU rather than VALU instructions is
    // desirable to prevent power consumption bursts and throttling.
    auto LastSALU = DAG->SUnits.begin();
    auto E = DAG->SUnits.end();
    SmallPtrSet<SUnit*, 32> Visited;
    for (SUnit &SU : DAG->SUnits) {
      MachineInstr &MAI = *SU.getInstr();
      if (!TII->isMAI(MAI) ||
           MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32 ||
           MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32)
        continue;

      unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1;

      LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU);
                 dbgs() << "Need " << Lat
                        << " instructions to cover latency.\n");

      // Find up to Lat independent scalar instructions as early as
      // possible such that they can be scheduled after this MFMA.
      for ( ; Lat && LastSALU != E; ++LastSALU) {
        if (Visited.count(&*LastSALU))
          continue;

        if (!isSALU(&*LastSALU) || !canAddEdge(&*LastSALU, &SU))
          continue;

        Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited);
      }
    }
  }
};
} // namespace

void GCNSubtarget::getPostRAMutations(
    std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
  Mutations.push_back(std::make_unique<MemOpClusterMutation>(&InstrInfo));
  Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
  if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<GCNSubtarget>());
  else
    return static_cast<const AMDGPUSubtarget&>(MF.getSubtarget<R600Subtarget>());
}

const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM,
                                            const Function &F) {
  if (TM.getTargetTriple().getArch() == Triple::amdgcn)
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<GCNSubtarget>(F));
  else
    return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}