//===- AMDGPUResourceUsageAnalysis.cpp ---- analysis of resources ---------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// \brief Analyzes how many registers and other resources are used by
11 /// functions.
12 ///
/// The results of this analysis are used to fill the register usage, flat
/// usage, etc. into the hardware resource registers.
15 ///
16 /// The analysis takes callees into account. E.g. if a function A that needs 10
17 /// VGPRs calls a function B that needs 20 VGPRs, querying the VGPR usage of A
18 /// will return 20.
19 /// It is assumed that an indirect call can go into any function except
20 /// hardware-entrypoints. Therefore the register usage of functions with
21 /// indirect calls is estimated as the maximum of all non-entrypoint functions
22 /// in the module.
23 ///
24 //===----------------------------------------------------------------------===//
25 
26 #include "AMDGPUResourceUsageAnalysis.h"
27 #include "AMDGPU.h"
28 #include "GCNSubtarget.h"
29 #include "SIMachineFunctionInfo.h"
30 #include "llvm/Analysis/CallGraph.h"
31 #include "llvm/CodeGen/TargetPassConfig.h"
32 #include "llvm/IR/GlobalAlias.h"
33 #include "llvm/IR/GlobalValue.h"
34 #include "llvm/Target/TargetMachine.h"
35 
36 using namespace llvm;
37 using namespace llvm::AMDGPU;
38 
39 #define DEBUG_TYPE "amdgpu-resource-usage"
40 
41 char llvm::AMDGPUResourceUsageAnalysis::ID = 0;
42 char &llvm::AMDGPUResourceUsageAnalysisID = AMDGPUResourceUsageAnalysis::ID;
43 
44 // We need to tell the runtime some amount ahead of time if we don't know the
45 // true stack size. Assume a smaller number if this is only due to dynamic /
46 // non-entry block allocas.
47 static cl::opt<uint32_t> AssumedStackSizeForExternalCall(
48     "amdgpu-assume-external-call-stack-size",
49     cl::desc("Assumed stack use of any external call (in bytes)"), cl::Hidden,
50     cl::init(16384));
51 
52 static cl::opt<uint32_t> AssumedStackSizeForDynamicSizeObjects(
53     "amdgpu-assume-dynamic-stack-object-size",
54     cl::desc("Assumed extra stack use if there are any "
55              "variable sized objects (in bytes)"),
56     cl::Hidden, cl::init(4096));
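
// Both options are hidden cl::opts but can still be overridden when invoking
// the backend directly, e.g. (hypothetical values and input file):
//   llc -mtriple=amdgcn-amd-amdhsa \
//       -amdgpu-assume-external-call-stack-size=8192 \
//       -amdgpu-assume-dynamic-stack-object-size=2048 input.ll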
57 
58 INITIALIZE_PASS(AMDGPUResourceUsageAnalysis, DEBUG_TYPE,
59                 "Function register usage analysis", true, true)
60 
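// Return the Function that a call instruction's callee operand refers to,
// looking through a GlobalAlias if necessary. An immediate callee operand
// (always 0) means the callee is not statically known (an indirect call), and
// nullptr is returned.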
61 static const Function *getCalleeFunction(const MachineOperand &Op) {
62   if (Op.isImm()) {
63     assert(Op.getImm() == 0);
64     return nullptr;
65   }
66   if (auto *GA = dyn_cast<GlobalAlias>(Op.getGlobal()))
67     return cast<Function>(GA->getOperand(0));
68   return cast<Function>(Op.getGlobal());
69 }
70 
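// Return true if \p Reg has any use that is not an implicit operand of a FLAT
// instruction. Used below to decide whether flat_scratch is really needed.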
71 static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,
72                                   const SIInstrInfo &TII, unsigned Reg) {
73   for (const MachineOperand &UseOp : MRI.reg_operands(Reg)) {
74     if (!UseOp.isImplicit() || !TII.isFLAT(*UseOp.getParent()))
75       return true;
76   }
77 
78   return false;
79 }
80 
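// Total SGPR count: the explicitly used SGPRs plus the extra SGPRs the
// hardware reserves for VCC, flat scratch and XNACK (subtarget dependent, see
// IsaInfo::getNumExtraSGPRs).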
81 int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumSGPRs(
82     const GCNSubtarget &ST) const {
83   return NumExplicitSGPR +
84          IsaInfo::getNumExtraSGPRs(&ST, UsesVCC, UsesFlatScratch,
85                                    ST.getTargetID().isXnackOnOrAny());
86 }
87 
88 int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
89     const GCNSubtarget &ST, int32_t ArgNumAGPR, int32_t ArgNumVGPR) const {
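  // On GFX90A the VGPRs and AGPRs share a combined register file: the AGPRs
  // are allocated after the VGPRs, which are rounded up to a granule of 4.
  // Otherwise the two files are separate and the larger count is the limit.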
90   if (ST.hasGFX90AInsts() && ArgNumAGPR)
91     return alignTo(ArgNumVGPR, 4) + ArgNumAGPR;
92   return std::max(ArgNumVGPR, ArgNumAGPR);
93 }
94 
95 int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
96     const GCNSubtarget &ST) const {
97   return getTotalNumVGPRs(ST, NumAGPR, NumVGPR);
98 }
99 
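// Analyze each function of the SCC. CallGraphSCCPass visits SCCs bottom-up, so
// by the time a caller is analyzed, its non-recursive callees already have an
// entry in CallGraphResourceInfo.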
100 bool AMDGPUResourceUsageAnalysis::runOnSCC(CallGraphSCC &SCC) {
101   auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
102   if (!TPC)
103     return false;
104 
105   const TargetMachine &TM = TPC->getTM<TargetMachine>();
106   bool HasIndirectCall = false;
107 
108   for (CallGraphNode *I : SCC) {
109     Function *F = I->getFunction();
110     if (!F || F->isDeclaration())
111       continue;
112 
113     MachineModuleInfo &MMI =
114         getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
115     MachineFunction &MF = MMI.getOrCreateMachineFunction(*F);
116 
117     auto CI = CallGraphResourceInfo.insert(
118         std::make_pair(&MF.getFunction(), SIFunctionResourceInfo()));
119     SIFunctionResourceInfo &Info = CI.first->second;
120     assert(CI.second && "should only be called once per function");
121     Info = analyzeResourceUsage(MF, TM);
122     HasIndirectCall |= Info.HasIndirectCall;
123   }
124 
125   if (HasIndirectCall)
126     propagateIndirectCallRegisterUsage();
127 
128   return false;
129 }
130 
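// Compute the resource usage of a single machine function: register counts,
// private segment (scratch) size and flags such as VCC or flat_scratch use.
// For functions containing calls this folds in the callees' usage recorded in
// CallGraphResourceInfo.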
131 AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo
132 AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
133     const MachineFunction &MF, const TargetMachine &TM) const {
134   SIFunctionResourceInfo Info;
135 
136   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
137   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
138   const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
139   const MachineRegisterInfo &MRI = MF.getRegInfo();
140   const SIInstrInfo *TII = ST.getInstrInfo();
141   const SIRegisterInfo &TRI = TII->getRegisterInfo();
142 
143   Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
144                          MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI) ||
145                          MRI.isLiveIn(MFI->getPreloadedReg(
146                              AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT));
147 
148   // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
149   // instructions aren't used to access the scratch buffer. Inline assembly may
150   // need it though.
151   //
152   // If we only have implicit uses of flat_scr on flat instructions, it is not
153   // really needed.
154   if (Info.UsesFlatScratch && !MFI->hasFlatScratchInit() &&
155       (!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) &&
156        !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) &&
157        !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) {
158     Info.UsesFlatScratch = false;
159   }
160 
161   Info.PrivateSegmentSize = FrameInfo.getStackSize();
162 
  // Assume a large fixed amount of extra stack if there are any variable sized
  // objects, since their true size is unknown.
164   Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
165   if (Info.HasDynamicallySizedStack)
166     Info.PrivateSegmentSize += AssumedStackSizeForDynamicSizeObjects;
167 
168   if (MFI->isStackRealigned())
169     Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value();
170 
171   Info.UsesVCC =
172       MRI.isPhysRegUsed(AMDGPU::VCC_LO) || MRI.isPhysRegUsed(AMDGPU::VCC_HI);
173 
174   // If there are no calls, MachineRegisterInfo can tell us the used register
175   // count easily.
176   // A tail call isn't considered a call for MachineFrameInfo's purposes.
177   if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
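    // Scan each 32-bit register class from the highest register downwards; the
    // first used register gives the highest index and therefore the count.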
178     MCPhysReg HighestVGPRReg = AMDGPU::NoRegister;
179     for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) {
180       if (MRI.isPhysRegUsed(Reg)) {
181         HighestVGPRReg = Reg;
182         break;
183       }
184     }
185 
186     if (ST.hasMAIInsts()) {
187       MCPhysReg HighestAGPRReg = AMDGPU::NoRegister;
188       for (MCPhysReg Reg : reverse(AMDGPU::AGPR_32RegClass.getRegisters())) {
189         if (MRI.isPhysRegUsed(Reg)) {
190           HighestAGPRReg = Reg;
191           break;
192         }
193       }
194       Info.NumAGPR = HighestAGPRReg == AMDGPU::NoRegister
195                          ? 0
196                          : TRI.getHWRegIndex(HighestAGPRReg) + 1;
197     }
198 
199     MCPhysReg HighestSGPRReg = AMDGPU::NoRegister;
200     for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) {
201       if (MRI.isPhysRegUsed(Reg)) {
202         HighestSGPRReg = Reg;
203         break;
204       }
205     }
206 
207     // We found the maximum register index. They start at 0, so add one to get
208     // the number of registers.
209     Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister
210                        ? 0
211                        : TRI.getHWRegIndex(HighestVGPRReg) + 1;
212     Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister
213                                ? 0
214                                : TRI.getHWRegIndex(HighestSGPRReg) + 1;
215 
216     return Info;
217   }
218 
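  // With calls present, MachineRegisterInfo alone is not enough: walk every
  // operand of every instruction and track the highest register index of each
  // kind. -1 means no register of that kind has been seen yet.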
219   int32_t MaxVGPR = -1;
220   int32_t MaxAGPR = -1;
221   int32_t MaxSGPR = -1;
222   uint64_t CalleeFrameSize = 0;
223 
224   for (const MachineBasicBlock &MBB : MF) {
225     for (const MachineInstr &MI : MBB) {
226       // TODO: Check regmasks? Do they occur anywhere except calls?
227       for (const MachineOperand &MO : MI.operands()) {
228         unsigned Width = 0;
229         bool IsSGPR = false;
230         bool IsAGPR = false;
231 
232         if (!MO.isReg())
233           continue;
234 
235         Register Reg = MO.getReg();
236         switch (Reg) {
237         case AMDGPU::EXEC:
238         case AMDGPU::EXEC_LO:
239         case AMDGPU::EXEC_HI:
240         case AMDGPU::SCC:
241         case AMDGPU::M0:
242         case AMDGPU::M0_LO16:
243         case AMDGPU::M0_HI16:
244         case AMDGPU::SRC_SHARED_BASE:
245         case AMDGPU::SRC_SHARED_LIMIT:
246         case AMDGPU::SRC_PRIVATE_BASE:
247         case AMDGPU::SRC_PRIVATE_LIMIT:
248         case AMDGPU::SGPR_NULL:
249         case AMDGPU::MODE:
250           continue;
251 
252         case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
253           llvm_unreachable("src_pops_exiting_wave_id should not be used");
254 
255         case AMDGPU::NoRegister:
256           assert(MI.isDebugInstr() &&
257                  "Instruction uses invalid noreg register");
258           continue;
259 
260         case AMDGPU::VCC:
261         case AMDGPU::VCC_LO:
262         case AMDGPU::VCC_HI:
263         case AMDGPU::VCC_LO_LO16:
264         case AMDGPU::VCC_LO_HI16:
265         case AMDGPU::VCC_HI_LO16:
266         case AMDGPU::VCC_HI_HI16:
267           Info.UsesVCC = true;
268           continue;
269 
270         case AMDGPU::FLAT_SCR:
271         case AMDGPU::FLAT_SCR_LO:
272         case AMDGPU::FLAT_SCR_HI:
273           continue;
274 
275         case AMDGPU::XNACK_MASK:
276         case AMDGPU::XNACK_MASK_LO:
277         case AMDGPU::XNACK_MASK_HI:
278           llvm_unreachable("xnack_mask registers should not be used");
279 
280         case AMDGPU::LDS_DIRECT:
281           llvm_unreachable("lds_direct register should not be used");
282 
283         case AMDGPU::TBA:
284         case AMDGPU::TBA_LO:
285         case AMDGPU::TBA_HI:
286         case AMDGPU::TMA:
287         case AMDGPU::TMA_LO:
288         case AMDGPU::TMA_HI:
289           llvm_unreachable("trap handler registers should not be used");
290 
291         case AMDGPU::SRC_VCCZ:
292           llvm_unreachable("src_vccz register should not be used");
293 
294         case AMDGPU::SRC_EXECZ:
295           llvm_unreachable("src_execz register should not be used");
296 
297         case AMDGPU::SRC_SCC:
298           llvm_unreachable("src_scc register should not be used");
299 
300         default:
301           break;
302         }
303 
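        // Classify the register by its register class: SGPR, VGPR or AGPR, and
        // the number of 32-bit registers it spans.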
304         if (AMDGPU::SReg_32RegClass.contains(Reg) ||
305             AMDGPU::SReg_LO16RegClass.contains(Reg) ||
306             AMDGPU::SGPR_HI16RegClass.contains(Reg)) {
307           assert(!AMDGPU::TTMP_32RegClass.contains(Reg) &&
308                  "trap handler registers should not be used");
309           IsSGPR = true;
310           Width = 1;
311         } else if (AMDGPU::VGPR_32RegClass.contains(Reg) ||
312                    AMDGPU::VGPR_LO16RegClass.contains(Reg) ||
313                    AMDGPU::VGPR_HI16RegClass.contains(Reg)) {
314           IsSGPR = false;
315           Width = 1;
316         } else if (AMDGPU::AGPR_32RegClass.contains(Reg) ||
317                    AMDGPU::AGPR_LO16RegClass.contains(Reg)) {
318           IsSGPR = false;
319           IsAGPR = true;
320           Width = 1;
321         } else if (AMDGPU::SReg_64RegClass.contains(Reg)) {
322           assert(!AMDGPU::TTMP_64RegClass.contains(Reg) &&
323                  "trap handler registers should not be used");
324           IsSGPR = true;
325           Width = 2;
326         } else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
327           IsSGPR = false;
328           Width = 2;
329         } else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
330           IsSGPR = false;
331           IsAGPR = true;
332           Width = 2;
333         } else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
334           IsSGPR = false;
335           Width = 3;
336         } else if (AMDGPU::SReg_96RegClass.contains(Reg)) {
337           IsSGPR = true;
338           Width = 3;
339         } else if (AMDGPU::AReg_96RegClass.contains(Reg)) {
340           IsSGPR = false;
341           IsAGPR = true;
342           Width = 3;
343         } else if (AMDGPU::SReg_128RegClass.contains(Reg)) {
344           assert(!AMDGPU::TTMP_128RegClass.contains(Reg) &&
345                  "trap handler registers should not be used");
346           IsSGPR = true;
347           Width = 4;
348         } else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
349           IsSGPR = false;
350           Width = 4;
351         } else if (AMDGPU::AReg_128RegClass.contains(Reg)) {
352           IsSGPR = false;
353           IsAGPR = true;
354           Width = 4;
355         } else if (AMDGPU::VReg_160RegClass.contains(Reg)) {
356           IsSGPR = false;
357           Width = 5;
358         } else if (AMDGPU::SReg_160RegClass.contains(Reg)) {
359           IsSGPR = true;
360           Width = 5;
361         } else if (AMDGPU::AReg_160RegClass.contains(Reg)) {
362           IsSGPR = false;
363           IsAGPR = true;
364           Width = 5;
365         } else if (AMDGPU::VReg_192RegClass.contains(Reg)) {
366           IsSGPR = false;
367           Width = 6;
368         } else if (AMDGPU::SReg_192RegClass.contains(Reg)) {
369           IsSGPR = true;
370           Width = 6;
371         } else if (AMDGPU::AReg_192RegClass.contains(Reg)) {
372           IsSGPR = false;
373           IsAGPR = true;
374           Width = 6;
375         } else if (AMDGPU::VReg_224RegClass.contains(Reg)) {
376           IsSGPR = false;
377           Width = 7;
378         } else if (AMDGPU::SReg_224RegClass.contains(Reg)) {
379           IsSGPR = true;
380           Width = 7;
381         } else if (AMDGPU::AReg_224RegClass.contains(Reg)) {
382           IsSGPR = false;
383           IsAGPR = true;
384           Width = 7;
385         } else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
386           assert(!AMDGPU::TTMP_256RegClass.contains(Reg) &&
387                  "trap handler registers should not be used");
388           IsSGPR = true;
389           Width = 8;
390         } else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
391           IsSGPR = false;
392           Width = 8;
393         } else if (AMDGPU::AReg_256RegClass.contains(Reg)) {
394           IsSGPR = false;
395           IsAGPR = true;
396           Width = 8;
397         } else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
398           assert(!AMDGPU::TTMP_512RegClass.contains(Reg) &&
399                  "trap handler registers should not be used");
400           IsSGPR = true;
401           Width = 16;
402         } else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
403           IsSGPR = false;
404           Width = 16;
405         } else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
406           IsSGPR = false;
407           IsAGPR = true;
408           Width = 16;
409         } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
410           IsSGPR = true;
411           Width = 32;
412         } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) {
413           IsSGPR = false;
414           Width = 32;
415         } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
416           IsSGPR = false;
417           IsAGPR = true;
418           Width = 32;
419         } else {
420           llvm_unreachable("Unknown register class");
421         }
422         unsigned HWReg = TRI.getHWRegIndex(Reg);
423         int MaxUsed = HWReg + Width - 1;
424         if (IsSGPR) {
425           MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
426         } else if (IsAGPR) {
427           MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR;
428         } else {
429           MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
430         }
431       }
432 
433       if (MI.isCall()) {
434         // Pseudo used just to encode the underlying global. Is there a better
435         // way to track this?
436 
437         const MachineOperand *CalleeOp =
438             TII->getNamedOperand(MI, AMDGPU::OpName::callee);
439 
440         const Function *Callee = getCalleeFunction(*CalleeOp);
441         DenseMap<const Function *, SIFunctionResourceInfo>::const_iterator I =
442             CallGraphResourceInfo.end();
443 
444         // Avoid crashing on undefined behavior with an illegal call to a
445         // kernel. If a callsite's calling convention doesn't match the
446         // function's, it's undefined behavior. If the callsite calling
447         // convention does match, that would have errored earlier.
448         if (Callee && AMDGPU::isEntryFunctionCC(Callee->getCallingConv()))
449           report_fatal_error("invalid call to entry function");
450 
451         bool IsIndirect = !Callee || Callee->isDeclaration();
452         if (!IsIndirect)
453           I = CallGraphResourceInfo.find(Callee);
454 
455         // FIXME: Call site could have norecurse on it
456         if (!Callee || !Callee->doesNotRecurse()) {
457           Info.HasRecursion = true;
458 
459           // TODO: If we happen to know there is no stack usage in the
460           // callgraph, we don't need to assume an infinitely growing stack.
461           if (!MI.isReturn()) {
462             // We don't need to assume an unknown stack size for tail calls.
463 
464             // FIXME: This only benefits in the case where the kernel does not
465             // directly call the tail called function. If a kernel directly
466             // calls a tail recursive function, we'll assume maximum stack size
467             // based on the regular call instruction.
            CalleeFrameSize = std::max(
                CalleeFrameSize,
                static_cast<uint64_t>(AssumedStackSizeForExternalCall));
471           }
472         }
473 
474         if (IsIndirect || I == CallGraphResourceInfo.end()) {
475           CalleeFrameSize =
476               std::max(CalleeFrameSize,
477                        static_cast<uint64_t>(AssumedStackSizeForExternalCall));
478 
479           // Register usage of indirect calls gets handled later
480           Info.UsesVCC = true;
481           Info.UsesFlatScratch = ST.hasFlatAddressSpace();
482           Info.HasDynamicallySizedStack = true;
483           Info.HasIndirectCall = true;
484         } else {
485           // We force CodeGen to run in SCC order, so the callee's register
486           // usage etc. should be the cumulative usage of all callees.
487           MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR);
488           MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR);
489           MaxAGPR = std::max(I->second.NumAGPR - 1, MaxAGPR);
490           CalleeFrameSize =
491               std::max(I->second.PrivateSegmentSize, CalleeFrameSize);
492           Info.UsesVCC |= I->second.UsesVCC;
493           Info.UsesFlatScratch |= I->second.UsesFlatScratch;
494           Info.HasDynamicallySizedStack |= I->second.HasDynamicallySizedStack;
495           Info.HasRecursion |= I->second.HasRecursion;
496           Info.HasIndirectCall |= I->second.HasIndirectCall;
497         }
498       }
499     }
500   }
501 
502   Info.NumExplicitSGPR = MaxSGPR + 1;
503   Info.NumVGPR = MaxVGPR + 1;
504   Info.NumAGPR = MaxAGPR + 1;
505   Info.PrivateSegmentSize += CalleeFrameSize;
506 
507   return Info;
508 }
509 
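// Functions containing indirect calls inherit the worst-case register usage of
// every possible callee. For example, if the non-entrypoint functions in the
// module use at most 72 VGPRs, a function making an indirect call is assumed
// to use at least 72 VGPRs as well.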
510 void AMDGPUResourceUsageAnalysis::propagateIndirectCallRegisterUsage() {
511   // Collect the maximum number of registers from non-hardware-entrypoints.
512   // All these functions are potential targets for indirect calls.
513   int32_t NonKernelMaxSGPRs = 0;
514   int32_t NonKernelMaxVGPRs = 0;
515   int32_t NonKernelMaxAGPRs = 0;
516 
517   for (const auto &I : CallGraphResourceInfo) {
518     if (!AMDGPU::isEntryFunctionCC(I.getFirst()->getCallingConv())) {
519       auto &Info = I.getSecond();
520       NonKernelMaxSGPRs = std::max(NonKernelMaxSGPRs, Info.NumExplicitSGPR);
521       NonKernelMaxVGPRs = std::max(NonKernelMaxVGPRs, Info.NumVGPR);
522       NonKernelMaxAGPRs = std::max(NonKernelMaxAGPRs, Info.NumAGPR);
523     }
524   }
525 
526   // Add register usage for functions with indirect calls.
527   // For calls to unknown functions, we assume the maximum register usage of
528   // all non-hardware-entrypoints in the current module.
529   for (auto &I : CallGraphResourceInfo) {
530     auto &Info = I.getSecond();
531     if (Info.HasIndirectCall) {
532       Info.NumExplicitSGPR = std::max(Info.NumExplicitSGPR, NonKernelMaxSGPRs);
533       Info.NumVGPR = std::max(Info.NumVGPR, NonKernelMaxVGPRs);
534       Info.NumAGPR = std::max(Info.NumAGPR, NonKernelMaxAGPRs);
535     }
536   }
537 }
538