//===- AMDGPUResourceUsageAnalysis.cpp -- analysis of resources -----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// \brief Analyzes how many registers and other resources are used by
11 /// functions.
12 ///
13 /// The results of this analysis are used to fill the register usage, flat
14 /// usage, etc. into hardware registers.
15 ///
16 /// The analysis takes callees into account. E.g. if a function A that needs 10
17 /// VGPRs calls a function B that needs 20 VGPRs, querying the VGPR usage of A
18 /// will return 20.
19 /// It is assumed that an indirect call can go into any function except
20 /// hardware-entrypoints. Therefore the register usage of functions with
21 /// indirect calls is estimated as the maximum of all non-entrypoint functions
22 /// in the module.
23 ///
24 //===----------------------------------------------------------------------===//
25 
26 #include "AMDGPUResourceUsageAnalysis.h"
27 #include "AMDGPU.h"
28 #include "GCNSubtarget.h"
29 #include "SIMachineFunctionInfo.h"
30 #include "llvm/ADT/PostOrderIterator.h"
31 #include "llvm/Analysis/CallGraph.h"
32 #include "llvm/CodeGen/MachineFrameInfo.h"
33 #include "llvm/CodeGen/TargetPassConfig.h"
34 #include "llvm/IR/GlobalAlias.h"
35 #include "llvm/IR/GlobalValue.h"
36 #include "llvm/Target/TargetMachine.h"
37 
38 using namespace llvm;
39 using namespace llvm::AMDGPU;
40 
41 #define DEBUG_TYPE "amdgpu-resource-usage"
42 
// Definition of the pass's static ID member, plus a reference to it that is
// exported under the name AMDGPUResourceUsageAnalysisID.
char llvm::AMDGPUResourceUsageAnalysis::ID = 0;
char &llvm::AMDGPUResourceUsageAnalysisID = AMDGPUResourceUsageAnalysis::ID;
45 
// We need to tell the runtime some amount ahead of time if we don't know the
// true stack size. Assume a smaller number if this is only due to dynamic /
// non-entry block allocas.

// Hidden option, default 16384 bytes. analyzeResourceUsage uses it as a lower
// bound for the callee frame size when a call's target cannot be analyzed
// (no known callee, declaration only, or no recorded resource info) or when a
// non-tail call may recurse.
static cl::opt<uint32_t> AssumedStackSizeForExternalCall(
    "amdgpu-assume-external-call-stack-size",
    cl::desc("Assumed stack use of any external call (in bytes)"), cl::Hidden,
    cl::init(16384));

// Hidden option, default 4096 bytes. analyzeResourceUsage adds it to the
// private segment size when the frame has variable sized objects.
static cl::opt<uint32_t> AssumedStackSizeForDynamicSizeObjects(
    "amdgpu-assume-dynamic-stack-object-size",
    cl::desc("Assumed extra stack use if there are any "
             "variable sized objects (in bytes)"),
    cl::Hidden, cl::init(4096));
59 
// Registers the pass under the DEBUG_TYPE argument string
// ("amdgpu-resource-usage") with the human-readable name below.
// NOTE(review): the trailing `true, true` are presumably the CFG-only and
// is-analysis flags of INITIALIZE_PASS -- confirm against PassSupport.h.
INITIALIZE_PASS(AMDGPUResourceUsageAnalysis, DEBUG_TYPE,
                "Function register usage analysis", true, true)
62 
63 static const Function *getCalleeFunction(const MachineOperand &Op) {
64   if (Op.isImm()) {
65     assert(Op.getImm() == 0);
66     return nullptr;
67   }
68   if (auto *GA = dyn_cast<GlobalAlias>(Op.getGlobal()))
69     return cast<Function>(GA->getOperand(0));
70   return cast<Function>(Op.getGlobal());
71 }
72 
73 static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,
74                                   const SIInstrInfo &TII, unsigned Reg) {
75   for (const MachineOperand &UseOp : MRI.reg_operands(Reg)) {
76     if (!UseOp.isImplicit() || !TII.isFLAT(*UseOp.getParent()))
77       return true;
78   }
79 
80   return false;
81 }
82 
83 int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumSGPRs(
84     const GCNSubtarget &ST) const {
85   return NumExplicitSGPR +
86          IsaInfo::getNumExtraSGPRs(&ST, UsesVCC, UsesFlatScratch,
87                                    ST.getTargetID().isXnackOnOrAny());
88 }
89 
90 int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
91     const GCNSubtarget &ST, int32_t ArgNumAGPR, int32_t ArgNumVGPR) const {
92   return AMDGPU::getTotalNumVGPRs(ST.hasGFX90AInsts(), ArgNumAGPR, ArgNumVGPR);
93 }
94 
95 int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
96     const GCNSubtarget &ST) const {
97   return getTotalNumVGPRs(ST, NumAGPR, NumVGPR);
98 }
99 
100 bool AMDGPUResourceUsageAnalysis::runOnModule(Module &M) {
101   auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
102   if (!TPC)
103     return false;
104 
105   MachineModuleInfo &MMI = getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
106   const TargetMachine &TM = TPC->getTM<TargetMachine>();
107   bool HasIndirectCall = false;
108 
109   CallGraph CG = CallGraph(M);
110   auto End = po_end(&CG);
111 
112   for (auto IT = po_begin(&CG); IT != End; ++IT) {
113     Function *F = IT->getFunction();
114     if (!F || F->isDeclaration())
115       continue;
116 
117     MachineFunction *MF = MMI.getMachineFunction(*F);
118     assert(MF && "function must have been generated already");
119 
120     auto CI = CallGraphResourceInfo.insert(
121         std::make_pair(F, SIFunctionResourceInfo()));
122     SIFunctionResourceInfo &Info = CI.first->second;
123     assert(CI.second && "should only be called once per function");
124     Info = analyzeResourceUsage(*MF, TM);
125     HasIndirectCall |= Info.HasIndirectCall;
126   }
127 
128   if (HasIndirectCall)
129     propagateIndirectCallRegisterUsage();
130 
131   return false;
132 }
133 
134 AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo
135 AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
136     const MachineFunction &MF, const TargetMachine &TM) const {
137   SIFunctionResourceInfo Info;
138 
139   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
140   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
141   const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
142   const MachineRegisterInfo &MRI = MF.getRegInfo();
143   const SIInstrInfo *TII = ST.getInstrInfo();
144   const SIRegisterInfo &TRI = TII->getRegisterInfo();
145 
146   Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
147                          MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI) ||
148                          MRI.isLiveIn(MFI->getPreloadedReg(
149                              AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT));
150 
151   // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
152   // instructions aren't used to access the scratch buffer. Inline assembly may
153   // need it though.
154   //
155   // If we only have implicit uses of flat_scr on flat instructions, it is not
156   // really needed.
157   if (Info.UsesFlatScratch && !MFI->hasFlatScratchInit() &&
158       (!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) &&
159        !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) &&
160        !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) {
161     Info.UsesFlatScratch = false;
162   }
163 
164   Info.PrivateSegmentSize = FrameInfo.getStackSize();
165 
166   // Assume a big number if there are any unknown sized objects.
167   Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
168   if (Info.HasDynamicallySizedStack)
169     Info.PrivateSegmentSize += AssumedStackSizeForDynamicSizeObjects;
170 
171   if (MFI->isStackRealigned())
172     Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value();
173 
174   Info.UsesVCC =
175       MRI.isPhysRegUsed(AMDGPU::VCC_LO) || MRI.isPhysRegUsed(AMDGPU::VCC_HI);
176 
177   // If there are no calls, MachineRegisterInfo can tell us the used register
178   // count easily.
179   // A tail call isn't considered a call for MachineFrameInfo's purposes.
180   if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
181     MCPhysReg HighestVGPRReg = AMDGPU::NoRegister;
182     for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) {
183       if (MRI.isPhysRegUsed(Reg)) {
184         HighestVGPRReg = Reg;
185         break;
186       }
187     }
188 
189     if (ST.hasMAIInsts()) {
190       MCPhysReg HighestAGPRReg = AMDGPU::NoRegister;
191       for (MCPhysReg Reg : reverse(AMDGPU::AGPR_32RegClass.getRegisters())) {
192         if (MRI.isPhysRegUsed(Reg)) {
193           HighestAGPRReg = Reg;
194           break;
195         }
196       }
197       Info.NumAGPR = HighestAGPRReg == AMDGPU::NoRegister
198                          ? 0
199                          : TRI.getHWRegIndex(HighestAGPRReg) + 1;
200     }
201 
202     MCPhysReg HighestSGPRReg = AMDGPU::NoRegister;
203     for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) {
204       if (MRI.isPhysRegUsed(Reg)) {
205         HighestSGPRReg = Reg;
206         break;
207       }
208     }
209 
210     // We found the maximum register index. They start at 0, so add one to get
211     // the number of registers.
212     Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister
213                        ? 0
214                        : TRI.getHWRegIndex(HighestVGPRReg) + 1;
215     Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister
216                                ? 0
217                                : TRI.getHWRegIndex(HighestSGPRReg) + 1;
218 
219     return Info;
220   }
221 
222   int32_t MaxVGPR = -1;
223   int32_t MaxAGPR = -1;
224   int32_t MaxSGPR = -1;
225   uint64_t CalleeFrameSize = 0;
226 
227   for (const MachineBasicBlock &MBB : MF) {
228     for (const MachineInstr &MI : MBB) {
229       // TODO: Check regmasks? Do they occur anywhere except calls?
230       for (const MachineOperand &MO : MI.operands()) {
231         unsigned Width = 0;
232         bool IsSGPR = false;
233         bool IsAGPR = false;
234 
235         if (!MO.isReg())
236           continue;
237 
238         Register Reg = MO.getReg();
239         switch (Reg) {
240         case AMDGPU::EXEC:
241         case AMDGPU::EXEC_LO:
242         case AMDGPU::EXEC_HI:
243         case AMDGPU::SCC:
244         case AMDGPU::M0:
245         case AMDGPU::M0_LO16:
246         case AMDGPU::M0_HI16:
247         case AMDGPU::SRC_SHARED_BASE:
248         case AMDGPU::SRC_SHARED_LIMIT:
249         case AMDGPU::SRC_PRIVATE_BASE:
250         case AMDGPU::SRC_PRIVATE_LIMIT:
251         case AMDGPU::SGPR_NULL:
252         case AMDGPU::SGPR_NULL64:
253         case AMDGPU::MODE:
254           continue;
255 
256         case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
257           llvm_unreachable("src_pops_exiting_wave_id should not be used");
258 
259         case AMDGPU::NoRegister:
260           assert(MI.isDebugInstr() &&
261                  "Instruction uses invalid noreg register");
262           continue;
263 
264         case AMDGPU::VCC:
265         case AMDGPU::VCC_LO:
266         case AMDGPU::VCC_HI:
267         case AMDGPU::VCC_LO_LO16:
268         case AMDGPU::VCC_LO_HI16:
269         case AMDGPU::VCC_HI_LO16:
270         case AMDGPU::VCC_HI_HI16:
271           Info.UsesVCC = true;
272           continue;
273 
274         case AMDGPU::FLAT_SCR:
275         case AMDGPU::FLAT_SCR_LO:
276         case AMDGPU::FLAT_SCR_HI:
277           continue;
278 
279         case AMDGPU::XNACK_MASK:
280         case AMDGPU::XNACK_MASK_LO:
281         case AMDGPU::XNACK_MASK_HI:
282           llvm_unreachable("xnack_mask registers should not be used");
283 
284         case AMDGPU::LDS_DIRECT:
285           llvm_unreachable("lds_direct register should not be used");
286 
287         case AMDGPU::TBA:
288         case AMDGPU::TBA_LO:
289         case AMDGPU::TBA_HI:
290         case AMDGPU::TMA:
291         case AMDGPU::TMA_LO:
292         case AMDGPU::TMA_HI:
293           llvm_unreachable("trap handler registers should not be used");
294 
295         case AMDGPU::SRC_VCCZ:
296           llvm_unreachable("src_vccz register should not be used");
297 
298         case AMDGPU::SRC_EXECZ:
299           llvm_unreachable("src_execz register should not be used");
300 
301         case AMDGPU::SRC_SCC:
302           llvm_unreachable("src_scc register should not be used");
303 
304         default:
305           break;
306         }
307 
308         if (AMDGPU::SReg_32RegClass.contains(Reg) ||
309             AMDGPU::SReg_LO16RegClass.contains(Reg) ||
310             AMDGPU::SGPR_HI16RegClass.contains(Reg)) {
311           assert(!AMDGPU::TTMP_32RegClass.contains(Reg) &&
312                  "trap handler registers should not be used");
313           IsSGPR = true;
314           Width = 1;
315         } else if (AMDGPU::VGPR_32RegClass.contains(Reg) ||
316                    AMDGPU::VGPR_LO16RegClass.contains(Reg) ||
317                    AMDGPU::VGPR_HI16RegClass.contains(Reg)) {
318           IsSGPR = false;
319           Width = 1;
320         } else if (AMDGPU::AGPR_32RegClass.contains(Reg) ||
321                    AMDGPU::AGPR_LO16RegClass.contains(Reg)) {
322           IsSGPR = false;
323           IsAGPR = true;
324           Width = 1;
325         } else if (AMDGPU::SReg_64RegClass.contains(Reg)) {
326           assert(!AMDGPU::TTMP_64RegClass.contains(Reg) &&
327                  "trap handler registers should not be used");
328           IsSGPR = true;
329           Width = 2;
330         } else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
331           IsSGPR = false;
332           Width = 2;
333         } else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
334           IsSGPR = false;
335           IsAGPR = true;
336           Width = 2;
337         } else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
338           IsSGPR = false;
339           Width = 3;
340         } else if (AMDGPU::SReg_96RegClass.contains(Reg)) {
341           IsSGPR = true;
342           Width = 3;
343         } else if (AMDGPU::AReg_96RegClass.contains(Reg)) {
344           IsSGPR = false;
345           IsAGPR = true;
346           Width = 3;
347         } else if (AMDGPU::SReg_128RegClass.contains(Reg)) {
348           assert(!AMDGPU::TTMP_128RegClass.contains(Reg) &&
349                  "trap handler registers should not be used");
350           IsSGPR = true;
351           Width = 4;
352         } else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
353           IsSGPR = false;
354           Width = 4;
355         } else if (AMDGPU::AReg_128RegClass.contains(Reg)) {
356           IsSGPR = false;
357           IsAGPR = true;
358           Width = 4;
359         } else if (AMDGPU::VReg_160RegClass.contains(Reg)) {
360           IsSGPR = false;
361           Width = 5;
362         } else if (AMDGPU::SReg_160RegClass.contains(Reg)) {
363           IsSGPR = true;
364           Width = 5;
365         } else if (AMDGPU::AReg_160RegClass.contains(Reg)) {
366           IsSGPR = false;
367           IsAGPR = true;
368           Width = 5;
369         } else if (AMDGPU::VReg_192RegClass.contains(Reg)) {
370           IsSGPR = false;
371           Width = 6;
372         } else if (AMDGPU::SReg_192RegClass.contains(Reg)) {
373           IsSGPR = true;
374           Width = 6;
375         } else if (AMDGPU::AReg_192RegClass.contains(Reg)) {
376           IsSGPR = false;
377           IsAGPR = true;
378           Width = 6;
379         } else if (AMDGPU::VReg_224RegClass.contains(Reg)) {
380           IsSGPR = false;
381           Width = 7;
382         } else if (AMDGPU::SReg_224RegClass.contains(Reg)) {
383           IsSGPR = true;
384           Width = 7;
385         } else if (AMDGPU::AReg_224RegClass.contains(Reg)) {
386           IsSGPR = false;
387           IsAGPR = true;
388           Width = 7;
389         } else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
390           assert(!AMDGPU::TTMP_256RegClass.contains(Reg) &&
391                  "trap handler registers should not be used");
392           IsSGPR = true;
393           Width = 8;
394         } else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
395           IsSGPR = false;
396           Width = 8;
397         } else if (AMDGPU::AReg_256RegClass.contains(Reg)) {
398           IsSGPR = false;
399           IsAGPR = true;
400           Width = 8;
401         } else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
402           assert(!AMDGPU::TTMP_512RegClass.contains(Reg) &&
403                  "trap handler registers should not be used");
404           IsSGPR = true;
405           Width = 16;
406         } else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
407           IsSGPR = false;
408           Width = 16;
409         } else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
410           IsSGPR = false;
411           IsAGPR = true;
412           Width = 16;
413         } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
414           IsSGPR = true;
415           Width = 32;
416         } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) {
417           IsSGPR = false;
418           Width = 32;
419         } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
420           IsSGPR = false;
421           IsAGPR = true;
422           Width = 32;
423         } else {
424           llvm_unreachable("Unknown register class");
425         }
426         unsigned HWReg = TRI.getHWRegIndex(Reg);
427         int MaxUsed = HWReg + Width - 1;
428         if (IsSGPR) {
429           MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
430         } else if (IsAGPR) {
431           MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR;
432         } else {
433           MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
434         }
435       }
436 
437       if (MI.isCall()) {
438         // Pseudo used just to encode the underlying global. Is there a better
439         // way to track this?
440 
441         const MachineOperand *CalleeOp =
442             TII->getNamedOperand(MI, AMDGPU::OpName::callee);
443 
444         const Function *Callee = getCalleeFunction(*CalleeOp);
445         DenseMap<const Function *, SIFunctionResourceInfo>::const_iterator I =
446             CallGraphResourceInfo.end();
447 
448         // Avoid crashing on undefined behavior with an illegal call to a
449         // kernel. If a callsite's calling convention doesn't match the
450         // function's, it's undefined behavior. If the callsite calling
451         // convention does match, that would have errored earlier.
452         if (Callee && AMDGPU::isEntryFunctionCC(Callee->getCallingConv()))
453           report_fatal_error("invalid call to entry function");
454 
455         bool IsIndirect = !Callee || Callee->isDeclaration();
456         if (!IsIndirect)
457           I = CallGraphResourceInfo.find(Callee);
458 
459         // FIXME: Call site could have norecurse on it
460         if (!Callee || !Callee->doesNotRecurse()) {
461           Info.HasRecursion = true;
462 
463           // TODO: If we happen to know there is no stack usage in the
464           // callgraph, we don't need to assume an infinitely growing stack.
465           if (!MI.isReturn()) {
466             // We don't need to assume an unknown stack size for tail calls.
467 
468             // FIXME: This only benefits in the case where the kernel does not
469             // directly call the tail called function. If a kernel directly
470             // calls a tail recursive function, we'll assume maximum stack size
471             // based on the regular call instruction.
472             CalleeFrameSize =
473               std::max(CalleeFrameSize,
474                        static_cast<uint64_t>(AssumedStackSizeForExternalCall));
475           }
476         }
477 
478         if (IsIndirect || I == CallGraphResourceInfo.end()) {
479           CalleeFrameSize =
480               std::max(CalleeFrameSize,
481                        static_cast<uint64_t>(AssumedStackSizeForExternalCall));
482 
483           // Register usage of indirect calls gets handled later
484           Info.UsesVCC = true;
485           Info.UsesFlatScratch = ST.hasFlatAddressSpace();
486           Info.HasDynamicallySizedStack = true;
487           Info.HasIndirectCall = true;
488         } else {
489           // We force CodeGen to run in SCC order, so the callee's register
490           // usage etc. should be the cumulative usage of all callees.
491           MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR);
492           MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR);
493           MaxAGPR = std::max(I->second.NumAGPR - 1, MaxAGPR);
494           CalleeFrameSize =
495               std::max(I->second.PrivateSegmentSize, CalleeFrameSize);
496           Info.UsesVCC |= I->second.UsesVCC;
497           Info.UsesFlatScratch |= I->second.UsesFlatScratch;
498           Info.HasDynamicallySizedStack |= I->second.HasDynamicallySizedStack;
499           Info.HasRecursion |= I->second.HasRecursion;
500           Info.HasIndirectCall |= I->second.HasIndirectCall;
501         }
502       }
503     }
504   }
505 
506   Info.NumExplicitSGPR = MaxSGPR + 1;
507   Info.NumVGPR = MaxVGPR + 1;
508   Info.NumAGPR = MaxAGPR + 1;
509   Info.PrivateSegmentSize += CalleeFrameSize;
510 
511   return Info;
512 }
513 
514 void AMDGPUResourceUsageAnalysis::propagateIndirectCallRegisterUsage() {
515   // Collect the maximum number of registers from non-hardware-entrypoints.
516   // All these functions are potential targets for indirect calls.
517   int32_t NonKernelMaxSGPRs = 0;
518   int32_t NonKernelMaxVGPRs = 0;
519   int32_t NonKernelMaxAGPRs = 0;
520 
521   for (const auto &I : CallGraphResourceInfo) {
522     if (!AMDGPU::isEntryFunctionCC(I.getFirst()->getCallingConv())) {
523       auto &Info = I.getSecond();
524       NonKernelMaxSGPRs = std::max(NonKernelMaxSGPRs, Info.NumExplicitSGPR);
525       NonKernelMaxVGPRs = std::max(NonKernelMaxVGPRs, Info.NumVGPR);
526       NonKernelMaxAGPRs = std::max(NonKernelMaxAGPRs, Info.NumAGPR);
527     }
528   }
529 
530   // Add register usage for functions with indirect calls.
531   // For calls to unknown functions, we assume the maximum register usage of
532   // all non-hardware-entrypoints in the current module.
533   for (auto &I : CallGraphResourceInfo) {
534     auto &Info = I.getSecond();
535     if (Info.HasIndirectCall) {
536       Info.NumExplicitSGPR = std::max(Info.NumExplicitSGPR, NonKernelMaxSGPRs);
537       Info.NumVGPR = std::max(Info.NumVGPR, NonKernelMaxVGPRs);
538       Info.NumAGPR = std::max(Info.NumAGPR, NonKernelMaxAGPRs);
539     }
540   }
541 }
542