//===- AMDGPUResourceUsageAnalysis.cpp ---- analysis of resources ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Analyzes how many registers and other resources are used by
/// functions.
///
/// The results of this analysis are used to fill the register usage, flat
/// usage, etc. into hardware registers.
///
/// The analysis takes callees into account. E.g. if a function A that needs 10
/// VGPRs calls a function B that needs 20 VGPRs, querying the VGPR usage of A
/// will return 20.
/// It is assumed that an indirect call can go into any function except
/// hardware-entrypoints. Therefore the register usage of functions with
/// indirect calls is estimated as the maximum of all non-entrypoint functions
/// in the module.
///
//===----------------------------------------------------------------------===//

#include "AMDGPUResourceUsageAnalysis.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;
using namespace llvm::AMDGPU;

#define DEBUG_TYPE "amdgpu-resource-usage"

char llvm::AMDGPUResourceUsageAnalysis::ID = 0;
char &llvm::AMDGPUResourceUsageAnalysisID = AMDGPUResourceUsageAnalysis::ID;

// We need to tell the runtime some amount ahead of time if we don't know the
// true stack size. Assume a smaller number if this is only due to dynamic /
// non-entry block allocas.
static cl::opt<uint32_t> AssumedStackSizeForExternalCall(
    "amdgpu-assume-external-call-stack-size",
    cl::desc("Assumed stack use of any external call (in bytes)"), cl::Hidden,
    cl::init(16384));

static cl::opt<uint32_t> AssumedStackSizeForDynamicSizeObjects(
    "amdgpu-assume-dynamic-stack-object-size",
    cl::desc("Assumed extra stack use if there are any "
             "variable sized objects (in bytes)"),
    cl::Hidden, cl::init(4096));

INITIALIZE_PASS(AMDGPUResourceUsageAnalysis, DEBUG_TYPE,
                "Function register usage analysis", true, true)

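// Return the statically known callee of a call pseudo, or null if the callee
// operand is an immediate (a placeholder 0 used when the target is unknown).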
static const Function *getCalleeFunction(const MachineOperand &Op) {
  if (Op.isImm()) {
    assert(Op.getImm() == 0);
    return nullptr;
  }

  return cast<Function>(Op.getGlobal());
}

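// Return true if Reg has any use that is not an implicit operand of a FLAT
// instruction. Implicit flat_scr uses on FLAT instructions alone do not force
// the function to set up flat scratch.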
static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,
                                  const SIInstrInfo &TII, unsigned Reg) {
  for (const MachineOperand &UseOp : MRI.reg_operands(Reg)) {
    if (!UseOp.isImplicit() || !TII.isFLAT(*UseOp.getParent()))
      return true;
  }

  return false;
}

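// Total SGPR count: the explicitly used SGPRs plus the extra SGPRs the target
// reserves for VCC, flat scratch and XNACK, as reported by getNumExtraSGPRs().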
int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumSGPRs(
    const GCNSubtarget &ST) const {
  return NumExplicitSGPR +
         IsaInfo::getNumExtraSGPRs(&ST, UsesVCC, UsesFlatScratch,
                                   ST.getTargetID().isXnackOnOrAny());
}

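// Total VGPR count: with GFX90A instructions the AGPRs are allocated after the
// VGPRs in the same register file (VGPR count rounded up to a multiple of 4),
// so the two counts are summed; otherwise report the larger of the two
// separate counts.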
int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
    const GCNSubtarget &ST) const {
  if (ST.hasGFX90AInsts() && NumAGPR)
    return alignTo(NumVGPR, 4) + NumAGPR;
  return std::max(NumVGPR, NumAGPR);
}

bool AMDGPUResourceUsageAnalysis::runOnSCC(CallGraphSCC &SCC) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  const TargetMachine &TM = TPC->getTM<TargetMachine>();
  bool HasIndirectCall = false;

  for (CallGraphNode *I : SCC) {
    Function *F = I->getFunction();
    if (!F || F->isDeclaration())
      continue;

    MachineModuleInfo &MMI =
        getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
    MachineFunction &MF = MMI.getOrCreateMachineFunction(*F);

    auto CI = CallGraphResourceInfo.insert(
        std::make_pair(&MF.getFunction(), SIFunctionResourceInfo()));
    SIFunctionResourceInfo &Info = CI.first->second;
    assert(CI.second && "should only be called once per function");
    Info = analyzeResourceUsage(MF, TM);
    HasIndirectCall |= Info.HasIndirectCall;
  }

  if (HasIndirectCall)
    propagateIndirectCallRegisterUsage();

  return false;
}

AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo
AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
    const MachineFunction &MF, const TargetMachine &TM) const {
  SIFunctionResourceInfo Info;

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

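  // Conservatively assume flat scratch is used if either flat_scr half is used
  // or the preloaded flat scratch init argument is live in.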
  Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
                         MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI) ||
                         MRI.isLiveIn(MFI->getPreloadedReg(
                             AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT));

  // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
  // instructions aren't used to access the scratch buffer. Inline assembly may
  // need it though.
  //
  // If we only have implicit uses of flat_scr on flat instructions, it is not
  // really needed.
  if (Info.UsesFlatScratch && !MFI->hasFlatScratchInit() &&
      (!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) {
    Info.UsesFlatScratch = false;
  }

  Info.PrivateSegmentSize = FrameInfo.getStackSize();

  // Assume a big number if there are any unknown sized objects.
  Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
  if (Info.HasDynamicallySizedStack)
    Info.PrivateSegmentSize += AssumedStackSizeForDynamicSizeObjects;

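  // If the stack is realigned, reserve enough extra space for the worst-case
  // realignment padding.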
  if (MFI->isStackRealigned())
    Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value();

  Info.UsesVCC =
      MRI.isPhysRegUsed(AMDGPU::VCC_LO) || MRI.isPhysRegUsed(AMDGPU::VCC_HI);

  // If there are no calls, MachineRegisterInfo can tell us the used register
  // count easily.
  // A tail call isn't considered a call for MachineFrameInfo's purposes.
  if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
    MCPhysReg HighestVGPRReg = AMDGPU::NoRegister;
    for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) {
      if (MRI.isPhysRegUsed(Reg)) {
        HighestVGPRReg = Reg;
        break;
      }
    }

    if (ST.hasMAIInsts()) {
      MCPhysReg HighestAGPRReg = AMDGPU::NoRegister;
      for (MCPhysReg Reg : reverse(AMDGPU::AGPR_32RegClass.getRegisters())) {
        if (MRI.isPhysRegUsed(Reg)) {
          HighestAGPRReg = Reg;
          break;
        }
      }
      Info.NumAGPR = HighestAGPRReg == AMDGPU::NoRegister
                         ? 0
                         : TRI.getHWRegIndex(HighestAGPRReg) + 1;
    }

    MCPhysReg HighestSGPRReg = AMDGPU::NoRegister;
    for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) {
      if (MRI.isPhysRegUsed(Reg)) {
        HighestSGPRReg = Reg;
        break;
      }
    }

    // We found the maximum register index. They start at 0, so add one to get
    // the number of registers.
    Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister
                       ? 0
                       : TRI.getHWRegIndex(HighestVGPRReg) + 1;
    Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister
                               ? 0
                               : TRI.getHWRegIndex(HighestSGPRReg) + 1;

    return Info;
  }

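  // With calls present, walk every operand of every instruction and track the
  // highest register index touched, then fold in the callee maxima at each
  // call site below.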
  int32_t MaxVGPR = -1;
  int32_t MaxAGPR = -1;
  int32_t MaxSGPR = -1;
  uint64_t CalleeFrameSize = 0;

  for (const MachineBasicBlock &MBB : MF) {
    for (const MachineInstr &MI : MBB) {
      // TODO: Check regmasks? Do they occur anywhere except calls?
      for (const MachineOperand &MO : MI.operands()) {
        unsigned Width = 0;
        bool IsSGPR = false;
        bool IsAGPR = false;

        if (!MO.isReg())
          continue;

        Register Reg = MO.getReg();
        switch (Reg) {
        case AMDGPU::EXEC:
        case AMDGPU::EXEC_LO:
        case AMDGPU::EXEC_HI:
        case AMDGPU::SCC:
        case AMDGPU::M0:
        case AMDGPU::M0_LO16:
        case AMDGPU::M0_HI16:
        case AMDGPU::SRC_SHARED_BASE:
        case AMDGPU::SRC_SHARED_LIMIT:
        case AMDGPU::SRC_PRIVATE_BASE:
        case AMDGPU::SRC_PRIVATE_LIMIT:
        case AMDGPU::SGPR_NULL:
        case AMDGPU::MODE:
          continue;

        case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
          llvm_unreachable("src_pops_exiting_wave_id should not be used");

        case AMDGPU::NoRegister:
          assert(MI.isDebugInstr() &&
                 "Instruction uses invalid noreg register");
          continue;

        case AMDGPU::VCC:
        case AMDGPU::VCC_LO:
        case AMDGPU::VCC_HI:
        case AMDGPU::VCC_LO_LO16:
        case AMDGPU::VCC_LO_HI16:
        case AMDGPU::VCC_HI_LO16:
        case AMDGPU::VCC_HI_HI16:
          Info.UsesVCC = true;
          continue;

        case AMDGPU::FLAT_SCR:
        case AMDGPU::FLAT_SCR_LO:
        case AMDGPU::FLAT_SCR_HI:
          continue;

        case AMDGPU::XNACK_MASK:
        case AMDGPU::XNACK_MASK_LO:
        case AMDGPU::XNACK_MASK_HI:
          llvm_unreachable("xnack_mask registers should not be used");

        case AMDGPU::LDS_DIRECT:
          llvm_unreachable("lds_direct register should not be used");

        case AMDGPU::TBA:
        case AMDGPU::TBA_LO:
        case AMDGPU::TBA_HI:
        case AMDGPU::TMA:
        case AMDGPU::TMA_LO:
        case AMDGPU::TMA_HI:
          llvm_unreachable("trap handler registers should not be used");

        case AMDGPU::SRC_VCCZ:
          llvm_unreachable("src_vccz register should not be used");

        case AMDGPU::SRC_EXECZ:
          llvm_unreachable("src_execz register should not be used");

        case AMDGPU::SRC_SCC:
          llvm_unreachable("src_scc register should not be used");

        default:
          break;
        }

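        // Classify the register as SGPR, VGPR or AGPR and compute Width, the
        // number of 32-bit registers it spans.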
        if (AMDGPU::SReg_32RegClass.contains(Reg) ||
            AMDGPU::SReg_LO16RegClass.contains(Reg) ||
            AMDGPU::SGPR_HI16RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_32RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 1;
        } else if (AMDGPU::VGPR_32RegClass.contains(Reg) ||
                   AMDGPU::VGPR_LO16RegClass.contains(Reg) ||
                   AMDGPU::VGPR_HI16RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 1;
        } else if (AMDGPU::AGPR_32RegClass.contains(Reg) ||
                   AMDGPU::AGPR_LO16RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 1;
        } else if (AMDGPU::SReg_64RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_64RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 2;
        } else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 2;
        } else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 2;
        } else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 3;
        } else if (AMDGPU::SReg_96RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 3;
        } else if (AMDGPU::AReg_96RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 3;
        } else if (AMDGPU::SReg_128RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_128RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 4;
        } else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 4;
        } else if (AMDGPU::AReg_128RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 4;
        } else if (AMDGPU::VReg_160RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 5;
        } else if (AMDGPU::SReg_160RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 5;
        } else if (AMDGPU::AReg_160RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 5;
        } else if (AMDGPU::VReg_192RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 6;
        } else if (AMDGPU::SReg_192RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 6;
        } else if (AMDGPU::AReg_192RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 6;
        } else if (AMDGPU::VReg_224RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 7;
        } else if (AMDGPU::SReg_224RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 7;
        } else if (AMDGPU::AReg_224RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 7;
        } else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_256RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 8;
        } else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 8;
        } else if (AMDGPU::AReg_256RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 8;
        } else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_512RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 16;
        } else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 16;
        } else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 16;
        } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 32;
        } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 32;
        } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 32;
        } else {
          llvm_unreachable("Unknown register class");
        }
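        // HWReg is the index of the first 32-bit register in the tuple, so the
        // highest index used is HWReg + Width - 1.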
        unsigned HWReg = TRI.getHWRegIndex(Reg);
        int MaxUsed = HWReg + Width - 1;
        if (IsSGPR) {
          MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
        } else if (IsAGPR) {
          MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR;
        } else {
          MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
        }
      }

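      // At call sites, merge in the callee's precomputed resource info. For
      // indirect or not-yet-analyzed callees, fall back to worst-case
      // assumptions and let propagateIndirectCallRegisterUsage() fix up the
      // register counts afterwards.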
      if (MI.isCall()) {
        // Pseudo used just to encode the underlying global. Is there a better
        // way to track this?

        const MachineOperand *CalleeOp =
            TII->getNamedOperand(MI, AMDGPU::OpName::callee);

        const Function *Callee = getCalleeFunction(*CalleeOp);
        DenseMap<const Function *, SIFunctionResourceInfo>::const_iterator I =
            CallGraphResourceInfo.end();

        // Avoid crashing on undefined behavior with an illegal call to a
        // kernel. If a callsite's calling convention doesn't match the
        // function's, it's undefined behavior. If the callsite calling
        // convention does match, that would have errored earlier.
        if (Callee && AMDGPU::isEntryFunctionCC(Callee->getCallingConv()))
          report_fatal_error("invalid call to entry function");

        bool IsIndirect = !Callee || Callee->isDeclaration();
        if (!IsIndirect)
          I = CallGraphResourceInfo.find(Callee);

        if (IsIndirect || I == CallGraphResourceInfo.end()) {
          CalleeFrameSize =
              std::max(CalleeFrameSize,
                       static_cast<uint64_t>(AssumedStackSizeForExternalCall));

          // Register usage of indirect calls gets handled later
          Info.UsesVCC = true;
          Info.UsesFlatScratch = ST.hasFlatAddressSpace();
          Info.HasDynamicallySizedStack = true;
          Info.HasIndirectCall = true;
        } else {
          // We force CodeGen to run in SCC order, so the callee's register
          // usage etc. should be the cumulative usage of all callees.
          MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR);
          MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR);
          MaxAGPR = std::max(I->second.NumAGPR - 1, MaxAGPR);
          CalleeFrameSize =
              std::max(I->second.PrivateSegmentSize, CalleeFrameSize);
          Info.UsesVCC |= I->second.UsesVCC;
          Info.UsesFlatScratch |= I->second.UsesFlatScratch;
          Info.HasDynamicallySizedStack |= I->second.HasDynamicallySizedStack;
          Info.HasRecursion |= I->second.HasRecursion;
          Info.HasIndirectCall |= I->second.HasIndirectCall;
        }

        // FIXME: Call site could have norecurse on it
        if (!Callee || !Callee->doesNotRecurse())
          Info.HasRecursion = true;
      }
    }
  }

  Info.NumExplicitSGPR = MaxSGPR + 1;
  Info.NumVGPR = MaxVGPR + 1;
  Info.NumAGPR = MaxAGPR + 1;
  Info.PrivateSegmentSize += CalleeFrameSize;

  return Info;
}

void AMDGPUResourceUsageAnalysis::propagateIndirectCallRegisterUsage() {
  // Collect the maximum number of registers from non-hardware-entrypoints.
  // All these functions are potential targets for indirect calls.
  int32_t NonKernelMaxSGPRs = 0;
  int32_t NonKernelMaxVGPRs = 0;
  int32_t NonKernelMaxAGPRs = 0;

  for (const auto &I : CallGraphResourceInfo) {
    if (!AMDGPU::isEntryFunctionCC(I.getFirst()->getCallingConv())) {
      auto &Info = I.getSecond();
      NonKernelMaxSGPRs = std::max(NonKernelMaxSGPRs, Info.NumExplicitSGPR);
      NonKernelMaxVGPRs = std::max(NonKernelMaxVGPRs, Info.NumVGPR);
      NonKernelMaxAGPRs = std::max(NonKernelMaxAGPRs, Info.NumAGPR);
    }
  }

  // Add register usage for functions with indirect calls.
  // For calls to unknown functions, we assume the maximum register usage of
  // all non-hardware-entrypoints in the current module.
  for (auto &I : CallGraphResourceInfo) {
    auto &Info = I.getSecond();
    if (Info.HasIndirectCall) {
      Info.NumExplicitSGPR = std::max(Info.NumExplicitSGPR, NonKernelMaxSGPRs);
      Info.NumVGPR = std::max(Info.NumVGPR, NonKernelMaxVGPRs);
      Info.NumAGPR = std::max(Info.NumAGPR, NonKernelMaxAGPRs);
    }
  }
}