//===- AMDGPUResourceUsageAnalysis.cpp ---- analysis of resources ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Analyzes how many registers and other resources are used by
/// functions.
///
/// The results of this analysis are used to fill the register usage, flat
/// usage, etc. into hardware registers.
///
/// The analysis takes callees into account. E.g. if a function A that needs 10
/// VGPRs calls a function B that needs 20 VGPRs, querying the VGPR usage of A
/// will return 20.
/// It is assumed that an indirect call can go into any function except
/// hardware-entrypoints. Therefore the register usage of functions with
/// indirect calls is estimated as the maximum of all non-entrypoint functions
/// in the module.
///
//===----------------------------------------------------------------------===//

#include "AMDGPUResourceUsageAnalysis.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;
using namespace llvm::AMDGPU;

#define DEBUG_TYPE "amdgpu-resource-usage"

char llvm::AMDGPUResourceUsageAnalysis::ID = 0;
char &llvm::AMDGPUResourceUsageAnalysisID = AMDGPUResourceUsageAnalysis::ID;

// We need to tell the runtime some amount ahead of time if we don't know the
// true stack size. Assume a smaller number if this is only due to dynamic /
// non-entry block allocas.
static cl::opt<uint32_t> AssumedStackSizeForExternalCall(
    "amdgpu-assume-external-call-stack-size",
    cl::desc("Assumed stack use of any external call (in bytes)"), cl::Hidden,
    cl::init(16384));

static cl::opt<uint32_t> AssumedStackSizeForDynamicSizeObjects(
    "amdgpu-assume-dynamic-stack-object-size",
    cl::desc("Assumed extra stack use if there are any "
             "variable sized objects (in bytes)"),
    cl::Hidden, cl::init(4096));

INITIALIZE_PASS(AMDGPUResourceUsageAnalysis, DEBUG_TYPE,
                "Function register usage analysis", true, true)

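// The callee operand of a call pseudo either refers directly to the Function
// being called, or is an immediate 0 when the callee is not statically known;
// the latter is treated as an indirect call below.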
static const Function *getCalleeFunction(const MachineOperand &Op) {
  if (Op.isImm()) {
    assert(Op.getImm() == 0);
    return nullptr;
  }

  return cast<Function>(Op.getGlobal());
}

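// Returns true if Reg has any use that is not an implicit operand of a FLAT
// instruction. Used to decide whether a function that only touches flat_scr
// implicitly on flat instructions really needs flat scratch initialization.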
static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,
                                  const SIInstrInfo &TII, unsigned Reg) {
  for (const MachineOperand &UseOp : MRI.reg_operands(Reg)) {
    if (!UseOp.isImplicit() || !TII.isFLAT(*UseOp.getParent()))
      return true;
  }

  return false;
}

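// The "total" SGPR count is the explicitly used SGPRs plus whatever extra
// SGPRs the subtarget reserves on top of them (VCC, and depending on the
// subtarget and target ID, the flat scratch and XNACK mask registers).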
int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumSGPRs(
    const GCNSubtarget &ST) const {
  return NumExplicitSGPR +
         IsaInfo::getNumExtraSGPRs(&ST, UsesVCC, UsesFlatScratch,
                                   ST.getTargetID().isXnackOnOrAny());
}

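// On GFX90A-style subtargets AGPRs are carved out of a unified register file
// directly after the VGPRs, aligned to a 4-register boundary; elsewhere the
// two files are separate and only the larger count matters. E.g. NumVGPR = 10
// and NumAGPR = 6 gives alignTo(10, 4) + 6 = 18 on GFX90A and max(10, 6) = 10
// otherwise.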
int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
    const GCNSubtarget &ST) const {
  if (ST.hasGFX90AInsts() && NumAGPR)
    return alignTo(NumVGPR, 4) + NumAGPR;
  return std::max(NumVGPR, NumAGPR);
}

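// CallGraphSCC passes visit the call graph bottom-up, so by the time a caller
// is analyzed here, CallGraphResourceInfo already holds the cumulative usage
// of every statically known callee.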
bool AMDGPUResourceUsageAnalysis::runOnSCC(CallGraphSCC &SCC) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  const TargetMachine &TM = TPC->getTM<TargetMachine>();
  bool HasIndirectCall = false;

  for (CallGraphNode *I : SCC) {
    Function *F = I->getFunction();
    if (!F || F->isDeclaration())
      continue;

    MachineModuleInfo &MMI =
        getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
    MachineFunction &MF = MMI.getOrCreateMachineFunction(*F);

    auto CI = CallGraphResourceInfo.insert(
        std::make_pair(&MF.getFunction(), SIFunctionResourceInfo()));
    SIFunctionResourceInfo &Info = CI.first->second;
    assert(CI.second && "should only be called once per function");
    Info = analyzeResourceUsage(MF, TM);
    HasIndirectCall |= Info.HasIndirectCall;
  }

  if (HasIndirectCall)
    propagateIndirectCallRegisterUsage();

  return false;
}

AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo
AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
    const MachineFunction &MF, const TargetMachine &TM) const {
  SIFunctionResourceInfo Info;

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
                         MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI) ||
                         MRI.isLiveIn(MFI->getPreloadedReg(
                             AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT));

  // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
  // instructions aren't used to access the scratch buffer. Inline assembly may
  // need it though.
  //
  // If we only have implicit uses of flat_scr on flat instructions, it is not
  // really needed.
  if (Info.UsesFlatScratch && !MFI->hasFlatScratchInit() &&
      (!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) {
    Info.UsesFlatScratch = false;
  }

  Info.PrivateSegmentSize = FrameInfo.getStackSize();

  // Assume a big number if there are any unknown sized objects.
  Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
  if (Info.HasDynamicallySizedStack)
    Info.PrivateSegmentSize += AssumedStackSizeForDynamicSizeObjects;

  if (MFI->isStackRealigned())
    Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value();

  Info.UsesVCC =
      MRI.isPhysRegUsed(AMDGPU::VCC_LO) || MRI.isPhysRegUsed(AMDGPU::VCC_HI);

  // If there are no calls, MachineRegisterInfo can tell us the used register
  // count easily.
  // A tail call isn't considered a call for MachineFrameInfo's purposes.
  if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
    MCPhysReg HighestVGPRReg = AMDGPU::NoRegister;
    for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) {
      if (MRI.isPhysRegUsed(Reg)) {
        HighestVGPRReg = Reg;
        break;
      }
    }

    if (ST.hasMAIInsts()) {
      MCPhysReg HighestAGPRReg = AMDGPU::NoRegister;
      for (MCPhysReg Reg : reverse(AMDGPU::AGPR_32RegClass.getRegisters())) {
        if (MRI.isPhysRegUsed(Reg)) {
          HighestAGPRReg = Reg;
          break;
        }
      }
      Info.NumAGPR = HighestAGPRReg == AMDGPU::NoRegister
                         ? 0
                         : TRI.getHWRegIndex(HighestAGPRReg) + 1;
    }

    MCPhysReg HighestSGPRReg = AMDGPU::NoRegister;
    for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) {
      if (MRI.isPhysRegUsed(Reg)) {
        HighestSGPRReg = Reg;
        break;
      }
    }

    // We found the maximum register index. They start at 0, so add one to get
    // the number of registers.
    Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister
                       ? 0
                       : TRI.getHWRegIndex(HighestVGPRReg) + 1;
    Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister
                               ? 0
                               : TRI.getHWRegIndex(HighestSGPRReg) + 1;

    return Info;
  }

  int32_t MaxVGPR = -1;
  int32_t MaxAGPR = -1;
  int32_t MaxSGPR = -1;
  uint64_t CalleeFrameSize = 0;

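  // With calls in the function, MRI's used-register information alone is not
  // enough: walk every operand of every instruction to find the highest
  // register index actually touched, and merge in callee usage at each call
  // site.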
  for (const MachineBasicBlock &MBB : MF) {
    for (const MachineInstr &MI : MBB) {
      // TODO: Check regmasks? Do they occur anywhere except calls?
      for (const MachineOperand &MO : MI.operands()) {
        unsigned Width = 0;
        bool IsSGPR = false;
        bool IsAGPR = false;

        if (!MO.isReg())
          continue;

        Register Reg = MO.getReg();
        switch (Reg) {
        case AMDGPU::EXEC:
        case AMDGPU::EXEC_LO:
        case AMDGPU::EXEC_HI:
        case AMDGPU::SCC:
        case AMDGPU::M0:
        case AMDGPU::M0_LO16:
        case AMDGPU::M0_HI16:
        case AMDGPU::SRC_SHARED_BASE:
        case AMDGPU::SRC_SHARED_LIMIT:
        case AMDGPU::SRC_PRIVATE_BASE:
        case AMDGPU::SRC_PRIVATE_LIMIT:
        case AMDGPU::SGPR_NULL:
        case AMDGPU::MODE:
          continue;

        case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
          llvm_unreachable("src_pops_exiting_wave_id should not be used");

        case AMDGPU::NoRegister:
          assert(MI.isDebugInstr() &&
                 "Instruction uses invalid noreg register");
          continue;

        case AMDGPU::VCC:
        case AMDGPU::VCC_LO:
        case AMDGPU::VCC_HI:
        case AMDGPU::VCC_LO_LO16:
        case AMDGPU::VCC_LO_HI16:
        case AMDGPU::VCC_HI_LO16:
        case AMDGPU::VCC_HI_HI16:
          Info.UsesVCC = true;
          continue;

        case AMDGPU::FLAT_SCR:
        case AMDGPU::FLAT_SCR_LO:
        case AMDGPU::FLAT_SCR_HI:
          continue;

        case AMDGPU::XNACK_MASK:
        case AMDGPU::XNACK_MASK_LO:
        case AMDGPU::XNACK_MASK_HI:
          llvm_unreachable("xnack_mask registers should not be used");

        case AMDGPU::LDS_DIRECT:
          llvm_unreachable("lds_direct register should not be used");

        case AMDGPU::TBA:
        case AMDGPU::TBA_LO:
        case AMDGPU::TBA_HI:
        case AMDGPU::TMA:
        case AMDGPU::TMA_LO:
        case AMDGPU::TMA_HI:
          llvm_unreachable("trap handler registers should not be used");

        case AMDGPU::SRC_VCCZ:
          llvm_unreachable("src_vccz register should not be used");

        case AMDGPU::SRC_EXECZ:
          llvm_unreachable("src_execz register should not be used");

        case AMDGPU::SRC_SCC:
          llvm_unreachable("src_scc register should not be used");

        default:
          break;
        }

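        // Classify the operand's register: which file it lives in (SGPR, VGPR
        // or AGPR) and how many 32-bit registers its register class spans.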
        if (AMDGPU::SReg_32RegClass.contains(Reg) ||
            AMDGPU::SReg_LO16RegClass.contains(Reg) ||
            AMDGPU::SGPR_HI16RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_32RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 1;
        } else if (AMDGPU::VGPR_32RegClass.contains(Reg) ||
                   AMDGPU::VGPR_LO16RegClass.contains(Reg) ||
                   AMDGPU::VGPR_HI16RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 1;
        } else if (AMDGPU::AGPR_32RegClass.contains(Reg) ||
                   AMDGPU::AGPR_LO16RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 1;
        } else if (AMDGPU::SReg_64RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_64RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 2;
        } else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 2;
        } else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 2;
        } else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 3;
        } else if (AMDGPU::SReg_96RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 3;
        } else if (AMDGPU::AReg_96RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 3;
        } else if (AMDGPU::SReg_128RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_128RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 4;
        } else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 4;
        } else if (AMDGPU::AReg_128RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 4;
        } else if (AMDGPU::VReg_160RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 5;
        } else if (AMDGPU::SReg_160RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 5;
        } else if (AMDGPU::AReg_160RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 5;
        } else if (AMDGPU::VReg_192RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 6;
        } else if (AMDGPU::SReg_192RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 6;
        } else if (AMDGPU::AReg_192RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 6;
        } else if (AMDGPU::VReg_224RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 7;
        } else if (AMDGPU::SReg_224RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 7;
        } else if (AMDGPU::AReg_224RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 7;
        } else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_256RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 8;
        } else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 8;
        } else if (AMDGPU::AReg_256RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 8;
        } else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_512RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 16;
        } else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 16;
        } else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 16;
        } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 32;
        } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 32;
        } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 32;
        } else {
          llvm_unreachable("Unknown register class");
        }
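        // The last 32-bit register covered by this operand is its first
        // hardware register index plus (Width - 1); track the maximum per
        // register file.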
        unsigned HWReg = TRI.getHWRegIndex(Reg);
        int MaxUsed = HWReg + Width - 1;
        if (IsSGPR) {
          MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
        } else if (IsAGPR) {
          MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR;
        } else {
          MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
        }
      }

      if (MI.isCall()) {
        // Pseudo used just to encode the underlying global. Is there a better
        // way to track this?

        const MachineOperand *CalleeOp =
            TII->getNamedOperand(MI, AMDGPU::OpName::callee);

        const Function *Callee = getCalleeFunction(*CalleeOp);
        DenseMap<const Function *, SIFunctionResourceInfo>::const_iterator I =
            CallGraphResourceInfo.end();

        // Avoid crashing on undefined behavior with an illegal call to a
        // kernel. If a callsite's calling convention doesn't match the
        // function's, it's undefined behavior. If the callsite calling
        // convention does match, that would have errored earlier.
        if (Callee && AMDGPU::isEntryFunctionCC(Callee->getCallingConv()))
          report_fatal_error("invalid call to entry function");

        bool IsIndirect = !Callee || Callee->isDeclaration();
        if (!IsIndirect)
          I = CallGraphResourceInfo.find(Callee);

        if (IsIndirect || I == CallGraphResourceInfo.end()) {
          CalleeFrameSize =
              std::max(CalleeFrameSize,
                       static_cast<uint64_t>(AssumedStackSizeForExternalCall));

          // Register usage of indirect calls gets handled later
          Info.UsesVCC = true;
          Info.UsesFlatScratch = ST.hasFlatAddressSpace();
          Info.HasDynamicallySizedStack = true;
          Info.HasIndirectCall = true;
        } else {
          // We force CodeGen to run in SCC order, so the callee's register
          // usage etc. should be the cumulative usage of all callees.
          MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR);
          MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR);
          MaxAGPR = std::max(I->second.NumAGPR - 1, MaxAGPR);
          CalleeFrameSize =
              std::max(I->second.PrivateSegmentSize, CalleeFrameSize);
          Info.UsesVCC |= I->second.UsesVCC;
          Info.UsesFlatScratch |= I->second.UsesFlatScratch;
          Info.HasDynamicallySizedStack |= I->second.HasDynamicallySizedStack;
          Info.HasRecursion |= I->second.HasRecursion;
          Info.HasIndirectCall |= I->second.HasIndirectCall;
        }

        // FIXME: Call site could have norecurse on it
        if (!Callee || !Callee->doesNotRecurse())
          Info.HasRecursion = true;
      }
    }
  }

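  // MaxSGPR/MaxVGPR/MaxAGPR hold the highest register index seen (or -1 if
  // none), so adding one converts them to counts. Callee frame sizes are
  // already cumulative, so adding the largest one makes PrivateSegmentSize
  // cover the deepest call chain reachable from here.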
  Info.NumExplicitSGPR = MaxSGPR + 1;
  Info.NumVGPR = MaxVGPR + 1;
  Info.NumAGPR = MaxAGPR + 1;
  Info.PrivateSegmentSize += CalleeFrameSize;

  return Info;
}

void AMDGPUResourceUsageAnalysis::propagateIndirectCallRegisterUsage() {
  // Collect the maximum number of registers from non-hardware-entrypoints.
  // All these functions are potential targets for indirect calls.
  int32_t NonKernelMaxSGPRs = 0;
  int32_t NonKernelMaxVGPRs = 0;
  int32_t NonKernelMaxAGPRs = 0;

  for (const auto &I : CallGraphResourceInfo) {
    if (!AMDGPU::isEntryFunctionCC(I.getFirst()->getCallingConv())) {
      auto &Info = I.getSecond();
      NonKernelMaxSGPRs = std::max(NonKernelMaxSGPRs, Info.NumExplicitSGPR);
      NonKernelMaxVGPRs = std::max(NonKernelMaxVGPRs, Info.NumVGPR);
      NonKernelMaxAGPRs = std::max(NonKernelMaxAGPRs, Info.NumAGPR);
    }
  }

  // Add register usage for functions with indirect calls.
  // For calls to unknown functions, we assume the maximum register usage of
  // all non-hardware-entrypoints in the current module.
  for (auto &I : CallGraphResourceInfo) {
    auto &Info = I.getSecond();
    if (Info.HasIndirectCall) {
      Info.NumExplicitSGPR = std::max(Info.NumExplicitSGPR, NonKernelMaxSGPRs);
      Info.NumVGPR = std::max(Info.NumVGPR, NonKernelMaxVGPRs);
      Info.NumAGPR = std::max(Info.NumAGPR, NonKernelMaxAGPRs);
    }
  }
}