173471bf0Spatrick //===- AMDGPUResourceUsageAnalysis.h ---- analysis of resources -----------===//
273471bf0Spatrick //
373471bf0Spatrick // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
473471bf0Spatrick // See https://llvm.org/LICENSE.txt for license information.
573471bf0Spatrick // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
673471bf0Spatrick //
773471bf0Spatrick //===----------------------------------------------------------------------===//
873471bf0Spatrick //
973471bf0Spatrick /// \file
1073471bf0Spatrick /// \brief Analyzes how many registers and other resources are used by
1173471bf0Spatrick /// functions.
1273471bf0Spatrick ///
1373471bf0Spatrick /// The results of this analysis are used to fill the register usage, flat
1473471bf0Spatrick /// usage, etc. into hardware registers.
1573471bf0Spatrick ///
1673471bf0Spatrick /// The analysis takes callees into account. E.g. if a function A that needs 10
1773471bf0Spatrick /// VGPRs calls a function B that needs 20 VGPRs, querying the VGPR usage of A
1873471bf0Spatrick /// will return 20.
1973471bf0Spatrick /// It is assumed that an indirect call can go into any function except
2073471bf0Spatrick /// hardware-entrypoints. Therefore the register usage of functions with
2173471bf0Spatrick /// indirect calls is estimated as the maximum of all non-entrypoint functions
2273471bf0Spatrick /// in the module.
2373471bf0Spatrick ///
2473471bf0Spatrick //===----------------------------------------------------------------------===//
2573471bf0Spatrick
2673471bf0Spatrick #include "AMDGPUResourceUsageAnalysis.h"
2773471bf0Spatrick #include "AMDGPU.h"
2873471bf0Spatrick #include "GCNSubtarget.h"
2973471bf0Spatrick #include "SIMachineFunctionInfo.h"
30*d415bd75Srobert #include "llvm/ADT/PostOrderIterator.h"
3173471bf0Spatrick #include "llvm/Analysis/CallGraph.h"
32*d415bd75Srobert #include "llvm/CodeGen/MachineFrameInfo.h"
3373471bf0Spatrick #include "llvm/CodeGen/TargetPassConfig.h"
34*d415bd75Srobert #include "llvm/IR/GlobalAlias.h"
35*d415bd75Srobert #include "llvm/IR/GlobalValue.h"
3673471bf0Spatrick #include "llvm/Target/TargetMachine.h"
3773471bf0Spatrick
3873471bf0Spatrick using namespace llvm;
3973471bf0Spatrick using namespace llvm::AMDGPU;
4073471bf0Spatrick
4173471bf0Spatrick #define DEBUG_TYPE "amdgpu-resource-usage"
4273471bf0Spatrick
// Pass identification; the address of ID is what uniquely identifies the pass.
char llvm::AMDGPUResourceUsageAnalysis::ID = 0;
char &llvm::AMDGPUResourceUsageAnalysisID = AMDGPUResourceUsageAnalysis::ID;

// In code object v4 and older, we need to tell the runtime some amount ahead of
// time if we don't know the true stack size. Assume a smaller number if this is
// only due to dynamic / non-entry block allocas.
static cl::opt<uint32_t> AssumedStackSizeForExternalCall(
    "amdgpu-assume-external-call-stack-size",
    cl::desc("Assumed stack use of any external call (in bytes)"), cl::Hidden,
    cl::init(16384));

// Extra stack budget charged to any function containing variable-sized stack
// objects, since their true size cannot be known at compile time.
static cl::opt<uint32_t> AssumedStackSizeForDynamicSizeObjects(
    "amdgpu-assume-dynamic-stack-object-size",
    cl::desc("Assumed extra stack use if there are any "
             "variable sized objects (in bytes)"),
    cl::Hidden, cl::init(4096));

INITIALIZE_PASS(AMDGPUResourceUsageAnalysis, DEBUG_TYPE,
                "Function register usage analysis", true, true)
6273471bf0Spatrick
getCalleeFunction(const MachineOperand & Op)6373471bf0Spatrick static const Function *getCalleeFunction(const MachineOperand &Op) {
6473471bf0Spatrick if (Op.isImm()) {
6573471bf0Spatrick assert(Op.getImm() == 0);
6673471bf0Spatrick return nullptr;
6773471bf0Spatrick }
68*d415bd75Srobert if (auto *GA = dyn_cast<GlobalAlias>(Op.getGlobal()))
69*d415bd75Srobert return cast<Function>(GA->getOperand(0));
7073471bf0Spatrick return cast<Function>(Op.getGlobal());
7173471bf0Spatrick }
7273471bf0Spatrick
hasAnyNonFlatUseOfReg(const MachineRegisterInfo & MRI,const SIInstrInfo & TII,unsigned Reg)7373471bf0Spatrick static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,
7473471bf0Spatrick const SIInstrInfo &TII, unsigned Reg) {
7573471bf0Spatrick for (const MachineOperand &UseOp : MRI.reg_operands(Reg)) {
7673471bf0Spatrick if (!UseOp.isImplicit() || !TII.isFLAT(*UseOp.getParent()))
7773471bf0Spatrick return true;
7873471bf0Spatrick }
7973471bf0Spatrick
8073471bf0Spatrick return false;
8173471bf0Spatrick }
8273471bf0Spatrick
getTotalNumSGPRs(const GCNSubtarget & ST) const8373471bf0Spatrick int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumSGPRs(
8473471bf0Spatrick const GCNSubtarget &ST) const {
8573471bf0Spatrick return NumExplicitSGPR +
8673471bf0Spatrick IsaInfo::getNumExtraSGPRs(&ST, UsesVCC, UsesFlatScratch,
8773471bf0Spatrick ST.getTargetID().isXnackOnOrAny());
8873471bf0Spatrick }
8973471bf0Spatrick
getTotalNumVGPRs(const GCNSubtarget & ST,int32_t ArgNumAGPR,int32_t ArgNumVGPR) const9073471bf0Spatrick int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
91*d415bd75Srobert const GCNSubtarget &ST, int32_t ArgNumAGPR, int32_t ArgNumVGPR) const {
92*d415bd75Srobert return AMDGPU::getTotalNumVGPRs(ST.hasGFX90AInsts(), ArgNumAGPR, ArgNumVGPR);
9373471bf0Spatrick }
9473471bf0Spatrick
getTotalNumVGPRs(const GCNSubtarget & ST) const95*d415bd75Srobert int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
96*d415bd75Srobert const GCNSubtarget &ST) const {
97*d415bd75Srobert return getTotalNumVGPRs(ST, NumAGPR, NumVGPR);
98*d415bd75Srobert }
99*d415bd75Srobert
runOnModule(Module & M)100*d415bd75Srobert bool AMDGPUResourceUsageAnalysis::runOnModule(Module &M) {
10173471bf0Spatrick auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
10273471bf0Spatrick if (!TPC)
10373471bf0Spatrick return false;
10473471bf0Spatrick
105*d415bd75Srobert MachineModuleInfo &MMI = getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
10673471bf0Spatrick const TargetMachine &TM = TPC->getTM<TargetMachine>();
10773471bf0Spatrick bool HasIndirectCall = false;
10873471bf0Spatrick
109*d415bd75Srobert CallGraph CG = CallGraph(M);
110*d415bd75Srobert auto End = po_end(&CG);
111*d415bd75Srobert
112*d415bd75Srobert // By default, for code object v5 and later, track only the minimum scratch
113*d415bd75Srobert // size
114*d415bd75Srobert if (AMDGPU::getAmdhsaCodeObjectVersion() >= 5) {
115*d415bd75Srobert if (!AssumedStackSizeForDynamicSizeObjects.getNumOccurrences())
116*d415bd75Srobert AssumedStackSizeForDynamicSizeObjects = 0;
117*d415bd75Srobert if (!AssumedStackSizeForExternalCall.getNumOccurrences())
118*d415bd75Srobert AssumedStackSizeForExternalCall = 0;
119*d415bd75Srobert }
120*d415bd75Srobert
121*d415bd75Srobert for (auto IT = po_begin(&CG); IT != End; ++IT) {
122*d415bd75Srobert Function *F = IT->getFunction();
12373471bf0Spatrick if (!F || F->isDeclaration())
12473471bf0Spatrick continue;
12573471bf0Spatrick
126*d415bd75Srobert MachineFunction *MF = MMI.getMachineFunction(*F);
127*d415bd75Srobert assert(MF && "function must have been generated already");
12873471bf0Spatrick
129*d415bd75Srobert auto CI =
130*d415bd75Srobert CallGraphResourceInfo.insert(std::pair(F, SIFunctionResourceInfo()));
13173471bf0Spatrick SIFunctionResourceInfo &Info = CI.first->second;
13273471bf0Spatrick assert(CI.second && "should only be called once per function");
133*d415bd75Srobert Info = analyzeResourceUsage(*MF, TM);
134*d415bd75Srobert HasIndirectCall |= Info.HasIndirectCall;
135*d415bd75Srobert }
136*d415bd75Srobert
137*d415bd75Srobert // It's possible we have unreachable functions in the module which weren't
138*d415bd75Srobert // visited by the PO traversal. Make sure we have some resource counts to
139*d415bd75Srobert // report.
140*d415bd75Srobert for (const auto &IT : CG) {
141*d415bd75Srobert const Function *F = IT.first;
142*d415bd75Srobert if (!F || F->isDeclaration())
143*d415bd75Srobert continue;
144*d415bd75Srobert
145*d415bd75Srobert auto CI =
146*d415bd75Srobert CallGraphResourceInfo.insert(std::pair(F, SIFunctionResourceInfo()));
147*d415bd75Srobert if (!CI.second) // Skip already visited functions
148*d415bd75Srobert continue;
149*d415bd75Srobert
150*d415bd75Srobert SIFunctionResourceInfo &Info = CI.first->second;
151*d415bd75Srobert MachineFunction *MF = MMI.getMachineFunction(*F);
152*d415bd75Srobert assert(MF && "function must have been generated already");
153*d415bd75Srobert Info = analyzeResourceUsage(*MF, TM);
15473471bf0Spatrick HasIndirectCall |= Info.HasIndirectCall;
15573471bf0Spatrick }
15673471bf0Spatrick
15773471bf0Spatrick if (HasIndirectCall)
15873471bf0Spatrick propagateIndirectCallRegisterUsage();
15973471bf0Spatrick
16073471bf0Spatrick return false;
16173471bf0Spatrick }
16273471bf0Spatrick
/// Compute the resource usage (SGPRs/VGPRs/AGPRs, stack, VCC, flat scratch)
/// of a single machine function, folding in the recorded usage of its direct
/// callees. Assumes callees reachable through the call graph have already
/// been analyzed (callers are visited after callees in runOnModule).
AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo
AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
    const MachineFunction &MF, const TargetMachine &TM) const {
  SIFunctionResourceInfo Info;

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  // Flat scratch counts as used if either half of the FLAT_SCR pair is
  // physically used, or the preloaded flat-scratch-init argument is live-in.
  Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
                         MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI) ||
                         MRI.isLiveIn(MFI->getPreloadedReg(
                             AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT));

  // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
  // instructions aren't used to access the scratch buffer. Inline assembly may
  // need it though.
  //
  // If we only have implicit uses of flat_scr on flat instructions, it is not
  // really needed.
  if (Info.UsesFlatScratch && !MFI->hasFlatScratchInit() &&
      (!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) {
    Info.UsesFlatScratch = false;
  }

  Info.PrivateSegmentSize = FrameInfo.getStackSize();

  // Assume a big number if there are any unknown sized objects.
  Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
  if (Info.HasDynamicallySizedStack)
    Info.PrivateSegmentSize += AssumedStackSizeForDynamicSizeObjects;

  // Realignment may consume up to an extra max-alignment's worth of stack.
  if (MFI->isStackRealigned())
    Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value();

  Info.UsesVCC =
      MRI.isPhysRegUsed(AMDGPU::VCC_LO) || MRI.isPhysRegUsed(AMDGPU::VCC_HI);

  // If there are no calls, MachineRegisterInfo can tell us the used register
  // count easily.
  // A tail call isn't considered a call for MachineFrameInfo's purposes.
  if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
    // Scan from the top register downward; the first used register is the
    // highest one.
    MCPhysReg HighestVGPRReg = AMDGPU::NoRegister;
    for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) {
      if (MRI.isPhysRegUsed(Reg)) {
        HighestVGPRReg = Reg;
        break;
      }
    }

    // AGPRs only exist on subtargets with MAI instructions.
    if (ST.hasMAIInsts()) {
      MCPhysReg HighestAGPRReg = AMDGPU::NoRegister;
      for (MCPhysReg Reg : reverse(AMDGPU::AGPR_32RegClass.getRegisters())) {
        if (MRI.isPhysRegUsed(Reg)) {
          HighestAGPRReg = Reg;
          break;
        }
      }
      Info.NumAGPR = HighestAGPRReg == AMDGPU::NoRegister
                         ? 0
                         : TRI.getHWRegIndex(HighestAGPRReg) + 1;
    }

    MCPhysReg HighestSGPRReg = AMDGPU::NoRegister;
    for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) {
      if (MRI.isPhysRegUsed(Reg)) {
        HighestSGPRReg = Reg;
        break;
      }
    }

    // We found the maximum register index. They start at 0, so add one to get
    // the number of registers.
    Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister
                       ? 0
                       : TRI.getHWRegIndex(HighestVGPRReg) + 1;
    Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister
                               ? 0
                               : TRI.getHWRegIndex(HighestSGPRReg) + 1;

    return Info;
  }

  // Slow path: the function makes calls, so walk every instruction operand,
  // classify each register, and track the highest index used per register
  // file. Callee usage is merged in at the call sites below.
  int32_t MaxVGPR = -1;
  int32_t MaxAGPR = -1;
  int32_t MaxSGPR = -1;
  uint64_t CalleeFrameSize = 0;

  for (const MachineBasicBlock &MBB : MF) {
    for (const MachineInstr &MI : MBB) {
      // TODO: Check regmasks? Do they occur anywhere except calls?
      for (const MachineOperand &MO : MI.operands()) {
        unsigned Width = 0;
        bool IsSGPR = false;
        bool IsAGPR = false;

        if (!MO.isReg())
          continue;

        Register Reg = MO.getReg();
        // Special registers either don't count against the budget, set a
        // specific flag, or are invalid to see here at all.
        switch (Reg) {
        case AMDGPU::EXEC:
        case AMDGPU::EXEC_LO:
        case AMDGPU::EXEC_HI:
        case AMDGPU::SCC:
        case AMDGPU::M0:
        case AMDGPU::M0_LO16:
        case AMDGPU::M0_HI16:
        case AMDGPU::SRC_SHARED_BASE_LO:
        case AMDGPU::SRC_SHARED_BASE:
        case AMDGPU::SRC_SHARED_LIMIT_LO:
        case AMDGPU::SRC_SHARED_LIMIT:
        case AMDGPU::SRC_PRIVATE_BASE_LO:
        case AMDGPU::SRC_PRIVATE_BASE:
        case AMDGPU::SRC_PRIVATE_LIMIT_LO:
        case AMDGPU::SRC_PRIVATE_LIMIT:
        case AMDGPU::SGPR_NULL:
        case AMDGPU::SGPR_NULL64:
        case AMDGPU::MODE:
          continue;

        case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
          llvm_unreachable("src_pops_exiting_wave_id should not be used");

        case AMDGPU::NoRegister:
          assert(MI.isDebugInstr() &&
                 "Instruction uses invalid noreg register");
          continue;

        case AMDGPU::VCC:
        case AMDGPU::VCC_LO:
        case AMDGPU::VCC_HI:
        case AMDGPU::VCC_LO_LO16:
        case AMDGPU::VCC_LO_HI16:
        case AMDGPU::VCC_HI_LO16:
        case AMDGPU::VCC_HI_HI16:
          Info.UsesVCC = true;
          continue;

        case AMDGPU::FLAT_SCR:
        case AMDGPU::FLAT_SCR_LO:
        case AMDGPU::FLAT_SCR_HI:
          continue;

        case AMDGPU::XNACK_MASK:
        case AMDGPU::XNACK_MASK_LO:
        case AMDGPU::XNACK_MASK_HI:
          llvm_unreachable("xnack_mask registers should not be used");

        case AMDGPU::LDS_DIRECT:
          llvm_unreachable("lds_direct register should not be used");

        case AMDGPU::TBA:
        case AMDGPU::TBA_LO:
        case AMDGPU::TBA_HI:
        case AMDGPU::TMA:
        case AMDGPU::TMA_LO:
        case AMDGPU::TMA_HI:
          llvm_unreachable("trap handler registers should not be used");

        case AMDGPU::SRC_VCCZ:
          llvm_unreachable("src_vccz register should not be used");

        case AMDGPU::SRC_EXECZ:
          llvm_unreachable("src_execz register should not be used");

        case AMDGPU::SRC_SCC:
          llvm_unreachable("src_scc register should not be used");

        default:
          break;
        }

        // Classify the register: which file (SGPR/VGPR/AGPR) it belongs to
        // and how many 32-bit registers it spans.
        if (AMDGPU::SReg_32RegClass.contains(Reg) ||
            AMDGPU::SReg_LO16RegClass.contains(Reg) ||
            AMDGPU::SGPR_HI16RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_32RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 1;
        } else if (AMDGPU::VGPR_32RegClass.contains(Reg) ||
                   AMDGPU::VGPR_LO16RegClass.contains(Reg) ||
                   AMDGPU::VGPR_HI16RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 1;
        } else if (AMDGPU::AGPR_32RegClass.contains(Reg) ||
                   AMDGPU::AGPR_LO16RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 1;
        } else if (AMDGPU::SReg_64RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_64RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 2;
        } else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 2;
        } else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 2;
        } else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 3;
        } else if (AMDGPU::SReg_96RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 3;
        } else if (AMDGPU::AReg_96RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 3;
        } else if (AMDGPU::SReg_128RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_128RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 4;
        } else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 4;
        } else if (AMDGPU::AReg_128RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 4;
        } else if (AMDGPU::VReg_160RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 5;
        } else if (AMDGPU::SReg_160RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 5;
        } else if (AMDGPU::AReg_160RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 5;
        } else if (AMDGPU::VReg_192RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 6;
        } else if (AMDGPU::SReg_192RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 6;
        } else if (AMDGPU::AReg_192RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 6;
        } else if (AMDGPU::VReg_224RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 7;
        } else if (AMDGPU::SReg_224RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 7;
        } else if (AMDGPU::AReg_224RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 7;
        } else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_256RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 8;
        } else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 8;
        } else if (AMDGPU::AReg_256RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 8;
        } else if (AMDGPU::VReg_288RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 9;
        } else if (AMDGPU::SReg_288RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 9;
        } else if (AMDGPU::AReg_288RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 9;
        } else if (AMDGPU::VReg_320RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 10;
        } else if (AMDGPU::SReg_320RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 10;
        } else if (AMDGPU::AReg_320RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 10;
        } else if (AMDGPU::VReg_352RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 11;
        } else if (AMDGPU::SReg_352RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 11;
        } else if (AMDGPU::AReg_352RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 11;
        } else if (AMDGPU::VReg_384RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 12;
        } else if (AMDGPU::SReg_384RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 12;
        } else if (AMDGPU::AReg_384RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 12;
        } else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_512RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 16;
        } else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 16;
        } else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 16;
        } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 32;
        } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 32;
        } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 32;
        } else {
          llvm_unreachable("Unknown register class");
        }
        // The highest 32-bit lane touched is base index + width - 1.
        unsigned HWReg = TRI.getHWRegIndex(Reg);
        int MaxUsed = HWReg + Width - 1;
        if (IsSGPR) {
          MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
        } else if (IsAGPR) {
          MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR;
        } else {
          MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
        }
      }

      if (MI.isCall()) {
        // Pseudo used just to encode the underlying global. Is there a better
        // way to track this?

        const MachineOperand *CalleeOp =
            TII->getNamedOperand(MI, AMDGPU::OpName::callee);

        const Function *Callee = getCalleeFunction(*CalleeOp);
        DenseMap<const Function *, SIFunctionResourceInfo>::const_iterator I =
            CallGraphResourceInfo.end();

        // Avoid crashing on undefined behavior with an illegal call to a
        // kernel. If a callsite's calling convention doesn't match the
        // function's, it's undefined behavior. If the callsite calling
        // convention does match, that would have errored earlier.
        if (Callee && AMDGPU::isEntryFunctionCC(Callee->getCallingConv()))
          report_fatal_error("invalid call to entry function");

        // Indirect: unknown callee (immediate 0 operand) or only a
        // declaration is visible, so no resource info exists for it.
        bool IsIndirect = !Callee || Callee->isDeclaration();
        if (!IsIndirect)
          I = CallGraphResourceInfo.find(Callee);

        // FIXME: Call site could have norecurse on it
        if (!Callee || !Callee->doesNotRecurse()) {
          Info.HasRecursion = true;

          // TODO: If we happen to know there is no stack usage in the
          // callgraph, we don't need to assume an infinitely growing stack.
          if (!MI.isReturn()) {
            // We don't need to assume an unknown stack size for tail calls.

            // FIXME: This only benefits in the case where the kernel does not
            // directly call the tail called function. If a kernel directly
            // calls a tail recursive function, we'll assume maximum stack size
            // based on the regular call instruction.
            CalleeFrameSize =
                std::max(CalleeFrameSize,
                         static_cast<uint64_t>(AssumedStackSizeForExternalCall));
          }
        }

        if (IsIndirect || I == CallGraphResourceInfo.end()) {
          CalleeFrameSize =
              std::max(CalleeFrameSize,
                       static_cast<uint64_t>(AssumedStackSizeForExternalCall));

          // Register usage of indirect calls gets handled later
          Info.UsesVCC = true;
          Info.UsesFlatScratch = ST.hasFlatAddressSpace();
          Info.HasDynamicallySizedStack = true;
          Info.HasIndirectCall = true;
        } else {
          // We force CodeGen to run in SCC order, so the callee's register
          // usage etc. should be the cumulative usage of all callees.
          MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR);
          MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR);
          MaxAGPR = std::max(I->second.NumAGPR - 1, MaxAGPR);
          CalleeFrameSize =
              std::max(I->second.PrivateSegmentSize, CalleeFrameSize);
          Info.UsesVCC |= I->second.UsesVCC;
          Info.UsesFlatScratch |= I->second.UsesFlatScratch;
          Info.HasDynamicallySizedStack |= I->second.HasDynamicallySizedStack;
          Info.HasRecursion |= I->second.HasRecursion;
          Info.HasIndirectCall |= I->second.HasIndirectCall;
        }
      }
    }
  }

  // Indices are 0-based; convert highest index to a register count.
  Info.NumExplicitSGPR = MaxSGPR + 1;
  Info.NumVGPR = MaxVGPR + 1;
  Info.NumAGPR = MaxAGPR + 1;
  Info.PrivateSegmentSize += CalleeFrameSize;

  return Info;
}
58673471bf0Spatrick
propagateIndirectCallRegisterUsage()58773471bf0Spatrick void AMDGPUResourceUsageAnalysis::propagateIndirectCallRegisterUsage() {
58873471bf0Spatrick // Collect the maximum number of registers from non-hardware-entrypoints.
58973471bf0Spatrick // All these functions are potential targets for indirect calls.
59073471bf0Spatrick int32_t NonKernelMaxSGPRs = 0;
59173471bf0Spatrick int32_t NonKernelMaxVGPRs = 0;
59273471bf0Spatrick int32_t NonKernelMaxAGPRs = 0;
59373471bf0Spatrick
59473471bf0Spatrick for (const auto &I : CallGraphResourceInfo) {
59573471bf0Spatrick if (!AMDGPU::isEntryFunctionCC(I.getFirst()->getCallingConv())) {
59673471bf0Spatrick auto &Info = I.getSecond();
59773471bf0Spatrick NonKernelMaxSGPRs = std::max(NonKernelMaxSGPRs, Info.NumExplicitSGPR);
59873471bf0Spatrick NonKernelMaxVGPRs = std::max(NonKernelMaxVGPRs, Info.NumVGPR);
59973471bf0Spatrick NonKernelMaxAGPRs = std::max(NonKernelMaxAGPRs, Info.NumAGPR);
60073471bf0Spatrick }
60173471bf0Spatrick }
60273471bf0Spatrick
60373471bf0Spatrick // Add register usage for functions with indirect calls.
60473471bf0Spatrick // For calls to unknown functions, we assume the maximum register usage of
60573471bf0Spatrick // all non-hardware-entrypoints in the current module.
60673471bf0Spatrick for (auto &I : CallGraphResourceInfo) {
60773471bf0Spatrick auto &Info = I.getSecond();
60873471bf0Spatrick if (Info.HasIndirectCall) {
60973471bf0Spatrick Info.NumExplicitSGPR = std::max(Info.NumExplicitSGPR, NonKernelMaxSGPRs);
61073471bf0Spatrick Info.NumVGPR = std::max(Info.NumVGPR, NonKernelMaxVGPRs);
61173471bf0Spatrick Info.NumAGPR = std::max(Info.NumAGPR, NonKernelMaxAGPRs);
61273471bf0Spatrick }
61373471bf0Spatrick }
61473471bf0Spatrick }
615