//===- AMDGPUResourceUsageAnalysis.cpp -- analysis of resources -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief Analyzes how many registers and other resources are used by
/// functions.
///
/// The results of this analysis are used to fill the register usage, flat
/// usage, etc. into hardware registers.
///
/// The analysis takes callees into account. E.g. if a function A that needs 10
/// VGPRs calls a function B that needs 20 VGPRs, querying the VGPR usage of A
/// will return 20.
/// It is assumed that an indirect call can go into any function except
/// hardware-entrypoints. Therefore the register usage of functions with
/// indirect calls is estimated as the maximum of all non-entrypoint functions
/// in the module.
///
//===----------------------------------------------------------------------===//

#include "AMDGPUResourceUsageAnalysis.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;
using namespace llvm::AMDGPU;

#define DEBUG_TYPE "amdgpu-resource-usage"

char llvm::AMDGPUResourceUsageAnalysis::ID = 0;
char &llvm::AMDGPUResourceUsageAnalysisID = AMDGPUResourceUsageAnalysis::ID;

// In code object v4 and older, we need to tell the runtime some amount ahead
// of time if we don't know the true stack size. Assume a smaller number if
// this is only due to dynamic / non-entry block allocas.
static cl::opt<uint32_t> AssumedStackSizeForExternalCall(
    "amdgpu-assume-external-call-stack-size",
    cl::desc("Assumed stack use of any external call (in bytes)"), cl::Hidden,
    cl::init(16384));

static cl::opt<uint32_t> AssumedStackSizeForDynamicSizeObjects(
    "amdgpu-assume-dynamic-stack-object-size",
    cl::desc("Assumed extra stack use if there are any "
             "variable sized objects (in bytes)"),
    cl::Hidden, cl::init(4096));

INITIALIZE_PASS(AMDGPUResourceUsageAnalysis, DEBUG_TYPE,
                "Function register usage analysis", true, true)

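// Resolve the callee operand of a call pseudo to the underlying Function,
// looking through a single level of GlobalAlias. An immediate operand
// (always 0) denotes an unknown callee and yields nullptr.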
static const Function *getCalleeFunction(const MachineOperand &Op) {
  if (Op.isImm()) {
    assert(Op.getImm() == 0);
    return nullptr;
  }
  if (auto *GA = dyn_cast<GlobalAlias>(Op.getGlobal()))
    return cast<Function>(GA->getOperand(0));
  return cast<Function>(Op.getGlobal());
}

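// Return true if Reg has any use that is not an implicit operand of a FLAT
// instruction; implicit uses of flat_scr on FLAT instructions alone do not
// make the register genuinely required.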
static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,
                                  const SIInstrInfo &TII, unsigned Reg) {
  for (const MachineOperand &UseOp : MRI.reg_operands(Reg)) {
    if (!UseOp.isImplicit() || !TII.isFLAT(*UseOp.getParent()))
      return true;
  }

  return false;
}

int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumSGPRs(
    const GCNSubtarget &ST) const {
  return NumExplicitSGPR +
         IsaInfo::getNumExtraSGPRs(&ST, UsesVCC, UsesFlatScratch,
                                   ST.getTargetID().isXnackOnOrAny());
}

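// On subtargets with gfx90a instructions, AGPRs and VGPRs are allocated from
// a unified register file, so the combined total is not simply
// max(NumVGPR, NumAGPR); AMDGPU::getTotalNumVGPRs accounts for this.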
int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
    const GCNSubtarget &ST, int32_t ArgNumAGPR, int32_t ArgNumVGPR) const {
  return AMDGPU::getTotalNumVGPRs(ST.hasGFX90AInsts(), ArgNumAGPR, ArgNumVGPR);
}

int32_t AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo::getTotalNumVGPRs(
    const GCNSubtarget &ST) const {
  return getTotalNumVGPRs(ST, NumAGPR, NumVGPR);
}

bool AMDGPUResourceUsageAnalysis::runOnModule(Module &M) {
  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  MachineModuleInfo &MMI = getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
  const TargetMachine &TM = TPC->getTM<TargetMachine>();
  bool HasIndirectCall = false;

  CallGraph CG = CallGraph(M);
  auto End = po_end(&CG);

  // By default, for code object v5 and later, track only the minimum scratch
  // size.
  if (AMDGPU::getAmdhsaCodeObjectVersion() >= 5) {
    if (!AssumedStackSizeForDynamicSizeObjects.getNumOccurrences())
      AssumedStackSizeForDynamicSizeObjects = 0;
    if (!AssumedStackSizeForExternalCall.getNumOccurrences())
      AssumedStackSizeForExternalCall = 0;
  }

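  // Visit the call graph in post order so callees are analyzed before their
  // callers; each caller can then fold in its callees' already-computed
  // resource usage.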
  for (auto IT = po_begin(&CG); IT != End; ++IT) {
    Function *F = IT->getFunction();
    if (!F || F->isDeclaration())
      continue;

    MachineFunction *MF = MMI.getMachineFunction(*F);
    assert(MF && "function must have been generated already");

    auto CI =
        CallGraphResourceInfo.insert(std::pair(F, SIFunctionResourceInfo()));
    SIFunctionResourceInfo &Info = CI.first->second;
    assert(CI.second && "should only be called once per function");
    Info = analyzeResourceUsage(*MF, TM);
    HasIndirectCall |= Info.HasIndirectCall;
  }

  // It's possible we have unreachable functions in the module which weren't
  // visited by the PO traversal. Make sure we have some resource counts to
  // report.
  for (const auto &IT : CG) {
    const Function *F = IT.first;
    if (!F || F->isDeclaration())
      continue;

    auto CI =
        CallGraphResourceInfo.insert(std::pair(F, SIFunctionResourceInfo()));
    if (!CI.second) // Skip already visited functions
      continue;

    SIFunctionResourceInfo &Info = CI.first->second;
    MachineFunction *MF = MMI.getMachineFunction(*F);
    assert(MF && "function must have been generated already");
    Info = analyzeResourceUsage(*MF, TM);
    HasIndirectCall |= Info.HasIndirectCall;
  }

  if (HasIndirectCall)
    propagateIndirectCallRegisterUsage();

  return false;
}

AMDGPUResourceUsageAnalysis::SIFunctionResourceInfo
AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
    const MachineFunction &MF, const TargetMachine &TM) const {
  SIFunctionResourceInfo Info;

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
                         MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI) ||
                         MRI.isLiveIn(MFI->getPreloadedReg(
                             AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT));

  // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
  // instructions aren't used to access the scratch buffer. Inline assembly may
  // need it though.
  //
  // If we only have implicit uses of flat_scr on flat instructions, it is not
  // really needed.
  if (Info.UsesFlatScratch && !MFI->hasFlatScratchInit() &&
      (!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) &&
       !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) {
    Info.UsesFlatScratch = false;
  }

  Info.PrivateSegmentSize = FrameInfo.getStackSize();

  // Assume a big number if there are any unknown sized objects.
  Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
  if (Info.HasDynamicallySizedStack)
    Info.PrivateSegmentSize += AssumedStackSizeForDynamicSizeObjects;

  if (MFI->isStackRealigned())
    Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value();

  Info.UsesVCC =
      MRI.isPhysRegUsed(AMDGPU::VCC_LO) || MRI.isPhysRegUsed(AMDGPU::VCC_HI);

  // If there are no calls, MachineRegisterInfo can tell us the used register
  // count easily.
  // A tail call isn't considered a call for MachineFrameInfo's purposes.
  if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) {
    MCPhysReg HighestVGPRReg = AMDGPU::NoRegister;
    for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) {
      if (MRI.isPhysRegUsed(Reg)) {
        HighestVGPRReg = Reg;
        break;
      }
    }

    if (ST.hasMAIInsts()) {
      MCPhysReg HighestAGPRReg = AMDGPU::NoRegister;
      for (MCPhysReg Reg : reverse(AMDGPU::AGPR_32RegClass.getRegisters())) {
        if (MRI.isPhysRegUsed(Reg)) {
          HighestAGPRReg = Reg;
          break;
        }
      }
      Info.NumAGPR = HighestAGPRReg == AMDGPU::NoRegister
                         ? 0
                         : TRI.getHWRegIndex(HighestAGPRReg) + 1;
    }

    MCPhysReg HighestSGPRReg = AMDGPU::NoRegister;
    for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) {
      if (MRI.isPhysRegUsed(Reg)) {
        HighestSGPRReg = Reg;
        break;
      }
    }

    // We found the maximum register index. They start at 0, so add one to get
    // the number of registers.
    Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister
                       ? 0
                       : TRI.getHWRegIndex(HighestVGPRReg) + 1;
    Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister
                               ? 0
                               : TRI.getHWRegIndex(HighestSGPRReg) + 1;

    return Info;
  }

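  // The function makes calls, so scan every register operand of every
  // instruction, tracking the highest register index touched per bank
  // (SGPR/VGPR/AGPR) and the worst-case frame size among callees.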
  int32_t MaxVGPR = -1;
  int32_t MaxAGPR = -1;
  int32_t MaxSGPR = -1;
  uint64_t CalleeFrameSize = 0;

  for (const MachineBasicBlock &MBB : MF) {
    for (const MachineInstr &MI : MBB) {
      // TODO: Check regmasks? Do they occur anywhere except calls?
      for (const MachineOperand &MO : MI.operands()) {
        unsigned Width = 0;
        bool IsSGPR = false;
        bool IsAGPR = false;

        if (!MO.isReg())
          continue;

        Register Reg = MO.getReg();
        switch (Reg) {
        case AMDGPU::EXEC:
        case AMDGPU::EXEC_LO:
        case AMDGPU::EXEC_HI:
        case AMDGPU::SCC:
        case AMDGPU::M0:
        case AMDGPU::M0_LO16:
        case AMDGPU::M0_HI16:
        case AMDGPU::SRC_SHARED_BASE_LO:
        case AMDGPU::SRC_SHARED_BASE:
        case AMDGPU::SRC_SHARED_LIMIT_LO:
        case AMDGPU::SRC_SHARED_LIMIT:
        case AMDGPU::SRC_PRIVATE_BASE_LO:
        case AMDGPU::SRC_PRIVATE_BASE:
        case AMDGPU::SRC_PRIVATE_LIMIT_LO:
        case AMDGPU::SRC_PRIVATE_LIMIT:
        case AMDGPU::SGPR_NULL:
        case AMDGPU::SGPR_NULL64:
        case AMDGPU::MODE:
          continue;

        case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
          llvm_unreachable("src_pops_exiting_wave_id should not be used");

        case AMDGPU::NoRegister:
          assert(MI.isDebugInstr() &&
                 "Instruction uses invalid noreg register");
          continue;

        case AMDGPU::VCC:
        case AMDGPU::VCC_LO:
        case AMDGPU::VCC_HI:
        case AMDGPU::VCC_LO_LO16:
        case AMDGPU::VCC_LO_HI16:
        case AMDGPU::VCC_HI_LO16:
        case AMDGPU::VCC_HI_HI16:
          Info.UsesVCC = true;
          continue;

        case AMDGPU::FLAT_SCR:
        case AMDGPU::FLAT_SCR_LO:
        case AMDGPU::FLAT_SCR_HI:
          continue;

        case AMDGPU::XNACK_MASK:
        case AMDGPU::XNACK_MASK_LO:
        case AMDGPU::XNACK_MASK_HI:
          llvm_unreachable("xnack_mask registers should not be used");

        case AMDGPU::LDS_DIRECT:
          llvm_unreachable("lds_direct register should not be used");

        case AMDGPU::TBA:
        case AMDGPU::TBA_LO:
        case AMDGPU::TBA_HI:
        case AMDGPU::TMA:
        case AMDGPU::TMA_LO:
        case AMDGPU::TMA_HI:
          llvm_unreachable("trap handler registers should not be used");

        case AMDGPU::SRC_VCCZ:
          llvm_unreachable("src_vccz register should not be used");

        case AMDGPU::SRC_EXECZ:
          llvm_unreachable("src_execz register should not be used");

        case AMDGPU::SRC_SCC:
          llvm_unreachable("src_scc register should not be used");

        default:
          break;
        }

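        // Classify the register by register class to determine the bank it
        // lives in and how many consecutive 32-bit registers it spans (Width).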
        if (AMDGPU::SReg_32RegClass.contains(Reg) ||
            AMDGPU::SReg_LO16RegClass.contains(Reg) ||
            AMDGPU::SGPR_HI16RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_32RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 1;
        } else if (AMDGPU::VGPR_32RegClass.contains(Reg) ||
                   AMDGPU::VGPR_LO16RegClass.contains(Reg) ||
                   AMDGPU::VGPR_HI16RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 1;
        } else if (AMDGPU::AGPR_32RegClass.contains(Reg) ||
                   AMDGPU::AGPR_LO16RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 1;
        } else if (AMDGPU::SReg_64RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_64RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 2;
        } else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 2;
        } else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 2;
        } else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 3;
        } else if (AMDGPU::SReg_96RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 3;
        } else if (AMDGPU::AReg_96RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 3;
        } else if (AMDGPU::SReg_128RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_128RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 4;
        } else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 4;
        } else if (AMDGPU::AReg_128RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 4;
        } else if (AMDGPU::VReg_160RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 5;
        } else if (AMDGPU::SReg_160RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 5;
        } else if (AMDGPU::AReg_160RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 5;
        } else if (AMDGPU::VReg_192RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 6;
        } else if (AMDGPU::SReg_192RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 6;
        } else if (AMDGPU::AReg_192RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 6;
        } else if (AMDGPU::VReg_224RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 7;
        } else if (AMDGPU::SReg_224RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 7;
        } else if (AMDGPU::AReg_224RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 7;
        } else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_256RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 8;
        } else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 8;
        } else if (AMDGPU::AReg_256RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 8;
        } else if (AMDGPU::VReg_288RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 9;
        } else if (AMDGPU::SReg_288RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 9;
        } else if (AMDGPU::AReg_288RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 9;
        } else if (AMDGPU::VReg_320RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 10;
        } else if (AMDGPU::SReg_320RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 10;
        } else if (AMDGPU::AReg_320RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 10;
        } else if (AMDGPU::VReg_352RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 11;
        } else if (AMDGPU::SReg_352RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 11;
        } else if (AMDGPU::AReg_352RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 11;
        } else if (AMDGPU::VReg_384RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 12;
        } else if (AMDGPU::SReg_384RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 12;
        } else if (AMDGPU::AReg_384RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 12;
        } else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
          assert(!AMDGPU::TTMP_512RegClass.contains(Reg) &&
                 "trap handler registers should not be used");
          IsSGPR = true;
          Width = 16;
        } else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 16;
        } else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 16;
        } else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
          IsSGPR = true;
          Width = 32;
        } else if (AMDGPU::VReg_1024RegClass.contains(Reg)) {
          IsSGPR = false;
          Width = 32;
        } else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
          IsSGPR = false;
          IsAGPR = true;
          Width = 32;
        } else {
          llvm_unreachable("Unknown register class");
        }
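        // The last 32-bit lane touched is the HW index of the operand's first
        // register plus Width - 1; track the maximum per bank.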
        unsigned HWReg = TRI.getHWRegIndex(Reg);
        int MaxUsed = HWReg + Width - 1;
        if (IsSGPR) {
          MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
        } else if (IsAGPR) {
          MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR;
        } else {
          MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
        }
      }

      if (MI.isCall()) {
        // Pseudo used just to encode the underlying global. Is there a better
        // way to track this?

        const MachineOperand *CalleeOp =
            TII->getNamedOperand(MI, AMDGPU::OpName::callee);

        const Function *Callee = getCalleeFunction(*CalleeOp);
        DenseMap<const Function *, SIFunctionResourceInfo>::const_iterator I =
            CallGraphResourceInfo.end();

        // Avoid crashing on undefined behavior with an illegal call to a
        // kernel. If a callsite's calling convention doesn't match the
        // function's, it's undefined behavior. If the callsite calling
        // convention does match, that would have errored earlier.
        if (Callee && AMDGPU::isEntryFunctionCC(Callee->getCallingConv()))
          report_fatal_error("invalid call to entry function");

        bool IsIndirect = !Callee || Callee->isDeclaration();
        if (!IsIndirect)
          I = CallGraphResourceInfo.find(Callee);

        // FIXME: Call site could have norecurse on it
        if (!Callee || !Callee->doesNotRecurse()) {
          Info.HasRecursion = true;

          // TODO: If we happen to know there is no stack usage in the
          // callgraph, we don't need to assume an infinitely growing stack.
          if (!MI.isReturn()) {
            // We don't need to assume an unknown stack size for tail calls.

            // FIXME: This only benefits in the case where the kernel does not
            // directly call the tail called function. If a kernel directly
            // calls a tail recursive function, we'll assume maximum stack size
            // based on the regular call instruction.
            CalleeFrameSize =
                std::max(CalleeFrameSize,
                         static_cast<uint64_t>(AssumedStackSizeForExternalCall));
          }
        }

        if (IsIndirect || I == CallGraphResourceInfo.end()) {
          CalleeFrameSize =
              std::max(CalleeFrameSize,
                       static_cast<uint64_t>(AssumedStackSizeForExternalCall));

          // Register usage of indirect calls gets handled later
          Info.UsesVCC = true;
          Info.UsesFlatScratch = ST.hasFlatAddressSpace();
          Info.HasDynamicallySizedStack = true;
          Info.HasIndirectCall = true;
        } else {
          // We force CodeGen to run in SCC order, so the callee's register
          // usage etc. should be the cumulative usage of all callees.
          MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR);
          MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR);
          MaxAGPR = std::max(I->second.NumAGPR - 1, MaxAGPR);
          CalleeFrameSize =
              std::max(I->second.PrivateSegmentSize, CalleeFrameSize);
          Info.UsesVCC |= I->second.UsesVCC;
          Info.UsesFlatScratch |= I->second.UsesFlatScratch;
          Info.HasDynamicallySizedStack |= I->second.HasDynamicallySizedStack;
          Info.HasRecursion |= I->second.HasRecursion;
          Info.HasIndirectCall |= I->second.HasIndirectCall;
        }
      }
    }
  }

  Info.NumExplicitSGPR = MaxSGPR + 1;
  Info.NumVGPR = MaxVGPR + 1;
  Info.NumAGPR = MaxAGPR + 1;
  Info.PrivateSegmentSize += CalleeFrameSize;

  return Info;
}

void AMDGPUResourceUsageAnalysis::propagateIndirectCallRegisterUsage() {
  // Collect the maximum number of registers from non-hardware-entrypoints.
  // All these functions are potential targets for indirect calls.
  int32_t NonKernelMaxSGPRs = 0;
  int32_t NonKernelMaxVGPRs = 0;
  int32_t NonKernelMaxAGPRs = 0;

  for (const auto &I : CallGraphResourceInfo) {
    if (!AMDGPU::isEntryFunctionCC(I.getFirst()->getCallingConv())) {
      auto &Info = I.getSecond();
      NonKernelMaxSGPRs = std::max(NonKernelMaxSGPRs, Info.NumExplicitSGPR);
      NonKernelMaxVGPRs = std::max(NonKernelMaxVGPRs, Info.NumVGPR);
      NonKernelMaxAGPRs = std::max(NonKernelMaxAGPRs, Info.NumAGPR);
    }
  }

  // Add register usage for functions with indirect calls.
  // For calls to unknown functions, we assume the maximum register usage of
  // all non-hardware-entrypoints in the current module.
  for (auto &I : CallGraphResourceInfo) {
    auto &Info = I.getSecond();
    if (Info.HasIndirectCall) {
      Info.NumExplicitSGPR = std::max(Info.NumExplicitSGPR, NonKernelMaxSGPRs);
      Info.NumVGPR = std::max(Info.NumVGPR, NonKernelMaxVGPRs);
      Info.NumAGPR = std::max(Info.NumAGPR, NonKernelMaxAGPRs);
    }
  }
}