//===- SIMachineFunctionInfo.cpp - SI Machine Function Info ---------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//

#include "SIMachineFunctionInfo.h"
#include "AMDGPUArgumentUsageInfo.h"
#include "AMDGPUSubtarget.h"
#include "SIRegisterInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/Optional.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Function.h"
#include <cassert>
#include <vector>

#define MAX_LANES 64

using namespace llvm;
SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
  : AMDGPUMachineFunction(MF),
    PrivateSegmentBuffer(false),
    DispatchPtr(false),
    QueuePtr(false),
    KernargSegmentPtr(false),
    DispatchID(false),
    FlatScratchInit(false),
    WorkGroupIDX(false),
    WorkGroupIDY(false),
    WorkGroupIDZ(false),
    WorkGroupInfo(false),
    PrivateSegmentWaveByteOffset(false),
    WorkItemIDX(false),
    WorkItemIDY(false),
    WorkItemIDZ(false),
    ImplicitBufferPtr(false),
    ImplicitArgPtr(false),
    GITPtrHigh(0xffffffff),
    HighBitsOf32BitAddress(0) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const Function &F = MF.getFunction();
  FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F);
  WavesPerEU = ST.getWavesPerEU(F);

  Occupancy = getMaxWavesPerEU();
  limitOccupancy(MF);
  CallingConv::ID CC = F.getCallingConv();

  if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL) {
    if (!F.arg_empty())
      KernargSegmentPtr = true;
    WorkGroupIDX = true;
    WorkItemIDX = true;
  } else if (CC == CallingConv::AMDGPU_PS) {
    PSInputAddr = AMDGPU::getInitialPSInputAddr(F);
  }

  if (!isEntryFunction()) {
    // Non-entry functions have no special inputs for now; they only get the
    // registers required for scratch access.
    ScratchRSrcReg = AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3;
    ScratchWaveOffsetReg = AMDGPU::SGPR4;
    FrameOffsetReg = AMDGPU::SGPR5;
    StackPtrOffsetReg = AMDGPU::SGPR32;

    ArgInfo.PrivateSegmentBuffer =
      ArgDescriptor::createRegister(ScratchRSrcReg);
    ArgInfo.PrivateSegmentWaveByteOffset =
      ArgDescriptor::createRegister(ScratchWaveOffsetReg);

    if (F.hasFnAttribute("amdgpu-implicitarg-ptr"))
      ImplicitArgPtr = true;
  } else {
    if (F.hasFnAttribute("amdgpu-implicitarg-ptr")) {
      KernargSegmentPtr = true;
      MaxKernArgAlign = std::max(ST.getAlignmentForImplicitArgPtr(),
                                 MaxKernArgAlign);
    }
  }

  if (ST.debuggerEmitPrologue()) {
    // Enable everything.
    WorkGroupIDX = true;
    WorkGroupIDY = true;
    WorkGroupIDZ = true;
    WorkItemIDX = true;
    WorkItemIDY = true;
    WorkItemIDZ = true;
  } else {
    if (F.hasFnAttribute("amdgpu-work-group-id-x"))
      WorkGroupIDX = true;

    if (F.hasFnAttribute("amdgpu-work-group-id-y"))
      WorkGroupIDY = true;

    if (F.hasFnAttribute("amdgpu-work-group-id-z"))
      WorkGroupIDZ = true;

    if (F.hasFnAttribute("amdgpu-work-item-id-x"))
      WorkItemIDX = true;

    if (F.hasFnAttribute("amdgpu-work-item-id-y"))
      WorkItemIDY = true;

    if (F.hasFnAttribute("amdgpu-work-item-id-z"))
      WorkItemIDZ = true;
  }
  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  bool MaySpill = ST.isVGPRSpillingEnabled(F);
  bool HasStackObjects = FrameInfo.hasStackObjects();

  if (isEntryFunction()) {
    // X, XY, and XYZ are the only supported combinations, so make sure Y is
    // enabled if Z is.
    if (WorkItemIDZ)
      WorkItemIDY = true;

    if (HasStackObjects || MaySpill) {
      PrivateSegmentWaveByteOffset = true;

      // HS and GS always have the scratch wave offset in SGPR5 on GFX9.
      if (ST.getGeneration() >= AMDGPUSubtarget::GFX9 &&
          (CC == CallingConv::AMDGPU_HS || CC == CallingConv::AMDGPU_GS))
        ArgInfo.PrivateSegmentWaveByteOffset =
            ArgDescriptor::createRegister(AMDGPU::SGPR5);
    }
  }

  bool IsCOV2 = ST.isAmdCodeObjectV2(F);
  if (IsCOV2) {
    if (HasStackObjects || MaySpill)
      PrivateSegmentBuffer = true;

    if (F.hasFnAttribute("amdgpu-dispatch-ptr"))
      DispatchPtr = true;

    if (F.hasFnAttribute("amdgpu-queue-ptr"))
      QueuePtr = true;

    if (F.hasFnAttribute("amdgpu-dispatch-id"))
      DispatchID = true;
  } else if (ST.isMesaGfxShader(F)) {
    if (HasStackObjects || MaySpill)
      ImplicitBufferPtr = true;
  }

  if (F.hasFnAttribute("amdgpu-kernarg-segment-ptr"))
    KernargSegmentPtr = true;

  if (ST.hasFlatAddressSpace() && isEntryFunction() && IsCOV2) {
    // TODO: This could be refined a lot. The attribute is a poor way of
    // detecting calls that may require it before argument lowering.
    if (HasStackObjects || F.hasFnAttribute("amdgpu-flat-scratch"))
      FlatScratchInit = true;
  }

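  // Optional attributes supply known high address bits; the defaults were
  // set in the member initializer list above.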
  Attribute A = F.getFnAttribute("amdgpu-git-ptr-high");
  StringRef S = A.getValueAsString();
  if (!S.empty())
    S.consumeInteger(0, GITPtrHigh);

  A = F.getFnAttribute("amdgpu-32bit-address-high-bits");
  S = A.getValueAsString();
  if (!S.empty())
    S.consumeInteger(0, HighBitsOf32BitAddress);
}

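/// Clamp this function's occupancy estimate to the subtarget's wave limit
/// and to what its LDS usage allows.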
void SIMachineFunctionInfo::limitOccupancy(const MachineFunction &MF) {
  limitOccupancy(getMaxWavesPerEU());
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  limitOccupancy(ST.getOccupancyWithLocalMemSize(getLDSSize(),
                 MF.getFunction()));
}

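// Each add* helper below reserves the next user SGPRs for one preloaded
// argument, records the covering super-register in ArgInfo, and returns it.
// The private segment buffer is a 128-bit resource descriptor and takes four
// SGPRs; the 64-bit pointers take two each.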
unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer(
  const SIRegisterInfo &TRI) {
  ArgInfo.PrivateSegmentBuffer =
    ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
    getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_128RegClass));
  NumUserSGPRs += 4;
  return ArgInfo.PrivateSegmentBuffer.getRegister();
}

unsigned SIMachineFunctionInfo::addDispatchPtr(const SIRegisterInfo &TRI) {
  ArgInfo.DispatchPtr = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
    getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
  NumUserSGPRs += 2;
  return ArgInfo.DispatchPtr.getRegister();
}

unsigned SIMachineFunctionInfo::addQueuePtr(const SIRegisterInfo &TRI) {
  ArgInfo.QueuePtr = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
    getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
  NumUserSGPRs += 2;
  return ArgInfo.QueuePtr.getRegister();
}

unsigned SIMachineFunctionInfo::addKernargSegmentPtr(const SIRegisterInfo &TRI) {
  ArgInfo.KernargSegmentPtr
    = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
    getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
  NumUserSGPRs += 2;
  return ArgInfo.KernargSegmentPtr.getRegister();
}

unsigned SIMachineFunctionInfo::addDispatchID(const SIRegisterInfo &TRI) {
  ArgInfo.DispatchID = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
    getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
  NumUserSGPRs += 2;
  return ArgInfo.DispatchID.getRegister();
}

unsigned SIMachineFunctionInfo::addFlatScratchInit(const SIRegisterInfo &TRI) {
  ArgInfo.FlatScratchInit = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
    getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
  NumUserSGPRs += 2;
  return ArgInfo.FlatScratchInit.getRegister();
}

unsigned SIMachineFunctionInfo::addImplicitBufferPtr(const SIRegisterInfo &TRI) {
  ArgInfo.ImplicitBufferPtr = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
    getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
  NumUserSGPRs += 2;
  return ArgInfo.ImplicitBufferPtr.getRegister();
}

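// Return true if \p Reg appears in the null-terminated callee-saved register
// list \p CSRegs.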
static bool isCalleeSavedReg(const MCPhysReg *CSRegs, MCPhysReg Reg) {
  for (unsigned I = 0; CSRegs[I]; ++I) {
    if (CSRegs[I] == Reg)
      return true;
  }

  return false;
}

/// Reserve a slice of a VGPR to support spilling for FrameIndex \p FI.
bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF,
                                                    int FI) {
  std::vector<SpilledReg> &SpillLanes = SGPRToVGPRSpills[FI];

  // This has already been allocated.
  if (!SpillLanes.empty())
    return true;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  unsigned WaveSize = ST.getWavefrontSize();

  unsigned Size = FrameInfo.getObjectSize(FI);
  assert(Size >= 4 && Size <= 64 && "invalid sgpr spill size");
  assert(TRI->spillSGPRToVGPR() && "not spilling SGPRs to VGPRs");

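  // Each VGPR lane holds one 32-bit SGPR, so a spill of Size bytes needs
  // Size / 4 lanes.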
  int NumLanes = Size / 4;

  const MCPhysReg *CSRegs = TRI->getCalleeSavedRegs(&MF);

  // Make sure to handle the case where a wide SGPR spill may span between two
  // VGPRs.
  for (int I = 0; I < NumLanes; ++I, ++NumVGPRSpillLanes) {
    unsigned LaneVGPR;
    unsigned VGPRIndex = (NumVGPRSpillLanes % WaveSize);

    if (VGPRIndex == 0) {
      LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF);
      if (LaneVGPR == AMDGPU::NoRegister) {
        // We have no VGPRs left for spilling SGPRs. Reset because we will not
        // partially spill the SGPR to VGPRs.
        SGPRToVGPRSpills.erase(FI);
        NumVGPRSpillLanes -= I;
        return false;
      }

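      // If the chosen VGPR is callee saved, create a stack slot so its
      // original contents can be preserved.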
      Optional<int> CSRSpillFI;
      if ((FrameInfo.hasCalls() || !isEntryFunction()) && CSRegs &&
          isCalleeSavedReg(CSRegs, LaneVGPR)) {
        CSRSpillFI = FrameInfo.CreateSpillStackObject(4, 4);
      }

      SpillVGPRs.push_back(SGPRSpillVGPRCSR(LaneVGPR, CSRSpillFI));

      // Add this register as live-in to all blocks to avoid machine verifier
      // complaints about the use of an undefined physical register.
      for (MachineBasicBlock &BB : MF)
        BB.addLiveIn(LaneVGPR);
    } else {
      LaneVGPR = SpillVGPRs.back().VGPR;
    }

    SpillLanes.push_back(SpilledReg(LaneVGPR, VGPRIndex));
  }

  return true;
}

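/// Remove the stack objects of all SGPR spills that have been lowered to
/// VGPR lanes; they no longer need frame slots.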
void SIMachineFunctionInfo::removeSGPRToVGPRFrameIndices(MachineFrameInfo &MFI) {
  for (auto &R : SGPRToVGPRSpills)
    MFI.RemoveStackObject(R.first);
}

/// \returns VGPR used for \p Dim's work item ID.
unsigned SIMachineFunctionInfo::getWorkItemIDVGPR(unsigned Dim) const {
  switch (Dim) {
  case 0:
    assert(hasWorkItemIDX());
    return AMDGPU::VGPR0;
  case 1:
    assert(hasWorkItemIDY());
    return AMDGPU::VGPR1;
  case 2:
    assert(hasWorkItemIDZ());
    return AMDGPU::VGPR2;
  }
  llvm_unreachable("unexpected dimension");
}

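/// \returns the next unallocated SGPR for a user argument; valid only before
/// any system SGPRs have been added.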
MCPhysReg SIMachineFunctionInfo::getNextUserSGPR() const {
  assert(NumSystemSGPRs == 0 && "System SGPRs must be added after user SGPRs");
  return AMDGPU::SGPR0 + NumUserSGPRs;
}

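/// \returns the next unallocated SGPR for a system input, following all user
/// SGPRs.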
MCPhysReg SIMachineFunctionInfo::getNextSystemSGPR() const {
  return AMDGPU::SGPR0 + NumUserSGPRs + NumSystemSGPRs;
}
338