1 //===----------------------- SIFrameLowering.cpp --------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //==-----------------------------------------------------------------------===//
8
9 #include "SIFrameLowering.h"
10 #include "AMDGPU.h"
11 #include "GCNSubtarget.h"
12 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
13 #include "SIMachineFunctionInfo.h"
14 #include "llvm/CodeGen/LiveRegUnits.h"
15 #include "llvm/CodeGen/MachineFrameInfo.h"
16 #include "llvm/CodeGen/RegisterScavenging.h"
17 #include "llvm/Target/TargetMachine.h"
18
19 using namespace llvm;
20
21 #define DEBUG_TYPE "frame-info"
22
// Command-line knob controlling whether VGPRs may be spilled into AGPRs
// rather than scratch memory. Hidden; enabled by default.
static cl::opt<bool> EnableSpillVGPRToAGPR(
  "amdgpu-spill-vgpr-to-agpr",
  cl::desc("Enable spilling VGPRs to AGPRs"),
  cl::ReallyHidden,
  cl::init(true));
28
29 // Find a register matching \p RC from \p LiveUnits which is unused and
30 // available throughout the function. On failure, returns AMDGPU::NoRegister.
31 // TODO: Rewrite the loop here to iterate over MCRegUnits instead of
32 // MCRegisters. This should reduce the number of iterations and avoid redundant
33 // checking.
findUnusedRegister(MachineRegisterInfo & MRI,const LiveRegUnits & LiveUnits,const TargetRegisterClass & RC)34 static MCRegister findUnusedRegister(MachineRegisterInfo &MRI,
35 const LiveRegUnits &LiveUnits,
36 const TargetRegisterClass &RC) {
37 for (MCRegister Reg : RC) {
38 if (!MRI.isPhysRegUsed(Reg) && LiveUnits.available(Reg) &&
39 !MRI.isReserved(Reg))
40 return Reg;
41 }
42 return MCRegister();
43 }
44
45 // Find a scratch register that we can use in the prologue. We avoid using
46 // callee-save registers since they may appear to be free when this is called
47 // from canUseAsPrologue (during shrink wrapping), but then no longer be free
48 // when this is called from emitPrologue.
findScratchNonCalleeSaveRegister(MachineRegisterInfo & MRI,LiveRegUnits & LiveUnits,const TargetRegisterClass & RC,bool Unused=false)49 static MCRegister findScratchNonCalleeSaveRegister(
50 MachineRegisterInfo &MRI, LiveRegUnits &LiveUnits,
51 const TargetRegisterClass &RC, bool Unused = false) {
52 // Mark callee saved registers as used so we will not choose them.
53 const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
54 for (unsigned i = 0; CSRegs[i]; ++i)
55 LiveUnits.addReg(CSRegs[i]);
56
57 // We are looking for a register that can be used throughout the entire
58 // function, so any use is unacceptable.
59 if (Unused)
60 return findUnusedRegister(MRI, LiveUnits, RC);
61
62 for (MCRegister Reg : RC) {
63 if (LiveUnits.available(Reg) && !MRI.isReserved(Reg))
64 return Reg;
65 }
66
67 return MCRegister();
68 }
69
70 /// Query target location for spilling SGPRs
71 /// \p IncludeScratchCopy : Also look for free scratch SGPRs
// Decide how to preserve \p SGPR across the function body and record the
// choice in SIMachineFunctionInfo. Preference order:
//   1. copy into a free scratch SGPR (if IncludeScratchCopy),
//   2. spill into a physical VGPR lane,
//   3. spill to scratch memory.
static void getVGPRSpillLaneOrTempRegister(
    MachineFunction &MF, LiveRegUnits &LiveUnits, Register SGPR,
    const TargetRegisterClass &RC = AMDGPU::SReg_32_XM0_XEXECRegClass,
    bool IncludeScratchCopy = true) {
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  // Spill slot size and alignment are dictated by the register class.
  unsigned Size = TRI->getSpillSize(RC);
  Align Alignment = TRI->getSpillAlign(RC);

  // We need to save and restore the given SGPR.

  Register ScratchSGPR;
  // 1: Try to save the given register into an unused scratch SGPR. The
  // LiveUnits should have all the callee saved registers marked as used. For
  // certain cases we skip copy to scratch SGPR.
  if (IncludeScratchCopy)
    ScratchSGPR = findUnusedRegister(MF.getRegInfo(), LiveUnits, RC);

  if (!ScratchSGPR) {
    // No free SGPR; create a stack object and try to assign a VGPR lane.
    int FI = FrameInfo.CreateStackObject(Size, Alignment, true, nullptr,
                                         TargetStackID::SGPRSpill);

    if (TRI->spillSGPRToVGPR() &&
        MFI->allocateSGPRSpillToVGPRLane(MF, FI, /*SpillToPhysVGPRLane=*/true,
                                         /*IsPrologEpilog=*/true)) {
      // 2: There's no free lane to spill, and no free register to save the
      // SGPR, so we're forced to take another VGPR to use for the spill.
      MFI->addToPrologEpilogSGPRSpills(
          SGPR, PrologEpilogSGPRSaveRestoreInfo(
                    SGPRSaveKind::SPILL_TO_VGPR_LANE, FI));

      LLVM_DEBUG(auto Spill = MFI->getSGPRSpillToPhysicalVGPRLanes(FI).front();
                 dbgs() << printReg(SGPR, TRI) << " requires fallback spill to "
                        << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane
                        << '\n';);
    } else {
      // Remove dead <FI> index
      MF.getFrameInfo().RemoveStackObject(FI);
      // 3: If all else fails, spill the register to memory.
      FI = FrameInfo.CreateSpillStackObject(Size, Alignment);
      MFI->addToPrologEpilogSGPRSpills(
          SGPR,
          PrologEpilogSGPRSaveRestoreInfo(SGPRSaveKind::SPILL_TO_MEM, FI));
      LLVM_DEBUG(dbgs() << "Reserved FI " << FI << " for spilling "
                        << printReg(SGPR, TRI) << '\n');
    }
  } else {
    // Record the copy target and mark it occupied so later queries do not
    // hand out the same scratch SGPR again.
    MFI->addToPrologEpilogSGPRSpills(
        SGPR, PrologEpilogSGPRSaveRestoreInfo(
                  SGPRSaveKind::COPY_TO_SCRATCH_SGPR, ScratchSGPR));
    LiveUnits.addReg(ScratchSGPR);
    LLVM_DEBUG(dbgs() << "Saving " << printReg(SGPR, TRI) << " with copy to "
                      << printReg(ScratchSGPR, TRI) << '\n');
  }
}
130
131 // We need to specially emit stack operations here because a different frame
132 // register is used than in the rest of the function, as getFrameRegister would
133 // use.
buildPrologSpill(const GCNSubtarget & ST,const SIRegisterInfo & TRI,const SIMachineFunctionInfo & FuncInfo,LiveRegUnits & LiveUnits,MachineFunction & MF,MachineBasicBlock & MBB,MachineBasicBlock::iterator I,const DebugLoc & DL,Register SpillReg,int FI,Register FrameReg,int64_t DwordOff=0)134 static void buildPrologSpill(const GCNSubtarget &ST, const SIRegisterInfo &TRI,
135 const SIMachineFunctionInfo &FuncInfo,
136 LiveRegUnits &LiveUnits, MachineFunction &MF,
137 MachineBasicBlock &MBB,
138 MachineBasicBlock::iterator I, const DebugLoc &DL,
139 Register SpillReg, int FI, Register FrameReg,
140 int64_t DwordOff = 0) {
141 unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
142 : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
143
144 MachineFrameInfo &FrameInfo = MF.getFrameInfo();
145 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
146 MachineMemOperand *MMO = MF.getMachineMemOperand(
147 PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FI),
148 FrameInfo.getObjectAlign(FI));
149 LiveUnits.addReg(SpillReg);
150 bool IsKill = !MBB.isLiveIn(SpillReg);
151 TRI.buildSpillLoadStore(MBB, I, DL, Opc, FI, SpillReg, IsKill, FrameReg,
152 DwordOff, MMO, nullptr, &LiveUnits);
153 if (IsKill)
154 LiveUnits.removeReg(SpillReg);
155 }
156
buildEpilogRestore(const GCNSubtarget & ST,const SIRegisterInfo & TRI,const SIMachineFunctionInfo & FuncInfo,LiveRegUnits & LiveUnits,MachineFunction & MF,MachineBasicBlock & MBB,MachineBasicBlock::iterator I,const DebugLoc & DL,Register SpillReg,int FI,Register FrameReg,int64_t DwordOff=0)157 static void buildEpilogRestore(const GCNSubtarget &ST,
158 const SIRegisterInfo &TRI,
159 const SIMachineFunctionInfo &FuncInfo,
160 LiveRegUnits &LiveUnits, MachineFunction &MF,
161 MachineBasicBlock &MBB,
162 MachineBasicBlock::iterator I,
163 const DebugLoc &DL, Register SpillReg, int FI,
164 Register FrameReg, int64_t DwordOff = 0) {
165 unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
166 : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
167
168 MachineFrameInfo &FrameInfo = MF.getFrameInfo();
169 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
170 MachineMemOperand *MMO = MF.getMachineMemOperand(
171 PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FI),
172 FrameInfo.getObjectAlign(FI));
173 TRI.buildSpillLoadStore(MBB, I, DL, Opc, FI, SpillReg, false, FrameReg,
174 DwordOff, MMO, nullptr, &LiveUnits);
175 }
176
// Materialize the 64-bit GIT (global information table) pointer into
// \p TargetReg. The low half always comes from the GIT-ptr-lo argument
// register; the high half comes from the amdgpu-git-ptr-high value when set,
// otherwise from the program counter.
static void buildGitPtr(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                        const DebugLoc &DL, const SIInstrInfo *TII,
                        Register TargetReg) {
  MachineFunction *MF = MBB.getParent();
  const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
  Register TargetLo = TRI->getSubReg(TargetReg, AMDGPU::sub0);
  Register TargetHi = TRI->getSubReg(TargetReg, AMDGPU::sub1);

  if (MFI->getGITPtrHigh() != 0xffffffff) {
    // High bits supplied explicitly (0xffffffff is the "unset" sentinel).
    BuildMI(MBB, I, DL, SMovB32, TargetHi)
        .addImm(MFI->getGITPtrHigh())
        .addReg(TargetReg, RegState::ImplicitDefine);
  } else {
    // Derive the upper half from the PC; the low half is overwritten below.
    const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64_pseudo);
    BuildMI(MBB, I, DL, GetPC64, TargetReg);
  }
  Register GitPtrLo = MFI->getGITPtrLoReg(*MF);
  // The incoming GIT-ptr-lo register must be live into the function and block.
  MF->getRegInfo().addLiveIn(GitPtrLo);
  MBB.addLiveIn(GitPtrLo);
  BuildMI(MBB, I, DL, SMovB32, TargetLo)
    .addReg(GitPtrLo);
}
201
initLiveUnits(LiveRegUnits & LiveUnits,const SIRegisterInfo & TRI,const SIMachineFunctionInfo * FuncInfo,MachineFunction & MF,MachineBasicBlock & MBB,MachineBasicBlock::iterator MBBI,bool IsProlog)202 static void initLiveUnits(LiveRegUnits &LiveUnits, const SIRegisterInfo &TRI,
203 const SIMachineFunctionInfo *FuncInfo,
204 MachineFunction &MF, MachineBasicBlock &MBB,
205 MachineBasicBlock::iterator MBBI, bool IsProlog) {
206 if (LiveUnits.empty()) {
207 LiveUnits.init(TRI);
208 if (IsProlog) {
209 LiveUnits.addLiveIns(MBB);
210 } else {
211 // In epilog.
212 LiveUnits.addLiveOuts(MBB);
213 LiveUnits.stepBackward(*MBBI);
214 }
215 }
216 }
217
218 namespace llvm {
219
220 // SpillBuilder to save/restore special SGPR spills like the one needed for FP,
221 // BP, etc. These spills are delayed until the current function's frame is
222 // finalized. For a given register, the builder uses the
223 // PrologEpilogSGPRSaveRestoreInfo to decide the spill method.
class PrologEpilogSGPRSpillBuilder {
  MachineBasicBlock::iterator MI; // Insertion point for emitted instructions.
  MachineBasicBlock &MBB;
  MachineFunction &MF;
  const GCNSubtarget &ST;
  MachineFrameInfo &MFI;
  SIMachineFunctionInfo *FuncInfo;
  const SIInstrInfo *TII;
  const SIRegisterInfo &TRI;
  Register SuperReg; // The (possibly multi-dword) SGPR being saved/restored.
  const PrologEpilogSGPRSaveRestoreInfo SI; // How/where to save or restore.
  LiveRegUnits &LiveUnits;
  const DebugLoc &DL;
  Register FrameReg;
  // Sub-register indices splitting SuperReg into EltSize pieces; empty when
  // SuperReg is a single 32-bit register.
  ArrayRef<int16_t> SplitParts;
  unsigned NumSubRegs;
  unsigned EltSize = 4; // Each spill element is one dword.

  // Save SuperReg to stack object FI: each 32-bit piece is copied through a
  // scratch VGPR and stored to consecutive dwords of the frame index.
  void saveToMemory(const int FI) const {
    MachineRegisterInfo &MRI = MF.getRegInfo();
    assert(!MFI.isDeadObjectIndex(FI));

    initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MI, /*IsProlog*/ true);

    MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
        MRI, LiveUnits, AMDGPU::VGPR_32RegClass);
    if (!TmpVGPR)
      report_fatal_error("failed to find free scratch register");

    for (unsigned I = 0, DwordOff = 0; I < NumSubRegs; ++I) {
      Register SubReg = NumSubRegs == 1
                            ? SuperReg
                            : Register(TRI.getSubReg(SuperReg, SplitParts[I]));
      // Broadcast the SGPR value into the temporary VGPR, then store it.
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
          .addReg(SubReg);

      buildPrologSpill(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MI, DL, TmpVGPR,
                       FI, FrameReg, DwordOff);
      DwordOff += 4;
    }
  }

  // Save SuperReg into pre-allocated physical VGPR lanes recorded for FI.
  void saveToVGPRLane(const int FI) const {
    assert(!MFI.isDeadObjectIndex(FI));

    assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
    ArrayRef<SIRegisterInfo::SpilledReg> Spill =
        FuncInfo->getSGPRSpillToPhysicalVGPRLanes(FI);
    assert(Spill.size() == NumSubRegs);

    for (unsigned I = 0; I < NumSubRegs; ++I) {
      Register SubReg = NumSubRegs == 1
                            ? SuperReg
                            : Register(TRI.getSubReg(SuperReg, SplitParts[I]));
      // Write one 32-bit piece into its assigned lane of the spill VGPR.
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_S32_TO_VGPR),
              Spill[I].VGPR)
          .addReg(SubReg)
          .addImm(Spill[I].Lane)
          .addReg(Spill[I].VGPR, RegState::Undef);
    }
  }

  // Save SuperReg by copying it into a free scratch SGPR.
  void copyToScratchSGPR(Register DstReg) const {
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), DstReg)
        .addReg(SuperReg)
        .setMIFlag(MachineInstr::FrameSetup);
  }

  // Restore SuperReg from stack object FI: each dword is loaded into a
  // scratch VGPR and read back with V_READFIRSTLANE_B32.
  void restoreFromMemory(const int FI) {
    MachineRegisterInfo &MRI = MF.getRegInfo();

    initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MI, /*IsProlog*/ false);
    MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
        MRI, LiveUnits, AMDGPU::VGPR_32RegClass);
    if (!TmpVGPR)
      report_fatal_error("failed to find free scratch register");

    for (unsigned I = 0, DwordOff = 0; I < NumSubRegs; ++I) {
      Register SubReg = NumSubRegs == 1
                            ? SuperReg
                            : Register(TRI.getSubReg(SuperReg, SplitParts[I]));

      buildEpilogRestore(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MI, DL,
                         TmpVGPR, FI, FrameReg, DwordOff);
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg)
          .addReg(TmpVGPR, RegState::Kill);
      DwordOff += 4;
    }
  }

  // Restore SuperReg from the physical VGPR lanes recorded for FI.
  void restoreFromVGPRLane(const int FI) {
    assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
    ArrayRef<SIRegisterInfo::SpilledReg> Spill =
        FuncInfo->getSGPRSpillToPhysicalVGPRLanes(FI);
    assert(Spill.size() == NumSubRegs);

    for (unsigned I = 0; I < NumSubRegs; ++I) {
      Register SubReg = NumSubRegs == 1
                            ? SuperReg
                            : Register(TRI.getSubReg(SuperReg, SplitParts[I]));
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_RESTORE_S32_FROM_VGPR), SubReg)
          .addReg(Spill[I].VGPR)
          .addImm(Spill[I].Lane);
    }
  }

  // Restore SuperReg by copying it back from the scratch SGPR it was saved in.
  void copyFromScratchSGPR(Register SrcReg) const {
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), SuperReg)
        .addReg(SrcReg)
        .setMIFlag(MachineInstr::FrameDestroy);
  }

public:
  // \p Reg is the SGPR to save/restore; \p SI describes the chosen method
  // (memory, VGPR lane, or scratch-SGPR copy) as decided by
  // getVGPRSpillLaneOrTempRegister.
  PrologEpilogSGPRSpillBuilder(Register Reg,
                               const PrologEpilogSGPRSaveRestoreInfo SI,
                               MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator MI,
                               const DebugLoc &DL, const SIInstrInfo *TII,
                               const SIRegisterInfo &TRI,
                               LiveRegUnits &LiveUnits, Register FrameReg)
      : MI(MI), MBB(MBB), MF(*MBB.getParent()),
        ST(MF.getSubtarget<GCNSubtarget>()), MFI(MF.getFrameInfo()),
        FuncInfo(MF.getInfo<SIMachineFunctionInfo>()), TII(TII), TRI(TRI),
        SuperReg(Reg), SI(SI), LiveUnits(LiveUnits), DL(DL),
        FrameReg(FrameReg) {
    // Precompute how SuperReg splits into 32-bit spill elements.
    const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(SuperReg);
    SplitParts = TRI.getRegSplitParts(RC, EltSize);
    NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();

    assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
  }

  // Emit the save sequence for the method recorded in SI.
  void save() {
    switch (SI.getKind()) {
    case SGPRSaveKind::SPILL_TO_MEM:
      return saveToMemory(SI.getIndex());
    case SGPRSaveKind::SPILL_TO_VGPR_LANE:
      return saveToVGPRLane(SI.getIndex());
    case SGPRSaveKind::COPY_TO_SCRATCH_SGPR:
      return copyToScratchSGPR(SI.getReg());
    }
  }

  // Emit the matching restore sequence for the method recorded in SI.
  void restore() {
    switch (SI.getKind()) {
    case SGPRSaveKind::SPILL_TO_MEM:
      return restoreFromMemory(SI.getIndex());
    case SGPRSaveKind::SPILL_TO_VGPR_LANE:
      return restoreFromVGPRLane(SI.getIndex());
    case SGPRSaveKind::COPY_TO_SCRATCH_SGPR:
      return copyFromScratchSGPR(SI.getReg());
    }
  }
};
378
379 } // namespace llvm
380
381 // Emit flat scratch setup code, assuming `MFI->hasFlatScratchInit()`
// Emit flat scratch setup code, assuming `MFI->hasFlatScratchInit()`.
// Obtains the flat-scratch base/size pair (from the GIT on PAL, otherwise
// from the preloaded FLAT_SCRATCH_INIT argument), adds the per-wave scratch
// offset, and writes the result into the FLAT_SCR registers (directly or via
// S_SETREG depending on generation).
void SIFrameLowering::emitEntryFunctionFlatScratchInit(
    MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
    const DebugLoc &DL, Register ScratchWaveOffsetReg) const {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // We don't need this if we only have spills since there is no user facing
  // scratch.

  // TODO: If we know we don't have flat instructions earlier, we can omit
  // this from the input registers.
  //
  // TODO: We only need to know if we access scratch space through a flat
  // pointer. Because we only detect if flat instructions are used at all,
  // this will be used more often than necessary on VI.

  Register FlatScrInitLo;
  Register FlatScrInitHi;

  if (ST.isAmdPalOS()) {
    // Extract the scratch offset from the descriptor in the GIT
    LiveRegUnits LiveUnits;
    LiveUnits.init(*TRI);
    LiveUnits.addLiveIns(MBB);

    // Find unused reg to load flat scratch init into
    MachineRegisterInfo &MRI = MF.getRegInfo();
    Register FlatScrInit = AMDGPU::NoRegister;
    ArrayRef<MCPhysReg> AllSGPR64s = TRI->getAllSGPR64(MF);
    // Skip SGPR pairs occupied by preloaded arguments (2 SGPRs per pair).
    unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 1) / 2;
    AllSGPR64s = AllSGPR64s.slice(
        std::min(static_cast<unsigned>(AllSGPR64s.size()), NumPreloaded));
    Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
    for (MCPhysReg Reg : AllSGPR64s) {
      // Must not overlap the incoming GIT pointer register either.
      if (LiveUnits.available(Reg) && !MRI.isReserved(Reg) &&
          MRI.isAllocatable(Reg) && !TRI->isSubRegisterEq(Reg, GITPtrLoReg)) {
        FlatScrInit = Reg;
        break;
      }
    }
    assert(FlatScrInit && "Failed to find free register for scratch init");

    FlatScrInitLo = TRI->getSubReg(FlatScrInit, AMDGPU::sub0);
    FlatScrInitHi = TRI->getSubReg(FlatScrInit, AMDGPU::sub1);

    buildGitPtr(MBB, I, DL, TII, FlatScrInit);

    // We now have the GIT ptr - now get the scratch descriptor from the entry
    // at offset 0 (or offset 16 for a compute shader).
    MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
    const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);
    auto *MMO = MF.getMachineMemOperand(
        PtrInfo,
        MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant |
            MachineMemOperand::MODereferenceable,
        8, Align(4));
    unsigned Offset =
        MF.getFunction().getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
    const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
    unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset);
    BuildMI(MBB, I, DL, LoadDwordX2, FlatScrInit)
        .addReg(FlatScrInit)
        .addImm(EncodedOffset) // offset
        .addImm(0) // cpol
        .addMemOperand(MMO);

    // Mask the offset in [47:0] of the descriptor
    const MCInstrDesc &SAndB32 = TII->get(AMDGPU::S_AND_B32);
    auto And = BuildMI(MBB, I, DL, SAndB32, FlatScrInitHi)
        .addReg(FlatScrInitHi)
        .addImm(0xffff);
    And->getOperand(3).setIsDead(); // Mark SCC as dead.
  } else {
    // Non-PAL: the flat scratch init pair is preloaded into an argument SGPR.
    Register FlatScratchInitReg =
        MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT);
    assert(FlatScratchInitReg);

    MachineRegisterInfo &MRI = MF.getRegInfo();
    MRI.addLiveIn(FlatScratchInitReg);
    MBB.addLiveIn(FlatScratchInitReg);

    FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
    FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
  }

  // Do a 64-bit pointer add.
  if (ST.flatScratchIsPointer()) {
    if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
      // GFX10+: FLAT_SCR is set through S_SETREG of the hardware registers.
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
        .addReg(FlatScrInitLo)
        .addReg(ScratchWaveOffsetReg);
      auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32),
                          FlatScrInitHi)
        .addReg(FlatScrInitHi)
        .addImm(0);
      Addc->getOperand(3).setIsDead(); // Mark SCC as dead.

      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32)).
        addReg(FlatScrInitLo).
        addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_LO |
                       (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_)));
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32)).
        addReg(FlatScrInitHi).
        addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_HI |
                       (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_)));
      return;
    }

    // For GFX9.
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), AMDGPU::FLAT_SCR_LO)
      .addReg(FlatScrInitLo)
      .addReg(ScratchWaveOffsetReg);
    auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32),
                        AMDGPU::FLAT_SCR_HI)
      .addReg(FlatScrInitHi)
      .addImm(0);
    Addc->getOperand(3).setIsDead(); // Mark SCC as dead.

    return;
  }

  assert(ST.getGeneration() < AMDGPUSubtarget::GFX9);

  // Copy the size in bytes.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::FLAT_SCR_LO)
    .addReg(FlatScrInitHi, RegState::Kill);

  // Add wave offset in bytes to private base offset.
  // See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), FlatScrInitLo)
    .addReg(FlatScrInitLo)
    .addReg(ScratchWaveOffsetReg);

  // Convert offset to 256-byte units.
  auto LShr = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_LSHR_B32),
                      AMDGPU::FLAT_SCR_HI)
    .addReg(FlatScrInitLo, RegState::Kill)
    .addImm(8);
  LShr->getOperand(3).setIsDead(); // Mark SCC as dead.
}
524
525 // Note SGPRSpill stack IDs should only be used for SGPR spilling to VGPRs, not
526 // memory. They should have been removed by now.
allStackObjectsAreDead(const MachineFrameInfo & MFI)527 static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) {
528 for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
529 I != E; ++I) {
530 if (!MFI.isDeadObjectIndex(I))
531 return false;
532 }
533
534 return true;
535 }
536
537 // Shift down registers reserved for the scratch RSRC.
// Shift down registers reserved for the scratch RSRC. Returns the (possibly
// relocated) scratch resource register, or an invalid Register when the SRSRC
// is provably unused.
Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg(
    MachineFunction &MF) const {

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  assert(MFI->isEntryFunction());

  Register ScratchRsrcReg = MFI->getScratchRSrcReg();

  // No SRSRC assigned, or it is never used and there is no live stack object:
  // nothing to set up.
  if (!ScratchRsrcReg || (!MRI.isPhysRegUsed(ScratchRsrcReg) &&
                          allStackObjectsAreDead(MF.getFrameInfo())))
    return Register();

  // Keep the current assignment if it is not the default reserved register
  // (or if the SGPR-init bug forbids moving it).
  if (ST.hasSGPRInitBug() ||
      ScratchRsrcReg != TRI->reservedPrivateSegmentBufferReg(MF))
    return ScratchRsrcReg;

  // We reserved the last registers for this. Shift it down to the end of those
  // which were actually used.
  //
  // FIXME: It might be safer to use a pseudoregister before replacement.

  // FIXME: We should be able to eliminate unused input registers. We only
  // cannot do this for the resources required for scratch access. For now we
  // skip over user SGPRs and may leave unused holes.

  // Skip SGPR quads occupied by preloaded arguments (4 SGPRs per 128-bit reg).
  unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 3) / 4;
  ArrayRef<MCPhysReg> AllSGPR128s = TRI->getAllSGPR128(MF);
  AllSGPR128s = AllSGPR128s.slice(
      std::min(static_cast<unsigned>(AllSGPR128s.size()), NumPreloaded));

  // Skip the last N reserved elements because they should have already been
  // reserved for VCC etc.
  Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
  for (MCPhysReg Reg : AllSGPR128s) {
    // Pick the first unallocated one. Make sure we don't clobber the other
    // reserved input we needed. Also for PAL, make sure we don't clobber
    // the GIT pointer passed in SGPR0 or SGPR8.
    if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) &&
        (!GITPtrLoReg || !TRI->isSubRegisterEq(Reg, GITPtrLoReg))) {
      // Rewrite all uses of the old SRSRC and record the new assignment.
      MRI.replaceRegWith(ScratchRsrcReg, Reg);
      MFI->setScratchRSrcReg(Reg);
      return Reg;
    }
  }

  return ScratchRsrcReg;
}
589
getScratchScaleFactor(const GCNSubtarget & ST)590 static unsigned getScratchScaleFactor(const GCNSubtarget &ST) {
591 return ST.enableFlatScratch() ? 1 : ST.getWavefrontSize();
592 }
593
// Entry-function prologue: relocate/initialize the scratch resource register,
// pick a scratch wave offset register, set up SP/FP when needed, and emit
// flat-scratch initialization.
void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
                                                MachineBasicBlock &MBB) const {
  assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");

  // FIXME: If we only have SGPR spills, we won't actually be using scratch
  // memory since these spill to VGPRs. We should be cleaning up these unused
  // SGPR spill frame indices somewhere.

  // FIXME: We still have implicit uses on SGPR spill instructions in case they
  // need to spill to vector memory. It's likely that will not happen, but at
  // this point it appears we need the setup. This part of the prolog should be
  // emitted after frame indices are eliminated.

  // FIXME: Remove all of the isPhysRegUsed checks

  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const Function &F = MF.getFunction();
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();

  assert(MFI->isEntryFunction());

  Register PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg(
      AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);

  // We need to do the replacement of the private segment buffer register even
  // if there are no stack objects. There could be stores to undef or a
  // constant without an associated object.
  //
  // This will return `Register()` in cases where there are no actual
  // uses of the SRSRC.
  Register ScratchRsrcReg;
  if (!ST.enableFlatScratch())
    ScratchRsrcReg = getEntryFunctionReservedScratchRsrcReg(MF);

  // Make the selected register live throughout the function.
  if (ScratchRsrcReg) {
    for (MachineBasicBlock &OtherBB : MF) {
      if (&OtherBB != &MBB) {
        OtherBB.addLiveIn(ScratchRsrcReg);
      }
    }
  }

  // Now that we have fixed the reserved SRSRC we need to locate the
  // (potentially) preloaded SRSRC.
  Register PreloadedScratchRsrcReg;
  if (ST.isAmdHsaOrMesa(F)) {
    PreloadedScratchRsrcReg =
        MFI->getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
    if (ScratchRsrcReg && PreloadedScratchRsrcReg) {
      // We added live-ins during argument lowering, but since they were not
      // used they were deleted. We're adding the uses now, so add them back.
      MRI.addLiveIn(PreloadedScratchRsrcReg);
      MBB.addLiveIn(PreloadedScratchRsrcReg);
    }
  }

  // Debug location must be unknown since the first debug location is used to
  // determine the end of the prologue.
  DebugLoc DL;
  MachineBasicBlock::iterator I = MBB.begin();

  // We found the SRSRC first because it needs four registers and has an
  // alignment requirement. If the SRSRC that we found is clobbering with
  // the scratch wave offset, which may be in a fixed SGPR or a free SGPR
  // chosen by SITargetLowering::allocateSystemSGPRs, COPY the scratch
  // wave offset to a free SGPR.
  Register ScratchWaveOffsetReg;
  if (PreloadedScratchWaveOffsetReg &&
      TRI->isSubRegisterEq(ScratchRsrcReg, PreloadedScratchWaveOffsetReg)) {
    ArrayRef<MCPhysReg> AllSGPRs = TRI->getAllSGPR32(MF);
    unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
    AllSGPRs = AllSGPRs.slice(
        std::min(static_cast<unsigned>(AllSGPRs.size()), NumPreloaded));
    Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
    for (MCPhysReg Reg : AllSGPRs) {
      if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) &&
          !TRI->isSubRegisterEq(ScratchRsrcReg, Reg) && GITPtrLoReg != Reg) {
        ScratchWaveOffsetReg = Reg;
        BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg)
            .addReg(PreloadedScratchWaveOffsetReg, RegState::Kill);
        break;
      }
    }
  } else {
    ScratchWaveOffsetReg = PreloadedScratchWaveOffsetReg;
  }
  assert(ScratchWaveOffsetReg || !PreloadedScratchWaveOffsetReg);

  if (requiresStackPointerReference(MF)) {
    Register SPReg = MFI->getStackPtrOffsetReg();
    assert(SPReg != AMDGPU::SP_REG);
    // SP starts at the frame size, scaled to hardware units.
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), SPReg)
        .addImm(FrameInfo.getStackSize() * getScratchScaleFactor(ST));
  }

  if (hasFP(MF)) {
    Register FPReg = MFI->getFrameOffsetReg();
    assert(FPReg != AMDGPU::FP_REG);
    // Entry functions have no caller frame: FP starts at zero.
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0);
  }

  bool NeedsFlatScratchInit =
      MFI->getUserSGPRInfo().hasFlatScratchInit() &&
      (MRI.isPhysRegUsed(AMDGPU::FLAT_SCR) || FrameInfo.hasCalls() ||
       (!allStackObjectsAreDead(FrameInfo) && ST.enableFlatScratch()));

  if ((NeedsFlatScratchInit || ScratchRsrcReg) &&
      PreloadedScratchWaveOffsetReg && !ST.flatScratchIsArchitected()) {
    MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
    MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
  }

  if (NeedsFlatScratchInit) {
    emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg);
  }

  if (ScratchRsrcReg) {
    emitEntryFunctionScratchRsrcRegSetup(MF, MBB, I, DL,
                                         PreloadedScratchRsrcReg,
                                         ScratchRsrcReg, ScratchWaveOffsetReg);
  }
}
721
722 // Emit scratch RSRC setup code, assuming `ScratchRsrcReg != AMDGPU::NoReg`
emitEntryFunctionScratchRsrcRegSetup(MachineFunction & MF,MachineBasicBlock & MBB,MachineBasicBlock::iterator I,const DebugLoc & DL,Register PreloadedScratchRsrcReg,Register ScratchRsrcReg,Register ScratchWaveOffsetReg) const723 void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
724 MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
725 const DebugLoc &DL, Register PreloadedScratchRsrcReg,
726 Register ScratchRsrcReg, Register ScratchWaveOffsetReg) const {
727
728 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
729 const SIInstrInfo *TII = ST.getInstrInfo();
730 const SIRegisterInfo *TRI = &TII->getRegisterInfo();
731 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
732 const Function &Fn = MF.getFunction();
733
734 if (ST.isAmdPalOS()) {
735 // The pointer to the GIT is formed from the offset passed in and either
736 // the amdgpu-git-ptr-high function attribute or the top part of the PC
737 Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
738 Register Rsrc03 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);
739
740 buildGitPtr(MBB, I, DL, TII, Rsrc01);
741
742 // We now have the GIT ptr - now get the scratch descriptor from the entry
743 // at offset 0 (or offset 16 for a compute shader).
744 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
745 const MCInstrDesc &LoadDwordX4 = TII->get(AMDGPU::S_LOAD_DWORDX4_IMM);
746 auto MMO = MF.getMachineMemOperand(PtrInfo,
747 MachineMemOperand::MOLoad |
748 MachineMemOperand::MOInvariant |
749 MachineMemOperand::MODereferenceable,
750 16, Align(4));
751 unsigned Offset = Fn.getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
752 const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
753 unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset);
754 BuildMI(MBB, I, DL, LoadDwordX4, ScratchRsrcReg)
755 .addReg(Rsrc01)
756 .addImm(EncodedOffset) // offset
757 .addImm(0) // cpol
758 .addReg(ScratchRsrcReg, RegState::ImplicitDefine)
759 .addMemOperand(MMO);
760
761 // The driver will always set the SRD for wave 64 (bits 118:117 of
762 // descriptor / bits 22:21 of third sub-reg will be 0b11)
763 // If the shader is actually wave32 we have to modify the const_index_stride
764 // field of the descriptor 3rd sub-reg (bits 22:21) to 0b10 (stride=32). The
765 // reason the driver does this is that there can be cases where it presents
766 // 2 shaders with different wave size (e.g. VsFs).
767 // TODO: convert to using SCRATCH instructions or multiple SRD buffers
768 if (ST.isWave32()) {
769 const MCInstrDesc &SBitsetB32 = TII->get(AMDGPU::S_BITSET0_B32);
770 BuildMI(MBB, I, DL, SBitsetB32, Rsrc03)
771 .addImm(21)
772 .addReg(Rsrc03);
773 }
774 } else if (ST.isMesaGfxShader(Fn) || !PreloadedScratchRsrcReg) {
775 assert(!ST.isAmdHsaOrMesa(Fn));
776 const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
777
778 Register Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
779 Register Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);
780
781 // Use relocations to get the pointer, and setup the other bits manually.
782 uint64_t Rsrc23 = TII->getScratchRsrcWords23();
783
784 if (MFI->getUserSGPRInfo().hasImplicitBufferPtr()) {
785 Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
786
787 if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
788 const MCInstrDesc &Mov64 = TII->get(AMDGPU::S_MOV_B64);
789
790 BuildMI(MBB, I, DL, Mov64, Rsrc01)
791 .addReg(MFI->getImplicitBufferPtrUserSGPR())
792 .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
793 } else {
794 const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);
795
796 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
797 auto MMO = MF.getMachineMemOperand(
798 PtrInfo,
799 MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant |
800 MachineMemOperand::MODereferenceable,
801 8, Align(4));
802 BuildMI(MBB, I, DL, LoadDwordX2, Rsrc01)
803 .addReg(MFI->getImplicitBufferPtrUserSGPR())
804 .addImm(0) // offset
805 .addImm(0) // cpol
806 .addMemOperand(MMO)
807 .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
808
809 MF.getRegInfo().addLiveIn(MFI->getImplicitBufferPtrUserSGPR());
810 MBB.addLiveIn(MFI->getImplicitBufferPtrUserSGPR());
811 }
812 } else {
813 Register Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
814 Register Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
815
816 BuildMI(MBB, I, DL, SMovB32, Rsrc0)
817 .addExternalSymbol("SCRATCH_RSRC_DWORD0")
818 .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
819
820 BuildMI(MBB, I, DL, SMovB32, Rsrc1)
821 .addExternalSymbol("SCRATCH_RSRC_DWORD1")
822 .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
823 }
824
825 BuildMI(MBB, I, DL, SMovB32, Rsrc2)
826 .addImm(Rsrc23 & 0xffffffff)
827 .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
828
829 BuildMI(MBB, I, DL, SMovB32, Rsrc3)
830 .addImm(Rsrc23 >> 32)
831 .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
832 } else if (ST.isAmdHsaOrMesa(Fn)) {
833 assert(PreloadedScratchRsrcReg);
834
835 if (ScratchRsrcReg != PreloadedScratchRsrcReg) {
836 BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
837 .addReg(PreloadedScratchRsrcReg, RegState::Kill);
838 }
839 }
840
841 // Add the scratch wave offset into the scratch RSRC.
842 //
843 // We only want to update the first 48 bits, which is the base address
844 // pointer, without touching the adjacent 16 bits of flags. We know this add
845 // cannot carry-out from bit 47, otherwise the scratch allocation would be
846 // impossible to fit in the 48-bit global address space.
847 //
848 // TODO: Evaluate if it is better to just construct an SRD using the flat
849 // scratch init and some constants rather than update the one we are passed.
850 Register ScratchRsrcSub0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
851 Register ScratchRsrcSub1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
852
853 // We cannot Kill ScratchWaveOffsetReg here because we allow it to be used in
854 // the kernel body via inreg arguments.
855 BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), ScratchRsrcSub0)
856 .addReg(ScratchRsrcSub0)
857 .addReg(ScratchWaveOffsetReg)
858 .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
859 auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), ScratchRsrcSub1)
860 .addReg(ScratchRsrcSub1)
861 .addImm(0)
862 .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
863 Addc->getOperand(3).setIsDead(); // Mark SCC as dead.
864 }
865
isSupportedStackID(TargetStackID::Value ID) const866 bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const {
867 switch (ID) {
868 case TargetStackID::Default:
869 case TargetStackID::NoAlloc:
870 case TargetStackID::SGPRSpill:
871 return true;
872 case TargetStackID::ScalableVector:
873 case TargetStackID::WasmLocal:
874 return false;
875 }
876 llvm_unreachable("Invalid TargetStackID::Value");
877 }
878
879 // Activate only the inactive lanes when \p EnableInactiveLanes is true.
880 // Otherwise, activate all lanes. It returns the saved exec.
buildScratchExecCopy(LiveRegUnits & LiveUnits,MachineFunction & MF,MachineBasicBlock & MBB,MachineBasicBlock::iterator MBBI,const DebugLoc & DL,bool IsProlog,bool EnableInactiveLanes)881 static Register buildScratchExecCopy(LiveRegUnits &LiveUnits,
882 MachineFunction &MF,
883 MachineBasicBlock &MBB,
884 MachineBasicBlock::iterator MBBI,
885 const DebugLoc &DL, bool IsProlog,
886 bool EnableInactiveLanes) {
887 Register ScratchExecCopy;
888 MachineRegisterInfo &MRI = MF.getRegInfo();
889 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
890 const SIInstrInfo *TII = ST.getInstrInfo();
891 const SIRegisterInfo &TRI = TII->getRegisterInfo();
892 SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
893
894 initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, IsProlog);
895
896 ScratchExecCopy = findScratchNonCalleeSaveRegister(
897 MRI, LiveUnits, *TRI.getWaveMaskRegClass());
898 if (!ScratchExecCopy)
899 report_fatal_error("failed to find free scratch register");
900
901 LiveUnits.addReg(ScratchExecCopy);
902
903 const unsigned SaveExecOpc =
904 ST.isWave32() ? (EnableInactiveLanes ? AMDGPU::S_XOR_SAVEEXEC_B32
905 : AMDGPU::S_OR_SAVEEXEC_B32)
906 : (EnableInactiveLanes ? AMDGPU::S_XOR_SAVEEXEC_B64
907 : AMDGPU::S_OR_SAVEEXEC_B64);
908 auto SaveExec =
909 BuildMI(MBB, MBBI, DL, TII->get(SaveExecOpc), ScratchExecCopy).addImm(-1);
910 SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead.
911
912 return ScratchExecCopy;
913 }
914
// Emit the frame-setup spills for a non-entry function: whole-wave-mode (WWM)
// VGPR spills first (under a manipulated EXEC mask), then the prolog/epilog
// SGPR saves (FP/BP/EXEC-copy and other reserved registers), all addressed
// relative to \p FrameReg.
void SIFrameLowering::emitCSRSpillStores(
    MachineFunction &MF, MachineBasicBlock &MBB,
    MachineBasicBlock::iterator MBBI, DebugLoc &DL, LiveRegUnits &LiveUnits,
    Register FrameReg, Register FramePtrRegScratchCopy) const {
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  // Spill Whole-Wave Mode VGPRs. Save only the inactive lanes of the scratch
  // registers. However, save all lanes of callee-saved VGPRs. Due to this, we
  // might end up flipping the EXEC bits twice.
  Register ScratchExecCopy;
  SmallVector<std::pair<Register, int>, 2> WWMCalleeSavedRegs, WWMScratchRegs;
  FuncInfo->splitWWMSpillRegisters(MF, WWMCalleeSavedRegs, WWMScratchRegs);
  if (!WWMScratchRegs.empty())
    ScratchExecCopy =
        buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
                             /*IsProlog*/ true, /*EnableInactiveLanes*/ true);

  // Store each (VGPR, frame index) pair relative to FrameReg.
  auto StoreWWMRegisters =
      [&](SmallVectorImpl<std::pair<Register, int>> &WWMRegs) {
        for (const auto &Reg : WWMRegs) {
          Register VGPR = Reg.first;
          int FI = Reg.second;
          buildPrologSpill(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MBBI, DL,
                           VGPR, FI, FrameReg);
        }
      };

  StoreWWMRegisters(WWMScratchRegs);
  if (!WWMCalleeSavedRegs.empty()) {
    if (ScratchExecCopy) {
      // EXEC was already saved above; just enable all lanes for the
      // callee-saved VGPR stores.
      unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
      BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addImm(-1);
    } else {
      ScratchExecCopy = buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
                                             /*IsProlog*/ true,
                                             /*EnableInactiveLanes*/ false);
    }
  }

  StoreWWMRegisters(WWMCalleeSavedRegs);
  if (ScratchExecCopy) {
    // FIXME: Split block and make terminator.
    // Restore the original EXEC mask and keep the scratch copy register
    // marked live so the rest of the prologue does not reuse it.
    unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
    BuildMI(MBB, MBBI, DL, TII->get(ExecMov), TRI.getExec())
        .addReg(ScratchExecCopy, RegState::Kill);
    LiveUnits.addReg(ScratchExecCopy);
  }

  Register FramePtrReg = FuncInfo->getFrameOffsetReg();

  for (const auto &Spill : FuncInfo->getPrologEpilogSGPRSpills()) {
    // Special handle FP spill:
    // Skip if FP is saved to a scratch SGPR, the save has already been emitted.
    // Otherwise, FP has been moved to a temporary register and spill it
    // instead.
    Register Reg =
        Spill.first == FramePtrReg ? FramePtrRegScratchCopy : Spill.first;
    if (!Reg)
      continue;

    PrologEpilogSGPRSpillBuilder SB(Reg, Spill.second, MBB, MBBI, DL, TII, TRI,
                                    LiveUnits, FrameReg);
    SB.save();
  }

  // If a copy to scratch SGPR has been chosen for any of the SGPR spills, make
  // such scratch registers live throughout the function.
  SmallVector<Register, 1> ScratchSGPRs;
  FuncInfo->getAllScratchSGPRCopyDstRegs(ScratchSGPRs);
  if (!ScratchSGPRs.empty()) {
    for (MachineBasicBlock &MBB : MF) {
      for (MCPhysReg Reg : ScratchSGPRs)
        MBB.addLiveIn(Reg);

      MBB.sortUniqueLiveIns();
    }
    if (!LiveUnits.empty()) {
      for (MCPhysReg Reg : ScratchSGPRs)
        LiveUnits.addReg(Reg);
    }
  }
}
1000
// Emit the frame-destroy restores, mirroring emitCSRSpillStores: first the
// prolog/epilog SGPR reloads, then the whole-wave-mode (WWM) VGPR reloads
// under a manipulated EXEC mask, all addressed relative to \p FrameReg.
void SIFrameLowering::emitCSRSpillRestores(
    MachineFunction &MF, MachineBasicBlock &MBB,
    MachineBasicBlock::iterator MBBI, DebugLoc &DL, LiveRegUnits &LiveUnits,
    Register FrameReg, Register FramePtrRegScratchCopy) const {
  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  Register FramePtrReg = FuncInfo->getFrameOffsetReg();

  for (const auto &Spill : FuncInfo->getPrologEpilogSGPRSpills()) {
    // Special handle FP restore:
    // Skip if FP needs to be restored from the scratch SGPR. Otherwise, restore
    // the FP value to a temporary register. The frame pointer should be
    // overwritten only at the end when all other spills are restored from
    // current frame.
    Register Reg =
        Spill.first == FramePtrReg ? FramePtrRegScratchCopy : Spill.first;
    if (!Reg)
      continue;

    PrologEpilogSGPRSpillBuilder SB(Reg, Spill.second, MBB, MBBI, DL, TII, TRI,
                                    LiveUnits, FrameReg);
    SB.restore();
  }

  // Restore Whole-Wave Mode VGPRs. Restore only the inactive lanes of the
  // scratch registers. However, restore all lanes of callee-saved VGPRs. Due to
  // this, we might end up flipping the EXEC bits twice.
  Register ScratchExecCopy;
  SmallVector<std::pair<Register, int>, 2> WWMCalleeSavedRegs, WWMScratchRegs;
  FuncInfo->splitWWMSpillRegisters(MF, WWMCalleeSavedRegs, WWMScratchRegs);
  if (!WWMScratchRegs.empty())
    ScratchExecCopy =
        buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
                             /*IsProlog*/ false, /*EnableInactiveLanes*/ true);

  // Reload each (VGPR, frame index) pair relative to FrameReg.
  auto RestoreWWMRegisters =
      [&](SmallVectorImpl<std::pair<Register, int>> &WWMRegs) {
        for (const auto &Reg : WWMRegs) {
          Register VGPR = Reg.first;
          int FI = Reg.second;
          buildEpilogRestore(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MBBI, DL,
                             VGPR, FI, FrameReg);
        }
      };

  RestoreWWMRegisters(WWMScratchRegs);
  if (!WWMCalleeSavedRegs.empty()) {
    if (ScratchExecCopy) {
      // EXEC was already saved above; just enable all lanes for the
      // callee-saved VGPR reloads.
      unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
      BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addImm(-1);
    } else {
      ScratchExecCopy = buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
                                             /*IsProlog*/ false,
                                             /*EnableInactiveLanes*/ false);
    }
  }

  RestoreWWMRegisters(WWMCalleeSavedRegs);
  if (ScratchExecCopy) {
    // FIXME: Split block and make terminator.
    // Restore the original EXEC mask.
    unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
    BuildMI(MBB, MBBI, DL, TII->get(ExecMov), TRI.getExec())
        .addReg(ScratchExecCopy, RegState::Kill);
  }
}
1068
// Emit the prologue for a non-entry function: optionally set up SP for chain
// functions, save FP (to a scratch SGPR or via a temporary copy), realign the
// stack if required, emit the CSR/WWM spills, set up BP, and finally bump SP
// past the frame.
void SIFrameLowering::emitPrologue(MachineFunction &MF,
                                   MachineBasicBlock &MBB) const {
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  // Entry functions (kernels) take a completely separate prologue path.
  if (FuncInfo->isEntryFunction()) {
    emitEntryFunctionPrologue(MF, MBB);
    return;
  }

  MachineFrameInfo &MFI = MF.getFrameInfo();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();

  Register StackPtrReg = FuncInfo->getStackPtrOffsetReg();
  Register FramePtrReg = FuncInfo->getFrameOffsetReg();
  Register BasePtrReg =
      TRI.hasBasePointer(MF) ? TRI.getBaseRegister() : Register();
  LiveRegUnits LiveUnits;

  MachineBasicBlock::iterator MBBI = MBB.begin();
  // DebugLoc must be unknown since the first instruction with DebugLoc is used
  // to determine the end of the prologue.
  DebugLoc DL;

  if (FuncInfo->isChainFunction()) {
    // Functions with the amdgpu_cs_chain[_preserve] CC don't receive a SP, but
    // are free to set one up if they need it.
    bool UseSP = requiresStackPointerReference(MF);
    if (UseSP) {
      assert(StackPtrReg != AMDGPU::SP_REG);

      BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_MOV_B32), StackPtrReg)
          .addImm(MFI.getStackSize() * getScratchScaleFactor(ST));
    }
  }

  bool HasFP = false;
  bool HasBP = false;
  uint32_t NumBytes = MFI.getStackSize();
  uint32_t RoundedSize = NumBytes;

  // Stack realignment always forces a frame pointer.
  if (TRI.hasStackRealignment(MF))
    HasFP = true;

  Register FramePtrRegScratchCopy;
  if (!HasFP && !hasFP(MF)) {
    // Emit the CSR spill stores with SP base register.
    emitCSRSpillStores(MF, MBB, MBBI, DL, LiveUnits,
                       FuncInfo->isChainFunction() ? Register() : StackPtrReg,
                       FramePtrRegScratchCopy);
  } else {
    // CSR spill stores will use FP as base register.
    Register SGPRForFPSaveRestoreCopy =
        FuncInfo->getScratchSGPRCopyDstReg(FramePtrReg);

    initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ true);
    if (SGPRForFPSaveRestoreCopy) {
      // Copy FP to the scratch register now and emit the CFI entry. It avoids
      // the extra FP copy needed in the other two cases when FP is spilled to
      // memory or to a VGPR lane.
      PrologEpilogSGPRSpillBuilder SB(
          FramePtrReg,
          FuncInfo->getPrologEpilogSGPRSaveRestoreInfo(FramePtrReg), MBB, MBBI,
          DL, TII, TRI, LiveUnits, FramePtrReg);
      SB.save();
      LiveUnits.addReg(SGPRForFPSaveRestoreCopy);
    } else {
      // Copy FP into a new scratch register so that its previous value can be
      // spilled after setting up the new frame.
      FramePtrRegScratchCopy = findScratchNonCalleeSaveRegister(
          MRI, LiveUnits, AMDGPU::SReg_32_XM0_XEXECRegClass);
      if (!FramePtrRegScratchCopy)
        report_fatal_error("failed to find free scratch register");

      LiveUnits.addReg(FramePtrRegScratchCopy);
      BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrRegScratchCopy)
          .addReg(FramePtrReg);
    }
  }

  if (HasFP) {
    const unsigned Alignment = MFI.getMaxAlign().value();

    // Over-allocate so FP can be aligned down below without running out of
    // frame space.
    RoundedSize += Alignment;
    if (LiveUnits.empty()) {
      LiveUnits.init(TRI);
      LiveUnits.addLiveIns(MBB);
    }

    // s_add_i32 s33, s32, NumBytes
    // s_and_b32 s33, s33, 0b111...0000
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), FramePtrReg)
        .addReg(StackPtrReg)
        .addImm((Alignment - 1) * getScratchScaleFactor(ST))
        .setMIFlag(MachineInstr::FrameSetup);
    auto And = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_AND_B32), FramePtrReg)
        .addReg(FramePtrReg, RegState::Kill)
        .addImm(-Alignment * getScratchScaleFactor(ST))
        .setMIFlag(MachineInstr::FrameSetup);
    And->getOperand(3).setIsDead(); // Mark SCC as dead.
    FuncInfo->setIsStackRealigned(true);
  } else if ((HasFP = hasFP(MF))) {
    // No realignment needed: FP is just a copy of the incoming SP.
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
        .addReg(StackPtrReg)
        .setMIFlag(MachineInstr::FrameSetup);
  }

  // If FP is used, emit the CSR spills with FP base register.
  if (HasFP) {
    emitCSRSpillStores(MF, MBB, MBBI, DL, LiveUnits, FramePtrReg,
                       FramePtrRegScratchCopy);
    if (FramePtrRegScratchCopy)
      LiveUnits.removeReg(FramePtrRegScratchCopy);
  }

  // If we need a base pointer, set it up here. It's whatever the value of
  // the stack pointer is at this point. Any variable size objects will be
  // allocated after this, so we can still use the base pointer to reference
  // the incoming arguments.
  if ((HasBP = TRI.hasBasePointer(MF))) {
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), BasePtrReg)
        .addReg(StackPtrReg)
        .setMIFlag(MachineInstr::FrameSetup);
  }

  // Bump SP past this function's (possibly realignment-padded) frame.
  if (HasFP && RoundedSize != 0) {
    auto Add = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), StackPtrReg)
        .addReg(StackPtrReg)
        .addImm(RoundedSize * getScratchScaleFactor(ST))
        .setMIFlag(MachineInstr::FrameSetup);
    Add->getOperand(3).setIsDead(); // Mark SCC as dead.
  }

  bool FPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(FramePtrReg);
  (void)FPSaved;
  assert((!HasFP || FPSaved) &&
         "Needed to save FP but didn't save it anywhere");

  // If we allow spilling to AGPRs we may have saved FP but then spill
  // everything into AGPRs instead of the stack.
  assert((HasFP || !FPSaved || EnableSpillVGPRToAGPR) &&
         "Saved FP but didn't need it");

  bool BPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(BasePtrReg);
  (void)BPSaved;
  assert((!HasBP || BPSaved) &&
         "Needed to save BP but didn't save it anywhere");

  assert((HasBP || !BPSaved) && "Saved BP but didn't need it");
}
1220
// Emit the epilogue for a non-entry function: restore the CSR/WWM spills,
// unwind SP, and restore the previous FP value (from a scratch SGPR or a
// temporary copy). Entry functions need no epilogue.
void SIFrameLowering::emitEpilogue(MachineFunction &MF,
                                   MachineBasicBlock &MBB) const {
  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  if (FuncInfo->isEntryFunction())
    return;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  LiveRegUnits LiveUnits;
  // Get the insert location for the epilogue. If there were no terminators in
  // the block, get the last instruction.
  MachineBasicBlock::iterator MBBI = MBB.end();
  DebugLoc DL;
  if (!MBB.empty()) {
    // Take the debug location from the last non-debug instruction, but insert
    // before the first terminator.
    MBBI = MBB.getLastNonDebugInstr();
    if (MBBI != MBB.end())
      DL = MBBI->getDebugLoc();

    MBBI = MBB.getFirstTerminator();
  }

  const MachineFrameInfo &MFI = MF.getFrameInfo();
  uint32_t NumBytes = MFI.getStackSize();
  // Account for the extra padding the prologue added for stack realignment.
  uint32_t RoundedSize = FuncInfo->isStackRealigned()
                             ? NumBytes + MFI.getMaxAlign().value()
                             : NumBytes;
  const Register StackPtrReg = FuncInfo->getStackPtrOffsetReg();
  Register FramePtrReg = FuncInfo->getFrameOffsetReg();
  bool FPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(FramePtrReg);

  Register FramePtrRegScratchCopy;
  Register SGPRForFPSaveRestoreCopy =
      FuncInfo->getScratchSGPRCopyDstReg(FramePtrReg);
  if (FPSaved) {
    // CSR spill restores should use FP as base register. If
    // SGPRForFPSaveRestoreCopy is not true, restore the previous value of FP
    // into a new scratch register and copy to FP later when other registers are
    // restored from the current stack frame.
    initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ false);
    if (SGPRForFPSaveRestoreCopy) {
      LiveUnits.addReg(SGPRForFPSaveRestoreCopy);
    } else {
      FramePtrRegScratchCopy = findScratchNonCalleeSaveRegister(
          MRI, LiveUnits, AMDGPU::SReg_32_XM0_XEXECRegClass);
      if (!FramePtrRegScratchCopy)
        report_fatal_error("failed to find free scratch register");

      LiveUnits.addReg(FramePtrRegScratchCopy);
    }

    emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveUnits, FramePtrReg,
                         FramePtrRegScratchCopy);
  }

  // Unwind SP by the amount the prologue advanced it.
  if (RoundedSize != 0 && hasFP(MF)) {
    auto Add = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), StackPtrReg)
        .addReg(StackPtrReg)
        .addImm(-static_cast<int64_t>(RoundedSize * getScratchScaleFactor(ST)))
        .setMIFlag(MachineInstr::FrameDestroy);
    Add->getOperand(3).setIsDead(); // Mark SCC as dead.
  }

  if (FPSaved) {
    // Insert the copy to restore FP.
    Register SrcReg = SGPRForFPSaveRestoreCopy ? SGPRForFPSaveRestoreCopy
                                               : FramePtrRegScratchCopy;
    MachineInstrBuilder MIB =
        BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
            .addReg(SrcReg);
    if (SGPRForFPSaveRestoreCopy)
      MIB.setMIFlag(MachineInstr::FrameDestroy);
  } else {
    // Insert the CSR spill restores with SP as the base register.
    emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveUnits, StackPtrReg,
                         FramePtrRegScratchCopy);
  }
}
1300
1301 #ifndef NDEBUG
allSGPRSpillsAreDead(const MachineFunction & MF)1302 static bool allSGPRSpillsAreDead(const MachineFunction &MF) {
1303 const MachineFrameInfo &MFI = MF.getFrameInfo();
1304 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1305 for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
1306 I != E; ++I) {
1307 if (!MFI.isDeadObjectIndex(I) &&
1308 MFI.getStackID(I) == TargetStackID::SGPRSpill &&
1309 !FuncInfo->checkIndexInPrologEpilogSGPRSpills(I)) {
1310 return false;
1311 }
1312 }
1313
1314 return true;
1315 }
1316 #endif
1317
getFrameIndexReference(const MachineFunction & MF,int FI,Register & FrameReg) const1318 StackOffset SIFrameLowering::getFrameIndexReference(const MachineFunction &MF,
1319 int FI,
1320 Register &FrameReg) const {
1321 const SIRegisterInfo *RI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
1322
1323 FrameReg = RI->getFrameRegister(MF);
1324 return StackOffset::getFixed(MF.getFrameInfo().getObjectOffset(FI));
1325 }
1326
// Late frame adjustments before offsets are finalized: allocate WWM spill
// slots, try to redirect VGPR spills into AGPRs (gfx908+), drop dead spill
// frame indices, and reserve emergency scavenging slots if stack objects
// remain.
void SIFrameLowering::processFunctionBeforeFrameFinalized(
    MachineFunction &MF,
    RegScavenger *RS) const {
  MachineFrameInfo &MFI = MF.getFrameInfo();

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();

  // Allocate spill slots for WWM reserved VGPRs.
  // For chain functions, we only need to do this if we have calls to
  // llvm.amdgcn.cs.chain.
  bool IsChainWithoutCalls =
      FuncInfo->isChainFunction() && !MF.getFrameInfo().hasTailCall();
  if (!FuncInfo->isEntryFunction() && !IsChainWithoutCalls) {
    for (Register Reg : FuncInfo->getWWMReservedRegs()) {
      const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg);
      FuncInfo->allocateWWMSpill(MF, Reg, TRI->getSpillSize(*RC),
                                 TRI->getSpillAlign(*RC));
    }
  }

  const bool SpillVGPRToAGPR = ST.hasMAIInsts() && FuncInfo->hasSpilledVGPRs()
    && EnableSpillVGPRToAGPR;

  if (SpillVGPRToAGPR) {
    // To track the spill frame indices handled in this pass.
    BitVector SpillFIs(MFI.getObjectIndexEnd(), false);
    BitVector NonVGPRSpillFIs(MFI.getObjectIndexEnd(), false);

    bool SeenDbgInstr = false;

    for (MachineBasicBlock &MBB : MF) {
      for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) {
        int FrameIndex;
        if (MI.isDebugInstr())
          SeenDbgInstr = true;

        if (TII->isVGPRSpill(MI)) {
          // Try to eliminate stack used by VGPR spills before frame
          // finalization.
          unsigned FIOp = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                                     AMDGPU::OpName::vaddr);
          int FI = MI.getOperand(FIOp).getIndex();
          Register VReg =
            TII->getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
          if (FuncInfo->allocateVGPRSpillToAGPR(MF, FI,
                                                TRI->isAGPR(MRI, VReg))) {
            assert(RS != nullptr);
            // Scavenger state must reflect liveness just after MI so the
            // frame index can be rewritten in place.
            RS->enterBasicBlockEnd(MBB);
            RS->backward(std::next(MI.getIterator()));
            TRI->eliminateFrameIndex(MI, 0, FIOp, RS);
            SpillFIs.set(FI);
            continue;
          }
        } else if (TII->isStoreToStackSlot(MI, FrameIndex) ||
                   TII->isLoadFromStackSlot(MI, FrameIndex))
          if (!MFI.isFixedObjectIndex(FrameIndex))
            NonVGPRSpillFIs.set(FrameIndex);
      }
    }

    // Stack slot coloring may assign different objects to the same stack slot.
    // If not, then the VGPR to AGPR spill slot is dead.
    for (unsigned FI : SpillFIs.set_bits())
      if (!NonVGPRSpillFIs.test(FI))
        FuncInfo->setVGPRToAGPRSpillDead(FI);

    for (MachineBasicBlock &MBB : MF) {
      // The AGPRs/VGPRs used for spilling must be live-in everywhere.
      for (MCPhysReg Reg : FuncInfo->getVGPRSpillAGPRs())
        MBB.addLiveIn(Reg);

      for (MCPhysReg Reg : FuncInfo->getAGPRSpillVGPRs())
        MBB.addLiveIn(Reg);

      MBB.sortUniqueLiveIns();

      if (!SpillFIs.empty() && SeenDbgInstr) {
        // FIXME: The dead frame indices are replaced with a null register from
        // the debug value instructions. We should instead, update it with the
        // correct register value. But not sure the register value alone is
        // sufficient to lower the debug expression — needs investigation.
        for (MachineInstr &MI : MBB) {
          if (MI.isDebugValue() && MI.getOperand(0).isFI() &&
              !MFI.isFixedObjectIndex(MI.getOperand(0).getIndex()) &&
              SpillFIs[MI.getOperand(0).getIndex()]) {
            MI.getOperand(0).ChangeToRegister(Register(), false /*isDef*/);
          }
        }
      }
    }
  }

  // At this point we've already allocated all spilled SGPRs to VGPRs if we
  // can. Any remaining SGPR spills will go to memory, so move them back to the
  // default stack.
  bool HaveSGPRToVMemSpill =
      FuncInfo->removeDeadFrameIndices(MFI, /*ResetSGPRSpillStackIDs*/ true);
  assert(allSGPRSpillsAreDead(MF) &&
         "SGPR spill should have been removed in SILowerSGPRSpills");

  // FIXME: The other checks should be redundant with allStackObjectsAreDead,
  // but currently hasNonSpillStackObjects is set only from source
  // allocas. Stack temps produced from legalization are not counted currently.
  if (!allStackObjectsAreDead(MFI)) {
    assert(RS && "RegScavenger required if spilling");

    // Add an emergency spill slot
    RS->addScavengingFrameIndex(FuncInfo->getScavengeFI(MFI, *TRI));

    // If we are spilling SGPRs to memory with a large frame, we may need a
    // second VGPR emergency frame index.
    if (HaveSGPRToVMemSpill &&
        allocateScavengingFrameIndexesNearIncomingSP(MF)) {
      RS->addScavengingFrameIndex(MFI.CreateStackObject(4, Align(4), false));
    }
  }
}
1446
// Post-RA cleanup: try to move two specially reserved registers (the gfx908
// VGPR used for AGPR copies, and the SGPR pair reserved for long branches)
// down to lower-numbered registers that turned out to be unused after
// register allocation, reducing register pressure at the top of the file.
void SIFrameLowering::processFunctionBeforeFrameIndicesReplaced(
    MachineFunction &MF, RegScavenger *RS) const {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();

  if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) {
    // On gfx908, we had initially reserved highest available VGPR for AGPR
    // copy. Now since we are done with RA, check if there exist an unused VGPR
    // which is lower than the earlier reserved VGPR before RA. If one exist,
    // use it for AGPR copy instead of one reserved before RA.
    Register VGPRForAGPRCopy = FuncInfo->getVGPRForAGPRCopy();
    Register UnusedLowVGPR =
        TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF);
    if (UnusedLowVGPR && (TRI->getHWRegIndex(UnusedLowVGPR) <
                          TRI->getHWRegIndex(VGPRForAGPRCopy))) {
      // Reserve this newly identified VGPR (for AGPR copy)
      // reserved registers should already be frozen at this point
      // so we can avoid calling MRI.freezeReservedRegs and just use
      // MRI.reserveReg
      FuncInfo->setVGPRForAGPRCopy(UnusedLowVGPR);
      MRI.reserveReg(UnusedLowVGPR, TRI);
    }
  }
  // We initially reserved the highest available SGPR pair for long branches
  // now, after RA, we shift down to a lower unused one if one exists
  Register LongBranchReservedReg = FuncInfo->getLongBranchReservedReg();
  Register UnusedLowSGPR =
      TRI->findUnusedRegister(MRI, &AMDGPU::SGPR_64RegClass, MF);
  // If LongBranchReservedReg is null then we didn't find a long branch
  // and never reserved a register to begin with so there is nothing to
  // shift down. Then if UnusedLowSGPR is null, there isn't available lower
  // register to use so just keep the original one we set.
  if (LongBranchReservedReg && UnusedLowSGPR) {
    FuncInfo->setLongBranchReservedReg(UnusedLowSGPR);
    MRI.reserveReg(UnusedLowSGPR, TRI);
  }
}
1486
// The special SGPR spills — the ones needed for FP, BP, the EXEC copy, or any
// other reserved registers — are delayed until frame lowering; decide here
// how each of them will be saved (scratch SGPR, VGPR lane, or memory).
void SIFrameLowering::determinePrologEpilogSGPRSaves(
    MachineFunction &MF, BitVector &SavedVGPRs,
    bool NeedExecCopyReservedReg) const {
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  LiveRegUnits LiveUnits;
  LiveUnits.init(*TRI);
  // Initially mark callee saved registers as used so we will not choose them
  // while looking for scratch SGPRs.
  const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs();
  for (unsigned I = 0; CSRegs[I]; ++I)
    LiveUnits.addReg(CSRegs[I]);

  const TargetRegisterClass &RC = *TRI->getWaveMaskRegClass();

  if (NeedExecCopyReservedReg) {
    Register ReservedReg = MFI->getSGPRForEXECCopy();
    assert(ReservedReg && "Should have reserved an SGPR for EXEC copy.");
    Register UnusedScratchReg = findUnusedRegister(MRI, LiveUnits, RC);
    if (UnusedScratchReg) {
      // If found any unused scratch SGPR, reserve the register itself for Exec
      // copy and there is no need for any spills in that case.
      MFI->setSGPRForEXECCopy(UnusedScratchReg);
      LiveUnits.addReg(UnusedScratchReg);
    } else {
      // Needs spill.
      assert(!MFI->hasPrologEpilogSGPRSpillEntry(ReservedReg) &&
             "Re-reserving spill slot for EXEC copy register");
      getVGPRSpillLaneOrTempRegister(MF, LiveUnits, ReservedReg, RC,
                                     /*IncludeScratchCopy=*/false);
    }
  }

  // hasFP only knows about stack objects that already exist. We're now
  // determining the stack slots that will be created, so we have to predict
  // them. Stack objects force FP usage with calls.
  //
  // Note a new VGPR CSR may be introduced if one is used for the spill, but we
  // don't want to report it here.
  //
  // FIXME: Is this really hasReservedCallFrame?
  const bool WillHaveFP =
      FrameInfo.hasCalls() &&
      (SavedVGPRs.any() || !allStackObjectsAreDead(FrameInfo));

  if (WillHaveFP || hasFP(MF)) {
    Register FramePtrReg = MFI->getFrameOffsetReg();
    assert(!MFI->hasPrologEpilogSGPRSpillEntry(FramePtrReg) &&
           "Re-reserving spill slot for FP");
    getVGPRSpillLaneOrTempRegister(MF, LiveUnits, FramePtrReg);
  }

  if (TRI->hasBasePointer(MF)) {
    Register BasePtrReg = TRI->getBaseRegister();
    assert(!MFI->hasPrologEpilogSGPRSpillEntry(BasePtrReg) &&
           "Re-reserving spill slot for BP");
    getVGPRSpillLaneOrTempRegister(MF, LiveUnits, BasePtrReg);
  }
}
1551
// Only report VGPRs to generic code.
//
// Scans the function for SGPR-to-VGPR spill pseudos (allocating WWM spill
// slots for their lane VGPRs), detects whether an EXEC-copy SGPR will be
// needed, and then defers the special SGPR saves (FP/BP/EXEC copy) to
// determinePrologEpilogSGPRSaves.
void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
                                           BitVector &SavedVGPRs,
                                           RegScavenger *RS) const {
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // If this is a function with the amdgpu_cs_chain[_preserve] calling
  // convention and it doesn't contain any calls to llvm.amdgcn.cs.chain, then
  // we don't need to save and restore anything.
  if (MFI->isChainFunction() && !MF.getFrameInfo().hasTailCall())
    return;

  MFI->shiftSpillPhysVGPRsToLowestRange(MF);

  TargetFrameLowering::determineCalleeSaves(MF, SavedVGPRs, RS);
  // Entry functions have no callee saves to handle.
  if (MFI->isEntryFunction())
    return;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();
  bool NeedExecCopyReservedReg = false;

  MachineInstr *ReturnMI = nullptr;
  for (MachineBasicBlock &MBB : MF) {
    for (MachineInstr &MI : MBB) {
      // WRITELANE instructions used for SGPR spills can overwrite the inactive
      // lanes of VGPRs and callee must spill and restore them even if they are
      // marked Caller-saved.

      // TODO: Handle this elsewhere at an early point. Walking through all MBBs
      // here would be a bad heuristic. A better way should be by calling
      // allocateWWMSpill during the regalloc pipeline whenever a physical
      // register is allocated for the intended virtual registers.
      if (MI.getOpcode() == AMDGPU::SI_SPILL_S32_TO_VGPR)
        MFI->allocateWWMSpill(MF, MI.getOperand(0).getReg());
      else if (MI.getOpcode() == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
        MFI->allocateWWMSpill(MF, MI.getOperand(1).getReg());
      else if (TII->isWWMRegSpillOpcode(MI.getOpcode()))
        NeedExecCopyReservedReg = true;
      else if (MI.getOpcode() == AMDGPU::SI_RETURN ||
               MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
               (MFI->isChainFunction() &&
                TII->isChainCallOpcode(MI.getOpcode()))) {
        // We expect all return to be the same size.
        assert(!ReturnMI ||
               (count_if(MI.operands(), [](auto Op) { return Op.isReg(); }) ==
                count_if(ReturnMI->operands(), [](auto Op) { return Op.isReg(); })));
        ReturnMI = &MI;
      }
    }
  }

  // Remove any VGPRs used in the return value because these do not need to be
  // saved. This prevents CSR restore from clobbering return VGPRs.
  if (ReturnMI) {
    for (auto &Op : ReturnMI->operands()) {
      if (Op.isReg())
        SavedVGPRs.reset(Op.getReg());
    }
  }

  // Ignore the SGPRs the default implementation found.
  SavedVGPRs.clearBitsNotInMask(TRI->getAllVectorRegMask());

  // Do not save AGPRs prior to GFX90A because there was no easy way to do so.
  // In gfx908 there are no AGPR loads and stores, so spilling them also
  // requires a temporary VGPR.
  if (!ST.hasGFX90AInsts())
    SavedVGPRs.clearBitsInMask(TRI->getAllAGPRRegMask());

  determinePrologEpilogSGPRSaves(MF, SavedVGPRs, NeedExecCopyReservedReg);

  // The Whole-Wave VGPRs need to be specially inserted in the prolog, so don't
  // allow the default insertion to handle them.
  for (auto &Reg : MFI->getWWMSpills())
    SavedVGPRs.reset(Reg.first);

  // Mark all lane VGPRs as BB LiveIns.
  for (MachineBasicBlock &MBB : MF) {
    for (auto &Reg : MFI->getWWMSpills())
      MBB.addLiveIn(Reg.first);

    MBB.sortUniqueLiveIns();
  }
}
1638
// Determine the callee-saved SGPRs (the VGPR counterpart is handled in
// determineCalleeSaves). SP and FP are specially managed and excluded; the
// return address is force-saved when it may be clobbered.
void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF,
                                               BitVector &SavedRegs,
                                               RegScavenger *RS) const {
  TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  // Entry functions have no callee saves to handle.
  if (MFI->isEntryFunction())
    return;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  // The SP is specifically managed and we don't want extra spills of it.
  SavedRegs.reset(MFI->getStackPtrOffsetReg());

  // Snapshot taken before the vector registers are masked out, so the FP
  // prediction below can still see whether any CSR (of any class) was used.
  const BitVector AllSavedRegs = SavedRegs;
  SavedRegs.clearBitsInMask(TRI->getAllVectorRegMask());

  // We have to anticipate introducing CSR VGPR spills or spill of caller
  // save VGPR reserved for SGPR spills as we now always create stack entry
  // for it, if we don't have any stack objects already, since we require a FP
  // if there is a call and stack. We will allocate a VGPR for SGPR spills if
  // there are any SGPR spills. Whether they are CSR spills or otherwise.
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  const bool WillHaveFP =
      FrameInfo.hasCalls() && (AllSavedRegs.any() || MFI->hasSpilledSGPRs());

  // FP will be specially managed like SP.
  if (WillHaveFP || hasFP(MF))
    SavedRegs.reset(MFI->getFrameOffsetReg());

  // Return address use with return instruction is hidden through the SI_RETURN
  // pseudo. Given that and since the IPRA computes actual register usage and
  // does not use CSR list, the clobbering of return address by function calls
  // (D117243) or otherwise (D120922) is ignored/not seen by the IPRA's register
  // usage collection. This will ensure save/restore of return address happens
  // in those scenarios.
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  Register RetAddrReg = TRI->getReturnAddressReg(MF);
  if (!MFI->isEntryFunction() &&
      (FrameInfo.hasCalls() || MRI.isPhysRegModified(RetAddrReg))) {
    // The return address is a 64-bit pair; save both 32-bit halves.
    SavedRegs.set(TRI->getSubReg(RetAddrReg, AMDGPU::sub0));
    SavedRegs.set(TRI->getSubReg(RetAddrReg, AMDGPU::sub1));
  }
}
1683
assignCalleeSavedSpillSlots(MachineFunction & MF,const TargetRegisterInfo * TRI,std::vector<CalleeSavedInfo> & CSI) const1684 bool SIFrameLowering::assignCalleeSavedSpillSlots(
1685 MachineFunction &MF, const TargetRegisterInfo *TRI,
1686 std::vector<CalleeSavedInfo> &CSI) const {
1687 if (CSI.empty())
1688 return true; // Early exit if no callee saved registers are modified!
1689
1690 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
1691 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1692 const SIRegisterInfo *RI = ST.getRegisterInfo();
1693 Register FramePtrReg = FuncInfo->getFrameOffsetReg();
1694 Register BasePtrReg = RI->getBaseRegister();
1695 Register SGPRForFPSaveRestoreCopy =
1696 FuncInfo->getScratchSGPRCopyDstReg(FramePtrReg);
1697 Register SGPRForBPSaveRestoreCopy =
1698 FuncInfo->getScratchSGPRCopyDstReg(BasePtrReg);
1699 if (!SGPRForFPSaveRestoreCopy && !SGPRForBPSaveRestoreCopy)
1700 return false;
1701
1702 unsigned NumModifiedRegs = 0;
1703
1704 if (SGPRForFPSaveRestoreCopy)
1705 NumModifiedRegs++;
1706 if (SGPRForBPSaveRestoreCopy)
1707 NumModifiedRegs++;
1708
1709 for (auto &CS : CSI) {
1710 if (CS.getReg() == FramePtrReg && SGPRForFPSaveRestoreCopy) {
1711 CS.setDstReg(SGPRForFPSaveRestoreCopy);
1712 if (--NumModifiedRegs)
1713 break;
1714 } else if (CS.getReg() == BasePtrReg && SGPRForBPSaveRestoreCopy) {
1715 CS.setDstReg(SGPRForBPSaveRestoreCopy);
1716 if (--NumModifiedRegs)
1717 break;
1718 }
1719 }
1720
1721 return false;
1722 }
1723
allocateScavengingFrameIndexesNearIncomingSP(const MachineFunction & MF) const1724 bool SIFrameLowering::allocateScavengingFrameIndexesNearIncomingSP(
1725 const MachineFunction &MF) const {
1726
1727 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1728 const MachineFrameInfo &MFI = MF.getFrameInfo();
1729 const SIInstrInfo *TII = ST.getInstrInfo();
1730 uint64_t EstStackSize = MFI.estimateStackSize(MF);
1731 uint64_t MaxOffset = EstStackSize - 1;
1732
1733 // We need the emergency stack slots to be allocated in range of the
1734 // MUBUF/flat scratch immediate offset from the base register, so assign these
1735 // first at the incoming SP position.
1736 //
1737 // TODO: We could try sorting the objects to find a hole in the first bytes
1738 // rather than allocating as close to possible. This could save a lot of space
1739 // on frames with alignment requirements.
1740 if (ST.enableFlatScratch()) {
1741 if (TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS,
1742 SIInstrFlags::FlatScratch))
1743 return false;
1744 } else {
1745 if (TII->isLegalMUBUFImmOffset(MaxOffset))
1746 return false;
1747 }
1748
1749 return true;
1750 }
1751
eliminateCallFramePseudoInstr(MachineFunction & MF,MachineBasicBlock & MBB,MachineBasicBlock::iterator I) const1752 MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
1753 MachineFunction &MF,
1754 MachineBasicBlock &MBB,
1755 MachineBasicBlock::iterator I) const {
1756 int64_t Amount = I->getOperand(0).getImm();
1757 if (Amount == 0)
1758 return MBB.erase(I);
1759
1760 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
1761 const SIInstrInfo *TII = ST.getInstrInfo();
1762 const DebugLoc &DL = I->getDebugLoc();
1763 unsigned Opc = I->getOpcode();
1764 bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
1765 uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;
1766
1767 if (!hasReservedCallFrame(MF)) {
1768 Amount = alignTo(Amount, getStackAlign());
1769 assert(isUInt<32>(Amount) && "exceeded stack address space size");
1770 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1771 Register SPReg = MFI->getStackPtrOffsetReg();
1772
1773 Amount *= getScratchScaleFactor(ST);
1774 if (IsDestroy)
1775 Amount = -Amount;
1776 auto Add = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SPReg)
1777 .addReg(SPReg)
1778 .addImm(Amount);
1779 Add->getOperand(3).setIsDead(); // Mark SCC as dead.
1780 } else if (CalleePopAmount != 0) {
1781 llvm_unreachable("is this used?");
1782 }
1783
1784 return MBB.erase(I);
1785 }
1786
1787 /// Returns true if the frame will require a reference to the stack pointer.
1788 ///
1789 /// This is the set of conditions common to setting up the stack pointer in a
1790 /// kernel, and for using a frame pointer in a callable function.
1791 ///
1792 /// FIXME: Should also check hasOpaqueSPAdjustment and if any inline asm
1793 /// references SP.
frameTriviallyRequiresSP(const MachineFrameInfo & MFI)1794 static bool frameTriviallyRequiresSP(const MachineFrameInfo &MFI) {
1795 return MFI.hasVarSizedObjects() || MFI.hasStackMap() || MFI.hasPatchPoint();
1796 }
1797
1798 // The FP for kernels is always known 0, so we never really need to setup an
1799 // explicit register for it. However, DisableFramePointerElim will force us to
1800 // use a register for it.
hasFP(const MachineFunction & MF) const1801 bool SIFrameLowering::hasFP(const MachineFunction &MF) const {
1802 const MachineFrameInfo &MFI = MF.getFrameInfo();
1803
1804 // For entry & chain functions we can use an immediate offset in most cases,
1805 // so the presence of calls doesn't imply we need a distinct frame pointer.
1806 if (MFI.hasCalls() &&
1807 !MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction() &&
1808 !MF.getInfo<SIMachineFunctionInfo>()->isChainFunction()) {
1809 // All offsets are unsigned, so need to be addressed in the same direction
1810 // as stack growth.
1811
1812 // FIXME: This function is pretty broken, since it can be called before the
1813 // frame layout is determined or CSR spills are inserted.
1814 return MFI.getStackSize() != 0;
1815 }
1816
1817 return frameTriviallyRequiresSP(MFI) || MFI.isFrameAddressTaken() ||
1818 MF.getSubtarget<GCNSubtarget>().getRegisterInfo()->hasStackRealignment(
1819 MF) ||
1820 MF.getTarget().Options.DisableFramePointerElim(MF);
1821 }
1822
1823 // This is essentially a reduced version of hasFP for entry functions. Since the
1824 // stack pointer is known 0 on entry to kernels, we never really need an FP
1825 // register. We may need to initialize the stack pointer depending on the frame
1826 // properties, which logically overlaps many of the cases where an ordinary
1827 // function would require an FP.
1828 // Also used for chain functions. While not technically entry functions, chain
1829 // functions may need to set up a stack pointer in some situations.
requiresStackPointerReference(const MachineFunction & MF) const1830 bool SIFrameLowering::requiresStackPointerReference(
1831 const MachineFunction &MF) const {
1832 // Callable functions always require a stack pointer reference.
1833 assert((MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction() ||
1834 MF.getInfo<SIMachineFunctionInfo>()->isChainFunction()) &&
1835 "only expected to call this for entry points and chain functions");
1836
1837 const MachineFrameInfo &MFI = MF.getFrameInfo();
1838
1839 // Entry points ordinarily don't need to initialize SP. We have to set it up
1840 // for callees if there are any. Also note tail calls are impossible/don't
1841 // make any sense for kernels.
1842 if (MFI.hasCalls())
1843 return true;
1844
1845 // We still need to initialize the SP if we're doing anything weird that
1846 // references the SP, like variable sized stack objects.
1847 return frameTriviallyRequiresSP(MFI);
1848 }
1849