//===----------------------- SIFrameLowering.cpp --------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//==-----------------------------------------------------------------------===//

#include "SIFrameLowering.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;

#define DEBUG_TYPE "frame-info"

static cl::opt<bool> EnableSpillVGPRToAGPR(
    "amdgpu-spill-vgpr-to-agpr",
    cl::desc("Enable spilling VGPRs to AGPRs"),
    cl::ReallyHidden,
    cl::init(true));

// Find a register matching \p RC from \p LiveRegs which is unused and
// available throughout the function. On failure, returns AMDGPU::NoRegister.
static MCRegister findUnusedRegister(MachineRegisterInfo &MRI,
                                     const LivePhysRegs &LiveRegs,
                                     const TargetRegisterClass &RC) {
  for (MCRegister Reg : RC) {
    if (!MRI.isPhysRegUsed(Reg) && LiveRegs.available(MRI, Reg))
      return Reg;
  }
  return MCRegister();
}

// Find a scratch register that we can use in the prologue. We avoid using
// callee-save registers since they may appear to be free when this is called
// from canUseAsPrologue (during shrink wrapping), but then no longer be free
// when this is called from emitPrologue.
static MCRegister findScratchNonCalleeSaveRegister(MachineRegisterInfo &MRI,
                                                   LivePhysRegs &LiveRegs,
                                                   const TargetRegisterClass &RC,
                                                   bool Unused = false) {
  // Mark callee saved registers as used so we will not choose them.
  const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
  for (unsigned i = 0; CSRegs[i]; ++i)
    LiveRegs.addReg(CSRegs[i]);

  // We are looking for a register that can be used throughout the entire
  // function, so any use is unacceptable.
  if (Unused)
    return findUnusedRegister(MRI, LiveRegs, RC);

  for (MCRegister Reg : RC) {
    if (LiveRegs.available(MRI, Reg))
      return Reg;
  }

  return MCRegister();
}

static void getVGPRSpillLaneOrTempRegister(
    MachineFunction &MF, LivePhysRegs &LiveRegs, Register SGPR,
    const TargetRegisterClass &RC = AMDGPU::SReg_32_XM0_XEXECRegClass) {
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  unsigned Size = TRI->getSpillSize(RC);
  Align Alignment = TRI->getSpillAlign(RC);

  // We need to save and restore the given SGPR.

  // 1: Try to save the given register into an unused scratch SGPR. The
  // LiveRegs should have all the callee saved registers marked as used.
  Register ScratchSGPR = findUnusedRegister(MF.getRegInfo(), LiveRegs, RC);

  if (!ScratchSGPR) {
    int FI = FrameInfo.CreateStackObject(Size, Alignment, true, nullptr,
                                         TargetStackID::SGPRSpill);

    if (TRI->spillSGPRToVGPR() &&
        MFI->allocateSGPRSpillToVGPRLane(MF, FI, /* IsPrologEpilog */ true)) {
      // 2: There's no free scratch SGPR to save the value, so spill it to a
      // VGPR lane; if no lane is free in the already-reserved VGPRs, another
      // VGPR is taken for the spill.
      MFI->addToPrologEpilogSGPRSpills(
          SGPR, PrologEpilogSGPRSaveRestoreInfo(
                    SGPRSaveKind::SPILL_TO_VGPR_LANE, FI));

      LLVM_DEBUG(
          auto Spill = MFI->getPrologEpilogSGPRSpillToVGPRLanes(FI).front();
          dbgs() << printReg(SGPR, TRI) << " requires fallback spill to "
                 << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane << '\n';);
    } else {
      // Remove dead <FI> index
      MF.getFrameInfo().RemoveStackObject(FI);
      // 3: If all else fails, spill the register to memory.
      FI = FrameInfo.CreateSpillStackObject(Size, Alignment);
      MFI->addToPrologEpilogSGPRSpills(
          SGPR,
          PrologEpilogSGPRSaveRestoreInfo(SGPRSaveKind::SPILL_TO_MEM, FI));
      LLVM_DEBUG(dbgs() << "Reserved FI " << FI << " for spilling "
                        << printReg(SGPR, TRI) << '\n');
    }
  } else {
    MFI->addToPrologEpilogSGPRSpills(
        SGPR, PrologEpilogSGPRSaveRestoreInfo(
                  SGPRSaveKind::COPY_TO_SCRATCH_SGPR, ScratchSGPR));
    LiveRegs.addReg(ScratchSGPR);
    LLVM_DEBUG(dbgs() << "Saving " << printReg(SGPR, TRI) << " with copy to "
                      << printReg(ScratchSGPR, TRI) << '\n');
  }
}
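
// For illustration only, the three strategies above roughly correspond to the
// following emitted sequences (register and lane numbers are hypothetical):
//
//   COPY_TO_SCRATCH_SGPR:  s_mov_b32 s40, s33
//   SPILL_TO_VGPR_LANE:    v_writelane_b32 v2, s33, 0
//   SPILL_TO_MEM:          v_mov_b32 v1, s33
//                          buffer_store_dword v1, ...   ; via a temp VGPR
//
// The actual instructions are emitted later by PrologEpilogSGPRSpillBuilder.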

// We need to emit the stack operations specially here because the prologue
// and epilogue may use a different frame register than the one
// getFrameRegister would return for the rest of the function.
static void buildPrologSpill(const GCNSubtarget &ST, const SIRegisterInfo &TRI,
                             const SIMachineFunctionInfo &FuncInfo,
                             LivePhysRegs &LiveRegs, MachineFunction &MF,
                             MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator I, const DebugLoc &DL,
                             Register SpillReg, int FI, Register FrameReg,
                             int64_t DwordOff = 0) {
  unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
                                        : AMDGPU::BUFFER_STORE_DWORD_OFFSET;

  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FI),
      FrameInfo.getObjectAlign(FI));
  LiveRegs.addReg(SpillReg);
  bool IsKill = !MBB.isLiveIn(SpillReg);
  TRI.buildSpillLoadStore(MBB, I, DL, Opc, FI, SpillReg, IsKill, FrameReg,
                          DwordOff, MMO, nullptr, &LiveRegs);
  if (IsKill)
    LiveRegs.removeReg(SpillReg);
}

static void buildEpilogRestore(const GCNSubtarget &ST,
                               const SIRegisterInfo &TRI,
                               const SIMachineFunctionInfo &FuncInfo,
                               LivePhysRegs &LiveRegs, MachineFunction &MF,
                               MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator I,
                               const DebugLoc &DL, Register SpillReg, int FI,
                               Register FrameReg, int64_t DwordOff = 0) {
  unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
                                        : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;

  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
  MachineMemOperand *MMO = MF.getMachineMemOperand(
      PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FI),
      FrameInfo.getObjectAlign(FI));
  TRI.buildSpillLoadStore(MBB, I, DL, Opc, FI, SpillReg, false, FrameReg,
                          DwordOff, MMO, nullptr, &LiveRegs);
}

static void buildGitPtr(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                        const DebugLoc &DL, const SIInstrInfo *TII,
                        Register TargetReg) {
  MachineFunction *MF = MBB.getParent();
  const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
  Register TargetLo = TRI->getSubReg(TargetReg, AMDGPU::sub0);
  Register TargetHi = TRI->getSubReg(TargetReg, AMDGPU::sub1);

  if (MFI->getGITPtrHigh() != 0xffffffff) {
    BuildMI(MBB, I, DL, SMovB32, TargetHi)
        .addImm(MFI->getGITPtrHigh())
        .addReg(TargetReg, RegState::ImplicitDefine);
  } else {
    const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64);
    BuildMI(MBB, I, DL, GetPC64, TargetReg);
  }
  Register GitPtrLo = MFI->getGITPtrLoReg(*MF);
  MF->getRegInfo().addLiveIn(GitPtrLo);
  MBB.addLiveIn(GitPtrLo);
  BuildMI(MBB, I, DL, SMovB32, TargetLo)
      .addReg(GitPtrLo);
}
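
// A rough sketch of what buildGitPtr emits (SGPR numbers are hypothetical).
// With the amdgpu-git-ptr-high attribute present:
//
//   s_mov_b32 s5, <git-ptr-high>   ; high half from the attribute
//   s_mov_b32 s4, s8               ; low half from getGITPtrLoReg()
//
// Without it, s_getpc_b64 s[4:5] provides the high half and only the low
// half is then overwritten.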

static void initLiveRegs(LivePhysRegs &LiveRegs, const SIRegisterInfo &TRI,
                         const SIMachineFunctionInfo *FuncInfo,
                         MachineFunction &MF, MachineBasicBlock &MBB,
                         MachineBasicBlock::iterator MBBI, bool IsProlog) {
  if (LiveRegs.empty()) {
    LiveRegs.init(TRI);
    if (IsProlog) {
      LiveRegs.addLiveIns(MBB);
    } else {
      // In epilog.
      LiveRegs.addLiveOuts(MBB);
      LiveRegs.stepBackward(*MBBI);
    }
  }
}

namespace llvm {

// SpillBuilder to save/restore special SGPR spills like the ones needed for
// FP, BP, etc. These spills are delayed until the current function's frame is
// finalized. For a given register, the builder uses the
// PrologEpilogSGPRSaveRestoreInfo to decide the spill method.
class PrologEpilogSGPRSpillBuilder {
  MachineBasicBlock::iterator MI;
  MachineBasicBlock &MBB;
  MachineFunction &MF;
  const GCNSubtarget &ST;
  MachineFrameInfo &MFI;
  SIMachineFunctionInfo *FuncInfo;
  const SIInstrInfo *TII;
  const SIRegisterInfo &TRI;
  Register SuperReg;
  const PrologEpilogSGPRSaveRestoreInfo SI;
  LivePhysRegs &LiveRegs;
  const DebugLoc &DL;
  Register FrameReg;
  ArrayRef<int16_t> SplitParts;
  unsigned NumSubRegs;
  unsigned EltSize = 4;

  void saveToMemory(const int FI) const {
    MachineRegisterInfo &MRI = MF.getRegInfo();
    assert(!MFI.isDeadObjectIndex(FI));

    initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MI, /*IsProlog*/ true);

    MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
        MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
    if (!TmpVGPR)
      report_fatal_error("failed to find free scratch register");

    for (unsigned I = 0, DwordOff = 0; I < NumSubRegs; ++I) {
      Register SubReg = NumSubRegs == 1
                            ? SuperReg
                            : Register(TRI.getSubReg(SuperReg, SplitParts[I]));
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
          .addReg(SubReg);

      buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MI, DL, TmpVGPR,
                       FI, FrameReg, DwordOff);
      DwordOff += 4;
    }
  }

  void saveToVGPRLane(const int FI) const {
    assert(!MFI.isDeadObjectIndex(FI));

    assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
    ArrayRef<SIRegisterInfo::SpilledReg> Spill =
        FuncInfo->getPrologEpilogSGPRSpillToVGPRLanes(FI);
    assert(Spill.size() == NumSubRegs);

    for (unsigned I = 0; I < NumSubRegs; ++I) {
      Register SubReg = NumSubRegs == 1
                            ? SuperReg
                            : Register(TRI.getSubReg(SuperReg, SplitParts[I]));
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_WRITELANE_B32), Spill[I].VGPR)
          .addReg(SubReg)
          .addImm(Spill[I].Lane)
          .addReg(Spill[I].VGPR, RegState::Undef);
    }
  }

  void copyToScratchSGPR(Register DstReg) const {
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), DstReg)
        .addReg(SuperReg)
        .setMIFlag(MachineInstr::FrameSetup);
  }

  void restoreFromMemory(const int FI) {
    MachineRegisterInfo &MRI = MF.getRegInfo();

    initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MI, /*IsProlog*/ false);
    MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
        MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
    if (!TmpVGPR)
      report_fatal_error("failed to find free scratch register");

    for (unsigned I = 0, DwordOff = 0; I < NumSubRegs; ++I) {
      Register SubReg = NumSubRegs == 1
                            ? SuperReg
                            : Register(TRI.getSubReg(SuperReg, SplitParts[I]));

      buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MI, DL,
                         TmpVGPR, FI, FrameReg, DwordOff);
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg)
          .addReg(TmpVGPR, RegState::Kill);
      DwordOff += 4;
    }
  }

  void restoreFromVGPRLane(const int FI) {
    assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
    ArrayRef<SIRegisterInfo::SpilledReg> Spill =
        FuncInfo->getPrologEpilogSGPRSpillToVGPRLanes(FI);
    assert(Spill.size() == NumSubRegs);

    for (unsigned I = 0; I < NumSubRegs; ++I) {
      Register SubReg = NumSubRegs == 1
                            ? SuperReg
                            : Register(TRI.getSubReg(SuperReg, SplitParts[I]));
      BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32), SubReg)
          .addReg(Spill[I].VGPR)
          .addImm(Spill[I].Lane);
    }
  }

  void copyFromScratchSGPR(Register SrcReg) const {
    BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), SuperReg)
        .addReg(SrcReg)
        .setMIFlag(MachineInstr::FrameDestroy);
  }

public:
  PrologEpilogSGPRSpillBuilder(Register Reg,
                               const PrologEpilogSGPRSaveRestoreInfo SI,
                               MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator MI,
                               const DebugLoc &DL, const SIInstrInfo *TII,
                               const SIRegisterInfo &TRI,
                               LivePhysRegs &LiveRegs, Register FrameReg)
      : MI(MI), MBB(MBB), MF(*MBB.getParent()),
        ST(MF.getSubtarget<GCNSubtarget>()), MFI(MF.getFrameInfo()),
        FuncInfo(MF.getInfo<SIMachineFunctionInfo>()), TII(TII), TRI(TRI),
        SuperReg(Reg), SI(SI), LiveRegs(LiveRegs), DL(DL), FrameReg(FrameReg) {
    const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(SuperReg);
    SplitParts = TRI.getRegSplitParts(RC, EltSize);
    NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();

    assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
  }

  void save() {
    switch (SI.getKind()) {
    case SGPRSaveKind::SPILL_TO_MEM:
      return saveToMemory(SI.getIndex());
    case SGPRSaveKind::SPILL_TO_VGPR_LANE:
      return saveToVGPRLane(SI.getIndex());
    case SGPRSaveKind::COPY_TO_SCRATCH_SGPR:
      return copyToScratchSGPR(SI.getReg());
    }
  }

  void restore() {
    switch (SI.getKind()) {
    case SGPRSaveKind::SPILL_TO_MEM:
      return restoreFromMemory(SI.getIndex());
    case SGPRSaveKind::SPILL_TO_VGPR_LANE:
      return restoreFromVGPRLane(SI.getIndex());
    case SGPRSaveKind::COPY_TO_SCRATCH_SGPR:
      return copyFromScratchSGPR(SI.getReg());
    }
  }
};

} // namespace llvm
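
// Illustrative use of the builder (a sketch, not code from this file): for
// each (Reg, Info) pair recorded by getVGPRSpillLaneOrTempRegister, the
// prologue roughly does
//
//   PrologEpilogSGPRSpillBuilder SB(Reg, Info, MBB, MBBI, DL, TII, TRI,
//                                   LiveRegs, FrameReg);
//   SB.save();   // and SB.restore() in the epilogue
//
// as emitCSRSpillStores and emitCSRSpillRestores below do.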

// Emit flat scratch setup code, assuming `MFI->hasFlatScratchInit()`
void SIFrameLowering::emitEntryFunctionFlatScratchInit(
    MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
    const DebugLoc &DL, Register ScratchWaveOffsetReg) const {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // We don't need this if we only have spills since there is no user facing
  // scratch.

  // TODO: If we know we don't have flat instructions earlier, we can omit
  // this from the input registers.
  //
  // TODO: We only need to know if we access scratch space through a flat
  // pointer. Because we only detect if flat instructions are used at all,
  // this will be used more often than necessary on VI.

  Register FlatScrInitLo;
  Register FlatScrInitHi;

  if (ST.isAmdPalOS()) {
    // Extract the scratch offset from the descriptor in the GIT
    LivePhysRegs LiveRegs;
    LiveRegs.init(*TRI);
    LiveRegs.addLiveIns(MBB);

    // Find unused reg to load flat scratch init into
    MachineRegisterInfo &MRI = MF.getRegInfo();
    Register FlatScrInit = AMDGPU::NoRegister;
    ArrayRef<MCPhysReg> AllSGPR64s = TRI->getAllSGPR64(MF);
    unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 1) / 2;
    AllSGPR64s = AllSGPR64s.slice(
        std::min(static_cast<unsigned>(AllSGPR64s.size()), NumPreloaded));
    Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
    for (MCPhysReg Reg : AllSGPR64s) {
      if (LiveRegs.available(MRI, Reg) && MRI.isAllocatable(Reg) &&
          !TRI->isSubRegisterEq(Reg, GITPtrLoReg)) {
        FlatScrInit = Reg;
        break;
      }
    }
    assert(FlatScrInit && "Failed to find free register for scratch init");

    FlatScrInitLo = TRI->getSubReg(FlatScrInit, AMDGPU::sub0);
    FlatScrInitHi = TRI->getSubReg(FlatScrInit, AMDGPU::sub1);

    buildGitPtr(MBB, I, DL, TII, FlatScrInit);

    // We now have the GIT ptr - now get the scratch descriptor from the entry
    // at offset 0 (or offset 16 for a compute shader).
    MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
    const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);
    auto *MMO = MF.getMachineMemOperand(
        PtrInfo,
        MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant |
            MachineMemOperand::MODereferenceable,
        8, Align(4));
    unsigned Offset =
        MF.getFunction().getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
    const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
    unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset);
    BuildMI(MBB, I, DL, LoadDwordX2, FlatScrInit)
        .addReg(FlatScrInit)
        .addImm(EncodedOffset) // offset
        .addImm(0)             // cpol
        .addMemOperand(MMO);

    // Mask the offset in [47:0] of the descriptor
    const MCInstrDesc &SAndB32 = TII->get(AMDGPU::S_AND_B32);
    auto And = BuildMI(MBB, I, DL, SAndB32, FlatScrInitHi)
                   .addReg(FlatScrInitHi)
                   .addImm(0xffff);
    And->getOperand(3).setIsDead(); // Mark SCC as dead.
  } else {
    Register FlatScratchInitReg =
        MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT);
    assert(FlatScratchInitReg);

    MachineRegisterInfo &MRI = MF.getRegInfo();
    MRI.addLiveIn(FlatScratchInitReg);
    MBB.addLiveIn(FlatScratchInitReg);

    FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
    FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
  }

  // Do a 64-bit pointer add.
  if (ST.flatScratchIsPointer()) {
    if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
          .addReg(FlatScrInitLo)
          .addReg(ScratchWaveOffsetReg);
      auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32),
                          FlatScrInitHi)
                      .addReg(FlatScrInitHi)
                      .addImm(0);
      Addc->getOperand(3).setIsDead(); // Mark SCC as dead.

      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32))
          .addReg(FlatScrInitLo)
          .addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_LO |
                          (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_)));
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32))
          .addReg(FlatScrInitHi)
          .addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_HI |
                          (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_)));
      return;
    }

    // For GFX9.
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), AMDGPU::FLAT_SCR_LO)
        .addReg(FlatScrInitLo)
        .addReg(ScratchWaveOffsetReg);
    auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32),
                        AMDGPU::FLAT_SCR_HI)
                    .addReg(FlatScrInitHi)
                    .addImm(0);
    Addc->getOperand(3).setIsDead(); // Mark SCC as dead.

    return;
  }

  assert(ST.getGeneration() < AMDGPUSubtarget::GFX9);

  // Copy the size in bytes.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::FLAT_SCR_LO)
      .addReg(FlatScrInitHi, RegState::Kill);

  // Add wave offset in bytes to private base offset.
  // See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), FlatScrInitLo)
      .addReg(FlatScrInitLo)
      .addReg(ScratchWaveOffsetReg);

  // Convert offset to 256-byte units.
  auto LShr = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_LSHR_B32),
                      AMDGPU::FLAT_SCR_HI)
                  .addReg(FlatScrInitLo, RegState::Kill)
                  .addImm(8);
  LShr->getOperand(3).setIsDead(); // Mark SCC as dead.
}
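
// Rough summary of the sequences emitted above (a sketch, not verbatim):
//   GFX10+:   the 64-bit base (init + wave offset) is written into the
//             FLAT_SCR_LO/FLAT_SCR_HI hardware registers via s_setreg_b32.
//   GFX9:     s_add_u32/s_addc_u32 form the base directly in flat_scratch.
//   pre-GFX9: FLAT_SCR_LO receives the size in bytes and FLAT_SCR_HI the
//             wave's scratch base in 256-byte units.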

// Note SGPRSpill stack IDs should only be used for SGPR spilling to VGPRs, not
// memory. They should have been removed by now.
static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) {
  for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
       I != E; ++I) {
    if (!MFI.isDeadObjectIndex(I))
      return false;
  }

  return true;
}

// Shift down registers reserved for the scratch RSRC.
Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg(
    MachineFunction &MF) const {

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  assert(MFI->isEntryFunction());

  Register ScratchRsrcReg = MFI->getScratchRSrcReg();

  if (!ScratchRsrcReg || (!MRI.isPhysRegUsed(ScratchRsrcReg) &&
                          allStackObjectsAreDead(MF.getFrameInfo())))
    return Register();

  if (ST.hasSGPRInitBug() ||
      ScratchRsrcReg != TRI->reservedPrivateSegmentBufferReg(MF))
    return ScratchRsrcReg;

  // We reserved the last registers for this. Shift it down to the end of those
  // which were actually used.
  //
  // FIXME: It might be safer to use a pseudoregister before replacement.

  // FIXME: We should be able to eliminate unused input registers. We only
  // cannot do this for the resources required for scratch access. For now we
  // skip over user SGPRs and may leave unused holes.

  unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 3) / 4;
  ArrayRef<MCPhysReg> AllSGPR128s = TRI->getAllSGPR128(MF);
  AllSGPR128s = AllSGPR128s.slice(
      std::min(static_cast<unsigned>(AllSGPR128s.size()), NumPreloaded));

  // Skip the last N reserved elements because they should have already been
  // reserved for VCC etc.
  Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
  for (MCPhysReg Reg : AllSGPR128s) {
    // Pick the first unallocated one. Make sure we don't clobber the other
    // reserved input we needed. Also for PAL, make sure we don't clobber
    // the GIT pointer passed in SGPR0 or SGPR8.
    if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) &&
        !TRI->isSubRegisterEq(Reg, GITPtrLoReg)) {
      MRI.replaceRegWith(ScratchRsrcReg, Reg);
      MFI->setScratchRSrcReg(Reg);
      return Reg;
    }
  }

  return ScratchRsrcReg;
}

static unsigned getScratchScaleFactor(const GCNSubtarget &ST) {
  return ST.enableFlatScratch() ? 1 : ST.getWavefrontSize();
}
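
// Frame offsets are per-lane, but without flat scratch the wave-level SGPR
// stack pointer counts bytes across the whole swizzled wave. For example,
// assuming wave64, a 16-byte per-lane frame advances SP by 16 * 64 = 1024
// bytes; with flat scratch the factor is 1 and SP advances by 16.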

void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
                                                MachineBasicBlock &MBB) const {
  assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");

  // FIXME: If we only have SGPR spills, we won't actually be using scratch
  // memory since these spill to VGPRs. We should be cleaning up these unused
  // SGPR spill frame indices somewhere.

  // FIXME: We still have implicit uses on SGPR spill instructions in case they
  // need to spill to vector memory. It's likely that will not happen, but at
  // this point it appears we need the setup. This part of the prolog should be
  // emitted after frame indices are eliminated.

  // FIXME: Remove all of the isPhysRegUsed checks

  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const Function &F = MF.getFunction();
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();

  assert(MFI->isEntryFunction());

  Register PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg(
      AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);

  // We need to do the replacement of the private segment buffer register even
  // if there are no stack objects. There could be stores to undef or a
  // constant without an associated object.
  //
  // This will return `Register()` in cases where there are no actual
  // uses of the SRSRC.
  Register ScratchRsrcReg;
  if (!ST.enableFlatScratch())
    ScratchRsrcReg = getEntryFunctionReservedScratchRsrcReg(MF);

  // Make the selected register live throughout the function.
  if (ScratchRsrcReg) {
    for (MachineBasicBlock &OtherBB : MF) {
      if (&OtherBB != &MBB) {
        OtherBB.addLiveIn(ScratchRsrcReg);
      }
    }
  }

  // Now that we have fixed the reserved SRSRC we need to locate the
  // (potentially) preloaded SRSRC.
  Register PreloadedScratchRsrcReg;
  if (ST.isAmdHsaOrMesa(F)) {
    PreloadedScratchRsrcReg =
        MFI->getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
    if (ScratchRsrcReg && PreloadedScratchRsrcReg) {
      // We added live-ins during argument lowering, but since they were not
      // used they were deleted. We're adding the uses now, so add them back.
      MRI.addLiveIn(PreloadedScratchRsrcReg);
      MBB.addLiveIn(PreloadedScratchRsrcReg);
    }
  }

  // Debug location must be unknown since the first debug location is used to
  // determine the end of the prologue.
  DebugLoc DL;
  MachineBasicBlock::iterator I = MBB.begin();

  // We found the SRSRC first because it needs four registers and has an
  // alignment requirement. If the SRSRC that we found clobbers the scratch
  // wave offset, which may be in a fixed SGPR or a free SGPR chosen by
  // SITargetLowering::allocateSystemSGPRs, COPY the scratch wave offset to a
  // free SGPR.
  Register ScratchWaveOffsetReg;
  if (PreloadedScratchWaveOffsetReg &&
      TRI->isSubRegisterEq(ScratchRsrcReg, PreloadedScratchWaveOffsetReg)) {
    ArrayRef<MCPhysReg> AllSGPRs = TRI->getAllSGPR32(MF);
    unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
    AllSGPRs = AllSGPRs.slice(
        std::min(static_cast<unsigned>(AllSGPRs.size()), NumPreloaded));
    Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
    for (MCPhysReg Reg : AllSGPRs) {
      if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) &&
          !TRI->isSubRegisterEq(ScratchRsrcReg, Reg) && GITPtrLoReg != Reg) {
        ScratchWaveOffsetReg = Reg;
        BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg)
            .addReg(PreloadedScratchWaveOffsetReg, RegState::Kill);
        break;
      }
    }
  } else {
    ScratchWaveOffsetReg = PreloadedScratchWaveOffsetReg;
  }
  assert(ScratchWaveOffsetReg || !PreloadedScratchWaveOffsetReg);

  if (requiresStackPointerReference(MF)) {
    Register SPReg = MFI->getStackPtrOffsetReg();
    assert(SPReg != AMDGPU::SP_REG);
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), SPReg)
        .addImm(FrameInfo.getStackSize() * getScratchScaleFactor(ST));
  }

  if (hasFP(MF)) {
    Register FPReg = MFI->getFrameOffsetReg();
    assert(FPReg != AMDGPU::FP_REG);
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0);
  }

  bool NeedsFlatScratchInit =
      MFI->hasFlatScratchInit() &&
      (MRI.isPhysRegUsed(AMDGPU::FLAT_SCR) || FrameInfo.hasCalls() ||
       (!allStackObjectsAreDead(FrameInfo) && ST.enableFlatScratch()));

  if ((NeedsFlatScratchInit || ScratchRsrcReg) &&
      PreloadedScratchWaveOffsetReg && !ST.flatScratchIsArchitected()) {
    MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
    MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
  }

  if (NeedsFlatScratchInit) {
    emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg);
  }

  if (ScratchRsrcReg) {
    emitEntryFunctionScratchRsrcRegSetup(MF, MBB, I, DL,
                                         PreloadedScratchRsrcReg,
                                         ScratchRsrcReg, ScratchWaveOffsetReg);
  }
}

// Emit scratch RSRC setup code, assuming `ScratchRsrcReg != AMDGPU::NoReg`
void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
    MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
    const DebugLoc &DL, Register PreloadedScratchRsrcReg,
    Register ScratchRsrcReg, Register ScratchWaveOffsetReg) const {

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const Function &Fn = MF.getFunction();

  if (ST.isAmdPalOS()) {
    // The pointer to the GIT is formed from the offset passed in and either
    // the amdgpu-git-ptr-high function attribute or the top part of the PC.
    Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
    Register Rsrc03 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);

    buildGitPtr(MBB, I, DL, TII, Rsrc01);

    // We now have the GIT ptr - now get the scratch descriptor from the entry
    // at offset 0 (or offset 16 for a compute shader).
    MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
    const MCInstrDesc &LoadDwordX4 = TII->get(AMDGPU::S_LOAD_DWORDX4_IMM);
    auto MMO = MF.getMachineMemOperand(PtrInfo,
                                       MachineMemOperand::MOLoad |
                                           MachineMemOperand::MOInvariant |
                                           MachineMemOperand::MODereferenceable,
                                       16, Align(4));
    unsigned Offset = Fn.getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
    const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
    unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset);
    BuildMI(MBB, I, DL, LoadDwordX4, ScratchRsrcReg)
        .addReg(Rsrc01)
        .addImm(EncodedOffset) // offset
        .addImm(0)             // cpol
        .addReg(ScratchRsrcReg, RegState::ImplicitDefine)
        .addMemOperand(MMO);

    // The driver will always set the SRD for wave 64 (bits 118:117 of the
    // descriptor / bits 22:21 of the third sub-reg will be 0b11).
    // If the shader is actually wave32 we have to modify the
    // const_index_stride field of the descriptor's 3rd sub-reg (bits 22:21) to
    // 0b10 (stride=32). The reason the driver does this is that there can be
    // cases where it presents 2 shaders with different wave size (e.g. VsFs).
    // TODO: convert to using SCRATCH instructions or multiple SRD buffers
    if (ST.isWave32()) {
      const MCInstrDesc &SBitsetB32 = TII->get(AMDGPU::S_BITSET0_B32);
      BuildMI(MBB, I, DL, SBitsetB32, Rsrc03)
          .addImm(21)
          .addReg(Rsrc03);
    }
  } else if (ST.isMesaGfxShader(Fn) || !PreloadedScratchRsrcReg) {
    assert(!ST.isAmdHsaOrMesa(Fn));
    const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);

    Register Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
    Register Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);

    // Use relocations to get the pointer, and setup the other bits manually.
    uint64_t Rsrc23 = TII->getScratchRsrcWords23();

    if (MFI->hasImplicitBufferPtr()) {
      Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);

      if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
        const MCInstrDesc &Mov64 = TII->get(AMDGPU::S_MOV_B64);

        BuildMI(MBB, I, DL, Mov64, Rsrc01)
            .addReg(MFI->getImplicitBufferPtrUserSGPR())
            .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
      } else {
        const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);

        MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
        auto MMO = MF.getMachineMemOperand(
            PtrInfo,
            MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant |
                MachineMemOperand::MODereferenceable,
            8, Align(4));
        BuildMI(MBB, I, DL, LoadDwordX2, Rsrc01)
            .addReg(MFI->getImplicitBufferPtrUserSGPR())
            .addImm(0) // offset
            .addImm(0) // cpol
            .addMemOperand(MMO)
            .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

        MF.getRegInfo().addLiveIn(MFI->getImplicitBufferPtrUserSGPR());
        MBB.addLiveIn(MFI->getImplicitBufferPtrUserSGPR());
      }
    } else {
      Register Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
      Register Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);

      BuildMI(MBB, I, DL, SMovB32, Rsrc0)
          .addExternalSymbol("SCRATCH_RSRC_DWORD0")
          .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

      BuildMI(MBB, I, DL, SMovB32, Rsrc1)
          .addExternalSymbol("SCRATCH_RSRC_DWORD1")
          .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
    }

    BuildMI(MBB, I, DL, SMovB32, Rsrc2)
        .addImm(Rsrc23 & 0xffffffff)
        .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

    BuildMI(MBB, I, DL, SMovB32, Rsrc3)
        .addImm(Rsrc23 >> 32)
        .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
  } else if (ST.isAmdHsaOrMesa(Fn)) {
    assert(PreloadedScratchRsrcReg);

    if (ScratchRsrcReg != PreloadedScratchRsrcReg) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
          .addReg(PreloadedScratchRsrcReg, RegState::Kill);
    }
  }

  // Add the scratch wave offset into the scratch RSRC.
  //
  // We only want to update the first 48 bits, which is the base address
  // pointer, without touching the adjacent 16 bits of flags. We know this add
  // cannot carry-out from bit 47, otherwise the scratch allocation would be
  // impossible to fit in the 48-bit global address space.
  //
  // TODO: Evaluate if it is better to just construct an SRD using the flat
  // scratch init and some constants rather than update the one we are passed.
  Register ScratchRsrcSub0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
  Register ScratchRsrcSub1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);

  // We cannot Kill ScratchWaveOffsetReg here because we allow it to be used in
  // the kernel body via inreg arguments.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), ScratchRsrcSub0)
      .addReg(ScratchRsrcSub0)
      .addReg(ScratchWaveOffsetReg)
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
  auto Addc = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), ScratchRsrcSub1)
                  .addReg(ScratchRsrcSub1)
                  .addImm(0)
                  .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
  Addc->getOperand(3).setIsDead(); // Mark SCC as dead.
}

bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const {
  switch (ID) {
  case TargetStackID::Default:
  case TargetStackID::NoAlloc:
  case TargetStackID::SGPRSpill:
    return true;
  case TargetStackID::ScalableVector:
  case TargetStackID::WasmLocal:
    return false;
  }
  llvm_unreachable("Invalid TargetStackID::Value");
}

// Activate only the inactive lanes when \p EnableInactiveLanes is true.
// Otherwise, activate all lanes. Returns the saved EXEC mask.
static Register buildScratchExecCopy(LivePhysRegs &LiveRegs,
                                     MachineFunction &MF,
                                     MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator MBBI,
                                     const DebugLoc &DL, bool IsProlog,
                                     bool EnableInactiveLanes) {
  Register ScratchExecCopy;
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();

  initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, IsProlog);

  ScratchExecCopy = findScratchNonCalleeSaveRegister(
      MRI, LiveRegs, *TRI.getWaveMaskRegClass());
  if (!ScratchExecCopy)
    report_fatal_error("failed to find free scratch register");

  LiveRegs.addReg(ScratchExecCopy);

  const unsigned SaveExecOpc =
      ST.isWave32() ? (EnableInactiveLanes ? AMDGPU::S_XOR_SAVEEXEC_B32
                                           : AMDGPU::S_OR_SAVEEXEC_B32)
                    : (EnableInactiveLanes ? AMDGPU::S_XOR_SAVEEXEC_B64
                                           : AMDGPU::S_OR_SAVEEXEC_B64);
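  // With an all-ones operand, s_xor_saveexec inverts EXEC (activating exactly
  // the lanes that were inactive), while s_or_saveexec sets EXEC to all ones
  // (activating every lane); both write the old EXEC into ScratchExecCopy.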
  auto SaveExec =
      BuildMI(MBB, MBBI, DL, TII->get(SaveExecOpc), ScratchExecCopy).addImm(-1);
  SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead.

  return ScratchExecCopy;
}

void SIFrameLowering::emitCSRSpillStores(
    MachineFunction &MF, MachineBasicBlock &MBB,
    MachineBasicBlock::iterator MBBI, DebugLoc &DL, LivePhysRegs &LiveRegs,
    Register FrameReg, Register FramePtrRegScratchCopy) const {
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  // Spill Whole-Wave Mode VGPRs. Save only the inactive lanes of the scratch
  // registers. However, save all lanes of callee-saved VGPRs. Due to this, we
  // might end up flipping the EXEC bits twice.
  Register ScratchExecCopy;
  SmallVector<std::pair<Register, int>, 2> WWMCalleeSavedRegs, WWMScratchRegs;
  FuncInfo->splitWWMSpillRegisters(MF, WWMCalleeSavedRegs, WWMScratchRegs);
  if (!WWMScratchRegs.empty())
    ScratchExecCopy =
        buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, DL,
                             /*IsProlog*/ true, /*EnableInactiveLanes*/ true);

  auto StoreWWMRegisters =
      [&](SmallVectorImpl<std::pair<Register, int>> &WWMRegs) {
        for (const auto &Reg : WWMRegs) {
          Register VGPR = Reg.first;
          int FI = Reg.second;
          buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, DL,
                           VGPR, FI, FrameReg);
        }
      };

  StoreWWMRegisters(WWMScratchRegs);
  if (!WWMCalleeSavedRegs.empty()) {
    if (ScratchExecCopy) {
      unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
      MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
      BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Exec).addImm(-1);
    } else {
      ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, DL,
                                             /*IsProlog*/ true,
                                             /*EnableInactiveLanes*/ false);
    }
  }

  StoreWWMRegisters(WWMCalleeSavedRegs);
  if (ScratchExecCopy) {
    // FIXME: Split block and make terminator.
    unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
    MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
    BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec)
        .addReg(ScratchExecCopy, RegState::Kill);
    LiveRegs.addReg(ScratchExecCopy);
  }

  Register FramePtrReg = FuncInfo->getFrameOffsetReg();

  for (const auto &Spill : FuncInfo->getPrologEpilogSGPRSpills()) {
    // Special handling for the FP spill: skip it if FP is saved to a scratch
    // SGPR, since that save has already been emitted. Otherwise, FP has been
    // moved to a temporary register, so spill that instead.
    Register Reg =
        Spill.first == FramePtrReg ? FramePtrRegScratchCopy : Spill.first;
    if (!Reg)
      continue;

    PrologEpilogSGPRSpillBuilder SB(Reg, Spill.second, MBB, MBBI, DL, TII, TRI,
                                    LiveRegs, FrameReg);
    SB.save();
  }

  // If a copy to scratch SGPR has been chosen for any of the SGPR spills, make
  // such scratch registers live throughout the function.
  SmallVector<Register, 1> ScratchSGPRs;
  FuncInfo->getAllScratchSGPRCopyDstRegs(ScratchSGPRs);
  if (!ScratchSGPRs.empty()) {
    for (MachineBasicBlock &MBB : MF) {
      for (MCPhysReg Reg : ScratchSGPRs)
        MBB.addLiveIn(Reg);

      MBB.sortUniqueLiveIns();
    }
    if (!LiveRegs.empty()) {
      for (MCPhysReg Reg : ScratchSGPRs)
        LiveRegs.addReg(Reg);
    }
  }
}

void SIFrameLowering::emitCSRSpillRestores(
    MachineFunction &MF, MachineBasicBlock &MBB,
    MachineBasicBlock::iterator MBBI, DebugLoc &DL, LivePhysRegs &LiveRegs,
    Register FrameReg, Register FramePtrRegScratchCopy) const {
  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  Register FramePtrReg = FuncInfo->getFrameOffsetReg();

  for (const auto &Spill : FuncInfo->getPrologEpilogSGPRSpills()) {
    // Special handling for the FP restore: skip it if FP is restored from a
    // scratch SGPR. Otherwise, restore the FP value into a temporary register.
    // The frame pointer should be overwritten only at the end, when all other
    // spills have been restored from the current frame.
    Register Reg =
        Spill.first == FramePtrReg ? FramePtrRegScratchCopy : Spill.first;
    if (!Reg)
      continue;

    PrologEpilogSGPRSpillBuilder SB(Reg, Spill.second, MBB, MBBI, DL, TII, TRI,
                                    LiveRegs, FrameReg);
    SB.restore();
  }

  // Restore Whole-Wave Mode VGPRs. Restore only the inactive lanes of the
  // scratch registers. However, restore all lanes of callee-saved VGPRs. Due
  // to this, we might end up flipping the EXEC bits twice.
  Register ScratchExecCopy;
  SmallVector<std::pair<Register, int>, 2> WWMCalleeSavedRegs, WWMScratchRegs;
  FuncInfo->splitWWMSpillRegisters(MF, WWMCalleeSavedRegs, WWMScratchRegs);
  if (!WWMScratchRegs.empty())
    ScratchExecCopy =
        buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, DL,
                             /*IsProlog*/ false, /*EnableInactiveLanes*/ true);

  auto RestoreWWMRegisters =
      [&](SmallVectorImpl<std::pair<Register, int>> &WWMRegs) {
        for (const auto &Reg : WWMRegs) {
          Register VGPR = Reg.first;
          int FI = Reg.second;
          buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, DL,
                             VGPR, FI, FrameReg);
        }
      };

  RestoreWWMRegisters(WWMScratchRegs);
  if (!WWMCalleeSavedRegs.empty()) {
    if (ScratchExecCopy) {
      unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
      MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
      BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Exec).addImm(-1);
    } else {
      ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, DL,
                                             /*IsProlog*/ false,
                                             /*EnableInactiveLanes*/ false);
    }
  }

  RestoreWWMRegisters(WWMCalleeSavedRegs);
  if (ScratchExecCopy) {
    // FIXME: Split block and make terminator.
    unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
    MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
    BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec)
        .addReg(ScratchExecCopy, RegState::Kill);
  }
}

void SIFrameLowering::emitPrologue(MachineFunction &MF,
                                   MachineBasicBlock &MBB) const {
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  if (FuncInfo->isEntryFunction()) {
    emitEntryFunctionPrologue(MF, MBB);
    return;
  }

  MachineFrameInfo &MFI = MF.getFrameInfo();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();

  Register StackPtrReg = FuncInfo->getStackPtrOffsetReg();
  Register FramePtrReg = FuncInfo->getFrameOffsetReg();
  Register BasePtrReg =
      TRI.hasBasePointer(MF) ? TRI.getBaseRegister() : Register();
  LivePhysRegs LiveRegs;

  MachineBasicBlock::iterator MBBI = MBB.begin();
  // DebugLoc must be unknown since the first instruction with DebugLoc is used
  // to determine the end of the prologue.
  DebugLoc DL;

  bool HasFP = false;
  bool HasBP = false;
  uint32_t NumBytes = MFI.getStackSize();
  uint32_t RoundedSize = NumBytes;

  if (TRI.hasStackRealignment(MF))
    HasFP = true;

  Register FramePtrRegScratchCopy;
  if (!HasFP && !hasFP(MF)) {
    // Emit the CSR spill stores with SP base register.
    emitCSRSpillStores(MF, MBB, MBBI, DL, LiveRegs, StackPtrReg,
                       FramePtrRegScratchCopy);
  } else {
    // CSR spill stores will use FP as base register.
    Register SGPRForFPSaveRestoreCopy =
        FuncInfo->getScratchSGPRCopyDstReg(FramePtrReg);

    initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ true);
    if (SGPRForFPSaveRestoreCopy) {
      // Copy FP to the scratch register now and emit the CFI entry. It avoids
      // the extra FP copy needed in the other two cases when FP is spilled to
      // memory or to a VGPR lane.
      PrologEpilogSGPRSpillBuilder SB(
          FramePtrReg,
          FuncInfo->getPrologEpilogSGPRSaveRestoreInfo(FramePtrReg), MBB, MBBI,
          DL, TII, TRI, LiveRegs, FramePtrReg);
      SB.save();
      LiveRegs.addReg(SGPRForFPSaveRestoreCopy);
    } else {
      // Copy FP into a new scratch register so that its previous value can be
      // spilled after setting up the new frame.
      FramePtrRegScratchCopy = findScratchNonCalleeSaveRegister(
          MRI, LiveRegs, AMDGPU::SReg_32_XM0_XEXECRegClass);
      if (!FramePtrRegScratchCopy)
        report_fatal_error("failed to find free scratch register");

      LiveRegs.addReg(FramePtrRegScratchCopy);
      BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrRegScratchCopy)
          .addReg(FramePtrReg);
    }
  }

  if (HasFP) {
    const unsigned Alignment = MFI.getMaxAlign().value();

    RoundedSize += Alignment;
    if (LiveRegs.empty()) {
      LiveRegs.init(TRI);
      LiveRegs.addLiveIns(MBB);
    }

    // s_add_i32 s33, s32, NumBytes
    // s_and_b32 s33, s33, 0b111...0000
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), FramePtrReg)
        .addReg(StackPtrReg)
        .addImm((Alignment - 1) * getScratchScaleFactor(ST))
        .setMIFlag(MachineInstr::FrameSetup);
    auto And = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_AND_B32), FramePtrReg)
                   .addReg(FramePtrReg, RegState::Kill)
                   .addImm(-Alignment * getScratchScaleFactor(ST))
                   .setMIFlag(MachineInstr::FrameSetup);
    And->getOperand(3).setIsDead(); // Mark SCC as dead.
    FuncInfo->setIsStackRealigned(true);
  } else if ((HasFP = hasFP(MF))) {
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
        .addReg(StackPtrReg)
        .setMIFlag(MachineInstr::FrameSetup);
  }

  // If FP is used, emit the CSR spills with FP base register.
  if (HasFP) {
    emitCSRSpillStores(MF, MBB, MBBI, DL, LiveRegs, FramePtrReg,
                       FramePtrRegScratchCopy);
    if (FramePtrRegScratchCopy)
      LiveRegs.removeReg(FramePtrRegScratchCopy);
  }

  // If we need a base pointer, set it up here. It's whatever the value of
  // the stack pointer is at this point. Any variable size objects will be
  // allocated after this, so we can still use the base pointer to reference
  // the incoming arguments.
  if ((HasBP = TRI.hasBasePointer(MF))) {
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), BasePtrReg)
        .addReg(StackPtrReg)
        .setMIFlag(MachineInstr::FrameSetup);
  }

  if (HasFP && RoundedSize != 0) {
    auto Add = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), StackPtrReg)
                   .addReg(StackPtrReg)
                   .addImm(RoundedSize * getScratchScaleFactor(ST))
                   .setMIFlag(MachineInstr::FrameSetup);
    Add->getOperand(3).setIsDead(); // Mark SCC as dead.
  }

  bool FPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(FramePtrReg);
  (void)FPSaved;
  assert((!HasFP || FPSaved) &&
         "Needed to save FP but didn't save it anywhere");

  // If we allow spilling to AGPRs we may have saved FP but then spill
  // everything into AGPRs instead of the stack.
  assert((HasFP || !FPSaved || EnableSpillVGPRToAGPR) &&
         "Saved FP but didn't need it");

  bool BPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(BasePtrReg);
  (void)BPSaved;
  assert((!HasBP || BPSaved) &&
         "Needed to save BP but didn't save it anywhere");

  assert((HasBP || !BPSaved) && "Saved BP but didn't need it");
}
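
// For a non-entry function that needs stack realignment, the prologue built
// above looks roughly like this (a sketch assuming wave64, a 1024-byte max
// alignment, and the usual s32/s33 SP/FP convention):
//
//   s_add_i32 s33, s32, 0xffc0       ; (1024 - 1) * 64
//   s_and_b32 s33, s33, 0xffff0000   ; round down to the alignment
//   ... CSR spill stores, with FP as the base register ...
//   s_add_i32 s32, s32, <rounded size> * 64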

void SIFrameLowering::emitEpilogue(MachineFunction &MF,
                                   MachineBasicBlock &MBB) const {
  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  if (FuncInfo->isEntryFunction())
    return;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  LivePhysRegs LiveRegs;
  // Get the insert location for the epilogue. If there were no terminators in
  // the block, get the last instruction.
  MachineBasicBlock::iterator MBBI = MBB.end();
  DebugLoc DL;
  if (!MBB.empty()) {
    MBBI = MBB.getLastNonDebugInstr();
    if (MBBI != MBB.end())
      DL = MBBI->getDebugLoc();

    MBBI = MBB.getFirstTerminator();
  }

  const MachineFrameInfo &MFI = MF.getFrameInfo();
  uint32_t NumBytes = MFI.getStackSize();
  uint32_t RoundedSize = FuncInfo->isStackRealigned()
                             ? NumBytes + MFI.getMaxAlign().value()
                             : NumBytes;
  const Register StackPtrReg = FuncInfo->getStackPtrOffsetReg();
  Register FramePtrReg = FuncInfo->getFrameOffsetReg();
  bool FPSaved = FuncInfo->hasPrologEpilogSGPRSpillEntry(FramePtrReg);

  Register FramePtrRegScratchCopy;
  Register SGPRForFPSaveRestoreCopy =
      FuncInfo->getScratchSGPRCopyDstReg(FramePtrReg);
  if (FPSaved) {
    // CSR spill restores should use FP as base register. If
    // SGPRForFPSaveRestoreCopy is invalid, restore the previous value of FP
    // into a new scratch register and copy it to FP later, when the other
    // registers have been restored from the current stack frame.
    initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ false);
    if (SGPRForFPSaveRestoreCopy) {
      LiveRegs.addReg(SGPRForFPSaveRestoreCopy);
    } else {
      FramePtrRegScratchCopy = findScratchNonCalleeSaveRegister(
          MRI, LiveRegs, AMDGPU::SReg_32_XM0_XEXECRegClass);
      if (!FramePtrRegScratchCopy)
        report_fatal_error("failed to find free scratch register");

      LiveRegs.addReg(FramePtrRegScratchCopy);
    }

    emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveRegs, FramePtrReg,
                         FramePtrRegScratchCopy);
  }

  if (RoundedSize != 0 && hasFP(MF)) {
    auto Add = BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_I32), StackPtrReg)
                   .addReg(StackPtrReg)
                   .addImm(-static_cast<int64_t>(RoundedSize *
                                                 getScratchScaleFactor(ST)))
                   .setMIFlag(MachineInstr::FrameDestroy);
    Add->getOperand(3).setIsDead(); // Mark SCC as dead.
  }

  if (FPSaved) {
    // Insert the copy to restore FP.
    Register SrcReg = SGPRForFPSaveRestoreCopy ? SGPRForFPSaveRestoreCopy
                                               : FramePtrRegScratchCopy;
    MachineInstrBuilder MIB =
        BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
            .addReg(SrcReg);
    if (SGPRForFPSaveRestoreCopy)
      MIB.setMIFlag(MachineInstr::FrameDestroy);
  } else {
    // Insert the CSR spill restores with SP as the base register.
    emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveRegs, StackPtrReg,
                         FramePtrRegScratchCopy);
  }
}

#ifndef NDEBUG
static bool allSGPRSpillsAreDead(const MachineFunction &MF) {
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
       I != E; ++I) {
    if (!MFI.isDeadObjectIndex(I) &&
        MFI.getStackID(I) == TargetStackID::SGPRSpill &&
        !FuncInfo->checkIndexInPrologEpilogSGPRSpills(I)) {
      return false;
    }
  }

  return true;
}
#endif

StackOffset SIFrameLowering::getFrameIndexReference(const MachineFunction &MF,
                                                    int FI,
                                                    Register &FrameReg) const {
  const SIRegisterInfo *RI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();

  FrameReg = RI->getFrameRegister(MF);
  return StackOffset::getFixed(MF.getFrameInfo().getObjectOffset(FI));
}

void SIFrameLowering::processFunctionBeforeFrameFinalized(
    MachineFunction &MF,
    RegScavenger *RS) const {
  MachineFrameInfo &MFI = MF.getFrameInfo();

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();

  // Allocate spill slots for WWM reserved VGPRs.
  if (!FuncInfo->isEntryFunction()) {
    for (Register Reg : FuncInfo->getWWMReservedRegs()) {
      const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg);
      FuncInfo->allocateWWMSpill(MF, Reg, TRI->getSpillSize(*RC),
                                 TRI->getSpillAlign(*RC));
    }
  }

  const bool SpillVGPRToAGPR = ST.hasMAIInsts() && FuncInfo->hasSpilledVGPRs()
                               && EnableSpillVGPRToAGPR;

  if (SpillVGPRToAGPR) {
    // To track the spill frame indices handled in this pass.
    BitVector SpillFIs(MFI.getObjectIndexEnd(), false);
    BitVector NonVGPRSpillFIs(MFI.getObjectIndexEnd(), false);

    bool SeenDbgInstr = false;

    for (MachineBasicBlock &MBB : MF) {
      for (MachineInstr &MI : llvm::make_early_inc_range(MBB)) {
        int FrameIndex;
        if (MI.isDebugInstr())
          SeenDbgInstr = true;

        if (TII->isVGPRSpill(MI)) {
          // Try to eliminate stack used by VGPR spills before frame
          // finalization.
          unsigned FIOp = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                                     AMDGPU::OpName::vaddr);
          int FI = MI.getOperand(FIOp).getIndex();
          Register VReg =
              TII->getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
          if (FuncInfo->allocateVGPRSpillToAGPR(MF, FI,
                                                TRI->isAGPR(MRI, VReg))) {
            // FIXME: change to enterBasicBlockEnd()
            RS->enterBasicBlock(MBB);
            TRI->eliminateFrameIndex(MI, 0, FIOp, RS);
            SpillFIs.set(FI);
            continue;
          }
        } else if (TII->isStoreToStackSlot(MI, FrameIndex) ||
                   TII->isLoadFromStackSlot(MI, FrameIndex))
          if (!MFI.isFixedObjectIndex(FrameIndex))
            NonVGPRSpillFIs.set(FrameIndex);
      }
    }

    // Stack slot coloring may assign different objects to the same stack slot.
    // If not, then the VGPR to AGPR spill slot is dead.
    for (unsigned FI : SpillFIs.set_bits())
      if (!NonVGPRSpillFIs.test(FI))
        FuncInfo->setVGPRToAGPRSpillDead(FI);

    for (MachineBasicBlock &MBB : MF) {
      for (MCPhysReg Reg : FuncInfo->getVGPRSpillAGPRs())
        MBB.addLiveIn(Reg);

      for (MCPhysReg Reg : FuncInfo->getAGPRSpillVGPRs())
        MBB.addLiveIn(Reg);

      MBB.sortUniqueLiveIns();

      if (!SpillFIs.empty() && SeenDbgInstr) {
        // FIXME: The dead frame indices are replaced with a null register in
        // the debug value instructions. We should instead update them with
        // the correct register value. But it is unclear whether the register
        // value alone is sufficient to recover the original location.
        for (MachineInstr &MI : MBB) {
          if (MI.isDebugValue() && MI.getOperand(0).isFI() &&
              !MFI.isFixedObjectIndex(MI.getOperand(0).getIndex()) &&
              SpillFIs[MI.getOperand(0).getIndex()]) {
            MI.getOperand(0).ChangeToRegister(Register(), false /*isDef*/);
          }
        }
      }
    }
  }

  // At this point we've already allocated all spilled SGPRs to VGPRs if we
  // can. Any remaining SGPR spills will go to memory, so move them back to the
  // default stack.
  bool HaveSGPRToVMemSpill =
      FuncInfo->removeDeadFrameIndices(MFI, /*ResetSGPRSpillStackIDs*/ true);
  assert(allSGPRSpillsAreDead(MF) &&
         "SGPR spill should have been removed in SILowerSGPRSpills");

  // FIXME: The other checks should be redundant with allStackObjectsAreDead,
  // but currently hasNonSpillStackObjects is set only from source
  // allocas. Stack temps produced from legalization are not counted currently.
  if (!allStackObjectsAreDead(MFI)) {
    assert(RS && "RegScavenger required if spilling");

    // Add an emergency spill slot.
    RS->addScavengingFrameIndex(FuncInfo->getScavengeFI(MFI, *TRI));

    // If we are spilling SGPRs to memory with a large frame, we may need a
    // second VGPR emergency frame index.
    if (HaveSGPRToVMemSpill &&
        allocateScavengingFrameIndexesNearIncomingSP(MF)) {
      RS->addScavengingFrameIndex(MFI.CreateStackObject(4, Align(4), false));
    }
  }
}

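// Runs after frame indices are assigned but before they are replaced in the
// instruction stream. Used here to retry picking a lower VGPR for gfx908
// AGPR copies now that register allocation is complete.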
void SIFrameLowering::processFunctionBeforeFrameIndicesReplaced(
    MachineFunction &MF, RegScavenger *RS) const {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();

  if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) {
    // On gfx908, we initially reserved the highest available VGPR for the
    // AGPR copy. Now that register allocation is done, check whether an
    // unused VGPR lower than the one reserved before RA exists. If so, use
    // it for the AGPR copy instead.
    Register VGPRForAGPRCopy = FuncInfo->getVGPRForAGPRCopy();
    Register UnusedLowVGPR =
        TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF);
    if (UnusedLowVGPR && (TRI->getHWRegIndex(UnusedLowVGPR) <
                          TRI->getHWRegIndex(VGPRForAGPRCopy))) {
      // Call to setVGPRForAGPRCopy() should happen first before calling
      // freezeReservedRegs() so that getReservedRegs() can reserve this newly
      // identified VGPR (for AGPR copy).
      FuncInfo->setVGPRForAGPRCopy(UnusedLowVGPR);
      MRI.freezeReservedRegs(MF);
    }
  }
}

// The special SGPR spills, such as those needed for the FP, BP, or any other
// reserved registers, are delayed until frame lowering.
void SIFrameLowering::determinePrologEpilogSGPRSaves(
    MachineFunction &MF, BitVector &SavedVGPRs) const {
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  LivePhysRegs LiveRegs;
  LiveRegs.init(*TRI);
  // Initially mark callee saved registers as used so we will not choose them
  // while looking for scratch SGPRs.
  const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs();
  for (unsigned I = 0; CSRegs[I]; ++I)
    LiveRegs.addReg(CSRegs[I]);

  // hasFP only knows about stack objects that already exist. We're now
  // determining the stack slots that will be created, so we have to predict
  // them. Stack objects force FP usage with calls.
  //
  // Note a new VGPR CSR may be introduced if one is used for the spill, but we
  // don't want to report it here.
  //
  // FIXME: Is this really hasReservedCallFrame?
  const bool WillHaveFP =
      FrameInfo.hasCalls() &&
      (SavedVGPRs.any() || !allStackObjectsAreDead(FrameInfo));

  if (WillHaveFP || hasFP(MF)) {
    Register FramePtrReg = MFI->getFrameOffsetReg();
    assert(!MFI->hasPrologEpilogSGPRSpillEntry(FramePtrReg) &&
           "Re-reserving spill slot for FP");
    getVGPRSpillLaneOrTempRegister(MF, LiveRegs, FramePtrReg);
  }

  if (TRI->hasBasePointer(MF)) {
    Register BasePtrReg = TRI->getBaseRegister();
    assert(!MFI->hasPrologEpilogSGPRSpillEntry(BasePtrReg) &&
           "Re-reserving spill slot for BP");
    getVGPRSpillLaneOrTempRegister(MF, LiveRegs, BasePtrReg);
  }
}

// Only report VGPRs to generic code.
void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
                                           BitVector &SavedVGPRs,
                                           RegScavenger *RS) const {
  TargetFrameLowering::determineCalleeSaves(MF, SavedVGPRs, RS);
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  if (MFI->isEntryFunction())
    return;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  for (MachineBasicBlock &MBB : MF) {
    for (MachineInstr &MI : MBB) {
      // WRITELANE instructions used for SGPR spills can overwrite the inactive
      // lanes of VGPRs, so the callee must spill and restore those VGPRs even
      // if they are marked caller-saved.

      // TODO: Handle this elsewhere at an early point. Walking through all
      // MBBs here would be a bad heuristic. A better way would be to call
      // allocateWWMSpill during the regalloc pipeline whenever a physical
      // register is allocated for the intended virtual registers. That would
      // also help exclude the general use of WRITELANE/READLANE intrinsics
      // that won't really need any such special handling.
      if (MI.getOpcode() == AMDGPU::V_WRITELANE_B32)
        MFI->allocateWWMSpill(MF, MI.getOperand(0).getReg());
      else if (MI.getOpcode() == AMDGPU::V_READLANE_B32)
        MFI->allocateWWMSpill(MF, MI.getOperand(1).getReg());
    }
  }

  // Ignore the SGPRs the default implementation found.
  SavedVGPRs.clearBitsNotInMask(TRI->getAllVectorRegMask());

  // Do not save AGPRs prior to GFX90A because there was no easy way to do so.
  // On gfx908 there are no AGPR loads and stores, so spilling an AGPR also
  // requires a temporary VGPR.
  if (!ST.hasGFX90AInsts())
    SavedVGPRs.clearBitsInMask(TRI->getAllAGPRRegMask());

  determinePrologEpilogSGPRSaves(MF, SavedVGPRs);

  // The Whole-Wave VGPRs need to be specially inserted in the prolog, so don't
  // allow the default insertion to handle them.
  for (auto &Reg : MFI->getWWMSpills())
    SavedVGPRs.reset(Reg.first);

  // Mark all lane VGPRs as BB LiveIns.
  for (MachineBasicBlock &MBB : MF) {
    for (auto &Reg : MFI->getWWMSpills())
      MBB.addLiveIn(Reg.first);

    MBB.sortUniqueLiveIns();
  }
}

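// SGPR counterpart of determineCalleeSaves: computes the scalar callee saves
// while masking out all vector registers, and handles the registers that are
// managed specially (SP, FP, and the return address pair).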
void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF,
                                               BitVector &SavedRegs,
                                               RegScavenger *RS) const {
  TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  if (MFI->isEntryFunction())
    return;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  // The SP is specifically managed and we don't want extra spills of it.
  SavedRegs.reset(MFI->getStackPtrOffsetReg());

  const BitVector AllSavedRegs = SavedRegs;
  SavedRegs.clearBitsInMask(TRI->getAllVectorRegMask());

  // We have to anticipate introducing CSR VGPR spills, or a spill of the
  // caller-saved VGPR reserved for SGPR spills, since we now always create a
  // stack entry for it even when there are no other stack objects: an FP is
  // required whenever there is both a call and a stack. A VGPR is allocated
  // for SGPR spills whenever there are any SGPR spills, whether they are CSR
  // spills or otherwise.
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  const bool WillHaveFP =
      FrameInfo.hasCalls() && (AllSavedRegs.any() || MFI->hasSpilledSGPRs());

  // FP will be specially managed like SP.
  if (WillHaveFP || hasFP(MF))
    SavedRegs.reset(MFI->getFrameOffsetReg());

  // Return address use with the return instruction is hidden through the
  // SI_RETURN pseudo. Given that, and since IPRA computes actual register
  // usage rather than using the CSR list, the clobbering of the return
  // address by function calls (D117243) or otherwise (D120922) is not seen by
  // IPRA's register usage collection. Explicitly saving it here ensures the
  // return address is saved and restored in those scenarios.
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  Register RetAddrReg = TRI->getReturnAddressReg(MF);
  if (!MFI->isEntryFunction() &&
      (FrameInfo.hasCalls() || MRI.isPhysRegModified(RetAddrReg))) {
    SavedRegs.set(TRI->getSubReg(RetAddrReg, AMDGPU::sub0));
    SavedRegs.set(TRI->getSubReg(RetAddrReg, AMDGPU::sub1));
  }
}

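// Override the destination register for the FP/BP callee-save entries when
// they are saved via a copy to a scratch SGPR rather than a memory spill.
// Returning false lets the generic PrologEpilogInserter code still assign
// stack slots for the remaining callee-saved registers.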
bool SIFrameLowering::assignCalleeSavedSpillSlots(
    MachineFunction &MF, const TargetRegisterInfo *TRI,
    std::vector<CalleeSavedInfo> &CSI) const {
  if (CSI.empty())
    return true; // Early exit if no callee saved registers are modified!

  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *RI = ST.getRegisterInfo();
  Register FramePtrReg = FuncInfo->getFrameOffsetReg();
  Register BasePtrReg = RI->getBaseRegister();
  Register SGPRForFPSaveRestoreCopy =
      FuncInfo->getScratchSGPRCopyDstReg(FramePtrReg);
  Register SGPRForBPSaveRestoreCopy =
      FuncInfo->getScratchSGPRCopyDstReg(BasePtrReg);
  if (!SGPRForFPSaveRestoreCopy && !SGPRForBPSaveRestoreCopy)
    return false;

  unsigned NumModifiedRegs = 0;

  if (SGPRForFPSaveRestoreCopy)
    NumModifiedRegs++;
  if (SGPRForBPSaveRestoreCopy)
    NumModifiedRegs++;

  for (auto &CS : CSI) {
    if (CS.getReg() == FramePtrReg && SGPRForFPSaveRestoreCopy) {
      CS.setDstReg(SGPRForFPSaveRestoreCopy);
      // Stop scanning once all expected entries have been rewritten.
      if (--NumModifiedRegs == 0)
        break;
    } else if (CS.getReg() == BasePtrReg && SGPRForBPSaveRestoreCopy) {
      CS.setDstReg(SGPRForBPSaveRestoreCopy);
      if (--NumModifiedRegs == 0)
        break;
    }
  }

  return false;
}

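// Decide whether the emergency scavenging slots must be placed near the
// incoming SP: if the worst-case frame offset still fits in the addressing
// mode's immediate field, the default placement is fine.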
bool SIFrameLowering::allocateScavengingFrameIndexesNearIncomingSP(
    const MachineFunction &MF) const {

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  uint64_t EstStackSize = MFI.estimateStackSize(MF);
  uint64_t MaxOffset = EstStackSize - 1;

  // We need the emergency stack slots to be allocated in range of the
  // MUBUF/flat scratch immediate offset from the base register, so assign
  // these first at the incoming SP position.
  //
  // TODO: We could try sorting the objects to find a hole in the first bytes
  // rather than allocating as close as possible. This could save a lot of
  // space on frames with alignment requirements.
  if (ST.enableFlatScratch()) {
    const SIInstrInfo *TII = ST.getInstrInfo();
    if (TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS,
                               SIInstrFlags::FlatScratch))
      return false;
  } else {
    if (SIInstrInfo::isLegalMUBUFImmOffset(MaxOffset))
      return false;
  }

  return true;
}

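// Lower the call frame setup/destroy pseudos. When the call frame is not
// reserved, the SP must be bumped by the (aligned, scratch-scaled) amount
// around each call; otherwise the pseudos are simply erased.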
MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
  MachineFunction &MF,
  MachineBasicBlock &MBB,
  MachineBasicBlock::iterator I) const {
  int64_t Amount = I->getOperand(0).getImm();
  if (Amount == 0)
    return MBB.erase(I);

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const DebugLoc &DL = I->getDebugLoc();
  unsigned Opc = I->getOpcode();
  bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
  uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;

  if (!hasReservedCallFrame(MF)) {
    Amount = alignTo(Amount, getStackAlign());
    assert(isUInt<32>(Amount) && "exceeded stack address space size");
    const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    Register SPReg = MFI->getStackPtrOffsetReg();

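    // The SP is tracked in swizzled units when MUBUF scratch addressing is
    // used, so the byte amount is scaled by the wavefront size (the scale
    // factor is 1 with flat scratch).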
    Amount *= getScratchScaleFactor(ST);
    if (IsDestroy)
      Amount = -Amount;
    auto Add = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SPReg)
                   .addReg(SPReg)
                   .addImm(Amount);
    Add->getOperand(3).setIsDead(); // Mark SCC as dead.
  } else if (CalleePopAmount != 0) {
    llvm_unreachable("is this used?");
  }

  return MBB.erase(I);
}

/// Returns true if the frame will require a reference to the stack pointer.
///
/// This is the set of conditions common to setting up the stack pointer in a
/// kernel, and for using a frame pointer in a callable function.
///
/// FIXME: Should also check hasOpaqueSPAdjustment and if any inline asm
/// references SP.
static bool frameTriviallyRequiresSP(const MachineFrameInfo &MFI) {
  return MFI.hasVarSizedObjects() || MFI.hasStackMap() || MFI.hasPatchPoint();
}

// The FP for kernels is always known to be 0, so we never really need to set
// up an explicit register for it. However, DisableFramePointerElim will force
// us to use a register for it.
bool SIFrameLowering::hasFP(const MachineFunction &MF) const {
  const MachineFrameInfo &MFI = MF.getFrameInfo();

  // For entry functions we can use an immediate offset in most cases, so the
  // presence of calls doesn't imply we need a distinct frame pointer.
  if (MFI.hasCalls() &&
      !MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
    // All offsets are unsigned, so need to be addressed in the same direction
    // as stack growth.

    // FIXME: This function is pretty broken, since it can be called before the
    // frame layout is determined or CSR spills are inserted.
    return MFI.getStackSize() != 0;
  }

  return frameTriviallyRequiresSP(MFI) || MFI.isFrameAddressTaken() ||
         MF.getSubtarget<GCNSubtarget>().getRegisterInfo()->hasStackRealignment(
             MF) ||
         MF.getTarget().Options.DisableFramePointerElim(MF);
}

// This is essentially a reduced version of hasFP for entry functions. Since
// the stack pointer is known to be 0 on entry to kernels, we never really
// need an FP register. We may need to initialize the stack pointer depending
// on the frame properties, which logically overlaps many of the cases where
// an ordinary function would require an FP.
bool SIFrameLowering::requiresStackPointerReference(
    const MachineFunction &MF) const {
  // Callable functions always require a stack pointer reference.
  assert(MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction() &&
         "only expected to call this for entry points");

  const MachineFrameInfo &MFI = MF.getFrameInfo();

  // Entry points ordinarily don't need to initialize SP. We have to set it up
  // for callees if there are any. Also note tail calls are impossible/don't
  // make any sense for kernels.
  if (MFI.hasCalls())
    return true;

  // We still need to initialize the SP if we're doing anything weird that
  // references the SP, like variable sized stack objects.
  return frameTriviallyRequiresSP(MFI);
}