//===----------------------- SIFrameLowering.cpp --------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//==-----------------------------------------------------------------------===//

#include "SIFrameLowering.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"

#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/RegisterScavenging.h"

using namespace llvm;

#define DEBUG_TYPE "frame-info"

// Find a scratch register that we can use at the start of the prologue to
// re-align the stack pointer. We avoid using callee-save registers since they
// may appear to be free when this is called from canUseAsPrologue (during
// shrink wrapping), but then no longer be free when this is called from
// emitPrologue.
//
// FIXME: This is a bit conservative, since in the above case we could use one
// of the callee-save registers as a scratch temp to re-align the stack pointer,
// but we would then have to make sure that we were in fact saving at least one
// callee-save register in the prologue, which is additional complexity that
// doesn't seem worth the benefit.
static MCRegister findScratchNonCalleeSaveRegister(MachineRegisterInfo &MRI,
                                                   LivePhysRegs &LiveRegs,
                                                   const TargetRegisterClass &RC,
                                                   bool Unused = false) {
  // Mark callee saved registers as used so we will not choose them.
  const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
  for (unsigned i = 0; CSRegs[i]; ++i)
    LiveRegs.addReg(CSRegs[i]);

  if (Unused) {
    // We are looking for a register that can be used throughout the entire
    // function, so any use is unacceptable.
    for (MCRegister Reg : RC) {
      if (!MRI.isPhysRegUsed(Reg) && LiveRegs.available(MRI, Reg))
        return Reg;
    }
  } else {
    for (MCRegister Reg : RC) {
      if (LiveRegs.available(MRI, Reg))
        return Reg;
    }
  }

  // When an unused register is required, failure is acceptable here: the
  // caller has an alternative plan. In other contexts, this must succeed.
  if (!Unused)
    report_fatal_error("failed to find free scratch register");

  return MCRegister();
}

static void getVGPRSpillLaneOrTempRegister(MachineFunction &MF,
                                           LivePhysRegs &LiveRegs,
                                           Register &TempSGPR,
                                           Optional<int> &FrameIndex,
                                           bool IsFP) {
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  MachineFrameInfo &FrameInfo = MF.getFrameInfo();

#ifndef NDEBUG
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
#endif

  // We need to save and restore the current FP/BP.

  // 1: If there is already a VGPR with free lanes, use it. We
  // may already have to pay the penalty for spilling a CSR VGPR.
  if (MFI->haveFreeLanesForSGPRSpill(MF, 1)) {
    int NewFI = FrameInfo.CreateStackObject(4, Align(4), true, nullptr,
                                            TargetStackID::SGPRSpill);

    if (!MFI->allocateSGPRSpillToVGPR(MF, NewFI))
      llvm_unreachable("allocate SGPR spill should have worked");

    FrameIndex = NewFI;

    LLVM_DEBUG(auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front();
               dbgs() << "Spilling " << (IsFP ? "FP" : "BP") << " to "
                      << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane
                      << '\n');
    return;
  }

  // 2: Next, try to save the FP/BP in an unused SGPR.
  TempSGPR = findScratchNonCalleeSaveRegister(
      MF.getRegInfo(), LiveRegs, AMDGPU::SReg_32_XM0_XEXECRegClass, true);

  if (!TempSGPR) {
    int NewFI = FrameInfo.CreateStackObject(4, Align(4), true, nullptr,
                                            TargetStackID::SGPRSpill);

    if (MFI->allocateSGPRSpillToVGPR(MF, NewFI)) {
      // 3: There's no free lane to spill, and no free register to save FP/BP,
      // so we're forced to spill another VGPR to use for the spill.
      FrameIndex = NewFI;

      LLVM_DEBUG(auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front();
                 dbgs() << (IsFP ? "FP" : "BP")
                        << " requires fallback spill to "
                        << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane
                        << '\n');
    } else {
      // 4: If all else fails, spill the FP/BP to memory. There is no
      // SGPR-to-VGPR spill entry to report in this case, so don't query
      // getSGPRToVGPRSpills here.
      FrameIndex = FrameInfo.CreateSpillStackObject(4, Align(4));
      LLVM_DEBUG(dbgs() << (IsFP ? "FP" : "BP")
                        << " requires fallback spill to memory\n");
    }
  } else {
    LLVM_DEBUG(dbgs() << "Saving " << (IsFP ? "FP" : "BP") << " with copy to "
                      << printReg(TempSGPR, TRI) << '\n');
  }
}

// We need to emit the stack operations here specially, because the prologue
// and epilogue use a different frame register than the one getFrameRegister
// returns for the rest of the function.
static void buildPrologSpill(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator I,
                             const SIInstrInfo *TII, Register SpillReg,
                             Register ScratchRsrcReg, Register SPReg, int FI) {
  MachineFunction *MF = MBB.getParent();
  MachineFrameInfo &MFI = MF->getFrameInfo();

  int64_t Offset = MFI.getObjectOffset(FI);

  MachineMemOperand *MMO = MF->getMachineMemOperand(
      MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOStore, 4,
      MFI.getObjectAlign(FI));

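  // MUBUF instructions encode the immediate offset in a 12-bit unsigned
  // field, so an offset that doesn't fit must be materialized in a scratch
  // VGPR and addressed through the OFFEN form instead.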
  if (isUInt<12>(Offset)) {
    BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::BUFFER_STORE_DWORD_OFFSET))
      .addReg(SpillReg, RegState::Kill)
      .addReg(ScratchRsrcReg)
      .addReg(SPReg)
      .addImm(Offset)
      .addImm(0) // glc
      .addImm(0) // slc
      .addImm(0) // tfe
      .addImm(0) // dlc
      .addImm(0) // swz
      .addMemOperand(MMO);
    return;
  }

  MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister(
    MF->getRegInfo(), LiveRegs, AMDGPU::VGPR_32RegClass);

  BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), OffsetReg)
    .addImm(Offset);

  BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::BUFFER_STORE_DWORD_OFFEN))
    .addReg(SpillReg, RegState::Kill)
    .addReg(OffsetReg, RegState::Kill)
    .addReg(ScratchRsrcReg)
    .addReg(SPReg)
    .addImm(0)
    .addImm(0) // glc
    .addImm(0) // slc
    .addImm(0) // tfe
    .addImm(0) // dlc
    .addImm(0) // swz
    .addMemOperand(MMO);
}

static void buildEpilogReload(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator I,
                              const SIInstrInfo *TII, Register SpillReg,
                              Register ScratchRsrcReg, Register SPReg, int FI) {
  MachineFunction *MF = MBB.getParent();
  MachineFrameInfo &MFI = MF->getFrameInfo();
  int64_t Offset = MFI.getObjectOffset(FI);

  MachineMemOperand *MMO = MF->getMachineMemOperand(
      MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOLoad, 4,
      MFI.getObjectAlign(FI));

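  // As in buildPrologSpill, offsets beyond the 12-bit MUBUF immediate go
  // through a scratch VGPR and the OFFEN form.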
  if (isUInt<12>(Offset)) {
    BuildMI(MBB, I, DebugLoc(),
            TII->get(AMDGPU::BUFFER_LOAD_DWORD_OFFSET), SpillReg)
      .addReg(ScratchRsrcReg)
      .addReg(SPReg)
      .addImm(Offset)
      .addImm(0) // glc
      .addImm(0) // slc
      .addImm(0) // tfe
      .addImm(0) // dlc
      .addImm(0) // swz
      .addMemOperand(MMO);
    return;
  }

  MCPhysReg OffsetReg = findScratchNonCalleeSaveRegister(
    MF->getRegInfo(), LiveRegs, AMDGPU::VGPR_32RegClass);

  BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32), OffsetReg)
    .addImm(Offset);

  BuildMI(MBB, I, DebugLoc(),
          TII->get(AMDGPU::BUFFER_LOAD_DWORD_OFFEN), SpillReg)
    .addReg(OffsetReg, RegState::Kill)
    .addReg(ScratchRsrcReg)
    .addReg(SPReg)
    .addImm(0)
    .addImm(0) // glc
    .addImm(0) // slc
    .addImm(0) // tfe
    .addImm(0) // dlc
    .addImm(0) // swz
    .addMemOperand(MMO);
}

// Emit flat scratch setup code, assuming `MFI->hasFlatScratchInit()`
void SIFrameLowering::emitEntryFunctionFlatScratchInit(
    MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
    const DebugLoc &DL, Register ScratchWaveOffsetReg) const {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // We don't need this if we only have spills since there is no user facing
  // scratch.

  // TODO: If we know we don't have flat instructions earlier, we can omit
  // this from the input registers.
  //
  // TODO: We only need to know if we access scratch space through a flat
  // pointer. Because we only detect if flat instructions are used at all,
  // this will be used more often than necessary on VI.

  Register FlatScratchInitReg =
      MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT);

  MachineRegisterInfo &MRI = MF.getRegInfo();
  MRI.addLiveIn(FlatScratchInitReg);
  MBB.addLiveIn(FlatScratchInitReg);

  Register FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
  Register FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);

  // Do a 64-bit pointer add.
  if (ST.flatScratchIsPointer()) {
    if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
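      // On GFX10+, FLAT_SCRATCH is not an addressable SGPR pair; it is
      // programmed through the FLAT_SCR_LO/HI hardware registers via
      // s_setreg, so do the add in the init registers and write the result
      // out with s_setreg_b32.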
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
        .addReg(FlatScrInitLo)
        .addReg(ScratchWaveOffsetReg);
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), FlatScrInitHi)
        .addReg(FlatScrInitHi)
        .addImm(0);
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32))
        .addReg(FlatScrInitLo)
        .addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_LO |
                        (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_)));
      BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_B32))
        .addReg(FlatScrInitHi)
        .addImm(int16_t(AMDGPU::Hwreg::ID_FLAT_SCR_HI |
                        (31 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_)));
      return;
    }

    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), AMDGPU::FLAT_SCR_LO)
      .addReg(FlatScrInitLo)
      .addReg(ScratchWaveOffsetReg);
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), AMDGPU::FLAT_SCR_HI)
      .addReg(FlatScrInitHi)
      .addImm(0);

    return;
  }

  assert(ST.getGeneration() < AMDGPUSubtarget::GFX10);

  // Copy the size in bytes.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::FLAT_SCR_LO)
    .addReg(FlatScrInitHi, RegState::Kill);

  // Add wave offset in bytes to private base offset.
  // See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
    .addReg(FlatScrInitLo)
    .addReg(ScratchWaveOffsetReg);

  // Convert offset to 256-byte units.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_LSHR_B32), AMDGPU::FLAT_SCR_HI)
    .addReg(FlatScrInitLo, RegState::Kill)
    .addImm(8);
}
304 
305 // Shift down registers reserved for the scratch RSRC.
306 Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg(
307     MachineFunction &MF) const {
308 
309   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
310   const SIInstrInfo *TII = ST.getInstrInfo();
311   const SIRegisterInfo *TRI = &TII->getRegisterInfo();
312   MachineRegisterInfo &MRI = MF.getRegInfo();
313   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
314 
315   assert(MFI->isEntryFunction());
316 
317   Register ScratchRsrcReg = MFI->getScratchRSrcReg();
318 
319   if (!ScratchRsrcReg || !MRI.isPhysRegUsed(ScratchRsrcReg))
320     return Register();
321 
322   if (ST.hasSGPRInitBug() ||
323       ScratchRsrcReg != TRI->reservedPrivateSegmentBufferReg(MF))
324     return ScratchRsrcReg;
325 
326   // We reserved the last registers for this. Shift it down to the end of those
327   // which were actually used.
328   //
329   // FIXME: It might be safer to use a pseudoregister before replacement.
330 
331   // FIXME: We should be able to eliminate unused input registers. We only
332   // cannot do this for the resources required for scratch access. For now we
333   // skip over user SGPRs and may leave unused holes.
334 
335   unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 3) / 4;
336   ArrayRef<MCPhysReg> AllSGPR128s = TRI->getAllSGPR128(MF);
337   AllSGPR128s = AllSGPR128s.slice(std::min(static_cast<unsigned>(AllSGPR128s.size()), NumPreloaded));
338 
339   // Skip the last N reserved elements because they should have already been
340   // reserved for VCC etc.
341   Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
342   for (MCPhysReg Reg : AllSGPR128s) {
343     // Pick the first unallocated one. Make sure we don't clobber the other
344     // reserved input we needed. Also for PAL, make sure we don't clobber
345     // the GIT pointer passed in SGPR0 or SGPR8.
346     if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) &&
347         !TRI->isSubRegisterEq(Reg, GITPtrLoReg)) {
348       MRI.replaceRegWith(ScratchRsrcReg, Reg);
349       MFI->setScratchRSrcReg(Reg);
350       return Reg;
351     }
352   }
353 
354   return ScratchRsrcReg;
355 }

void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
                                                MachineBasicBlock &MBB) const {
  assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");

  // FIXME: If we only have SGPR spills, we won't actually be using scratch
  // memory since these spill to VGPRs. We should be cleaning up these unused
  // SGPR spill frame indices somewhere.

  // FIXME: We still have implicit uses on SGPR spill instructions in case they
  // need to spill to vector memory. It's likely that will not happen, but at
  // this point it appears we need the setup. This part of the prolog should be
  // emitted after frame indices are eliminated.

  // FIXME: Remove all of the isPhysRegUsed checks

  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const Function &F = MF.getFunction();

  assert(MFI->isEntryFunction());

  Register PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg(
      AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
  // FIXME: Hack to not crash in situations which emitted an error.
  if (!PreloadedScratchWaveOffsetReg)
    return;

  // We need to do the replacement of the private segment buffer register even
  // if there are no stack objects. There could be stores to undef or a
  // constant without an associated object.
  //
  // This will return `Register()` in cases where there are no actual
  // uses of the SRSRC.
  Register ScratchRsrcReg = getEntryFunctionReservedScratchRsrcReg(MF);

  // Make the selected register live throughout the function.
  if (ScratchRsrcReg) {
    for (MachineBasicBlock &OtherBB : MF) {
      if (&OtherBB != &MBB) {
        OtherBB.addLiveIn(ScratchRsrcReg);
      }
    }
  }

  // Now that we have fixed the reserved SRSRC we need to locate the
  // (potentially) preloaded SRSRC.
  Register PreloadedScratchRsrcReg;
  if (ST.isAmdHsaOrMesa(F)) {
    PreloadedScratchRsrcReg =
        MFI->getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
    if (ScratchRsrcReg && PreloadedScratchRsrcReg) {
      // We added live-ins during argument lowering, but since they were not
      // used they were deleted. We're adding the uses now, so add them back.
      MRI.addLiveIn(PreloadedScratchRsrcReg);
      MBB.addLiveIn(PreloadedScratchRsrcReg);
    }
  }

  // Debug location must be unknown since the first debug location is used to
  // determine the end of the prologue.
  DebugLoc DL;
  MachineBasicBlock::iterator I = MBB.begin();

  // We found the SRSRC first because it needs four registers and has an
  // alignment requirement. If the SRSRC that we found clobbers the scratch
  // wave offset, which may be in a fixed SGPR or a free SGPR chosen by
  // SITargetLowering::allocateSystemSGPRs, COPY the scratch wave offset to a
  // free SGPR.
  Register ScratchWaveOffsetReg;
  if (TRI->isSubRegisterEq(ScratchRsrcReg, PreloadedScratchWaveOffsetReg)) {
    ArrayRef<MCPhysReg> AllSGPRs = TRI->getAllSGPR32(MF);
    unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
    AllSGPRs = AllSGPRs.slice(
        std::min(static_cast<unsigned>(AllSGPRs.size()), NumPreloaded));
    Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
    for (MCPhysReg Reg : AllSGPRs) {
      if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) &&
          !TRI->isSubRegisterEq(ScratchRsrcReg, Reg) && GITPtrLoReg != Reg) {
        ScratchWaveOffsetReg = Reg;
        BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg)
            .addReg(PreloadedScratchWaveOffsetReg, RegState::Kill);
        break;
      }
    }
  } else {
    ScratchWaveOffsetReg = PreloadedScratchWaveOffsetReg;
  }
  assert(ScratchWaveOffsetReg);

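  // The frame's stack size is counted in per-lane bytes, while the SP holds a
  // byte offset into the swizzled per-wave scratch allocation, hence the
  // scaling by the wavefront size below.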
  if (MF.getFrameInfo().hasCalls()) {
    Register SPReg = MFI->getStackPtrOffsetReg();
    assert(SPReg != AMDGPU::SP_REG);
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), SPReg)
        .addImm(MF.getFrameInfo().getStackSize() * ST.getWavefrontSize());
  }

  if (hasFP(MF)) {
    Register FPReg = MFI->getFrameOffsetReg();
    assert(FPReg != AMDGPU::FP_REG);
    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0);
  }

  if (MFI->hasFlatScratchInit() || ScratchRsrcReg) {
    MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
    MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
  }

  if (MFI->hasFlatScratchInit()) {
    emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg);
  }

  if (ScratchRsrcReg) {
    emitEntryFunctionScratchRsrcRegSetup(MF, MBB, I, DL,
                                         PreloadedScratchRsrcReg,
                                         ScratchRsrcReg, ScratchWaveOffsetReg);
  }
}

// Emit scratch RSRC setup code, assuming `ScratchRsrcReg != AMDGPU::NoReg`
void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
    MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
    const DebugLoc &DL, Register PreloadedScratchRsrcReg,
    Register ScratchRsrcReg, Register ScratchWaveOffsetReg) const {

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = &TII->getRegisterInfo();
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  const Function &Fn = MF.getFunction();

  if (ST.isAmdPalOS()) {
    // The pointer to the GIT is formed from the offset passed in and either
    // the amdgpu-git-ptr-high function attribute or the top part of the PC.
    Register RsrcLo = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
    Register RsrcHi = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
    Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);

    const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);

    if (MFI->getGITPtrHigh() != 0xffffffff) {
      BuildMI(MBB, I, DL, SMovB32, RsrcHi)
        .addImm(MFI->getGITPtrHigh())
        .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
    } else {
      const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64);
      BuildMI(MBB, I, DL, GetPC64, Rsrc01);
    }
    Register GitPtrLo = MFI->getGITPtrLoReg(MF);
    MF.getRegInfo().addLiveIn(GitPtrLo);
    MBB.addLiveIn(GitPtrLo);
    BuildMI(MBB, I, DL, SMovB32, RsrcLo)
      .addReg(GitPtrLo)
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

    // We now have the GIT ptr - now get the scratch descriptor from the entry
    // at offset 0 (or offset 16 for a compute shader).
    MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
    const MCInstrDesc &LoadDwordX4 = TII->get(AMDGPU::S_LOAD_DWORDX4_IMM);
    auto MMO = MF.getMachineMemOperand(PtrInfo,
                                       MachineMemOperand::MOLoad |
                                           MachineMemOperand::MOInvariant |
                                           MachineMemOperand::MODereferenceable,
                                       16, Align(4));
    unsigned Offset = Fn.getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
    unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(ST, Offset);
    BuildMI(MBB, I, DL, LoadDwordX4, ScratchRsrcReg)
      .addReg(Rsrc01)
      .addImm(EncodedOffset) // offset
      .addImm(0) // glc
      .addImm(0) // dlc
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine)
      .addMemOperand(MMO);
  } else if (ST.isMesaGfxShader(Fn) || !PreloadedScratchRsrcReg) {
    assert(!ST.isAmdHsaOrMesa(Fn));
    const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);

    Register Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
    Register Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);

    // Use relocations to get the pointer, and setup the other bits manually.
    uint64_t Rsrc23 = TII->getScratchRsrcWords23();
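    // Dwords 2-3 of a buffer resource descriptor hold NUM_RECORDS and the
    // format/stride flag fields; getScratchRsrcWords23 supplies constants
    // appropriate for swizzled scratch access on this subtarget.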

    if (MFI->hasImplicitBufferPtr()) {
      Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);

      if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
        const MCInstrDesc &Mov64 = TII->get(AMDGPU::S_MOV_B64);

        BuildMI(MBB, I, DL, Mov64, Rsrc01)
          .addReg(MFI->getImplicitBufferPtrUserSGPR())
          .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
      } else {
        const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);

        MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
        auto MMO = MF.getMachineMemOperand(
            PtrInfo,
            MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant |
                MachineMemOperand::MODereferenceable,
            8, Align(4));
        BuildMI(MBB, I, DL, LoadDwordX2, Rsrc01)
          .addReg(MFI->getImplicitBufferPtrUserSGPR())
          .addImm(0) // offset
          .addImm(0) // glc
          .addImm(0) // dlc
          .addMemOperand(MMO)
          .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

        MF.getRegInfo().addLiveIn(MFI->getImplicitBufferPtrUserSGPR());
        MBB.addLiveIn(MFI->getImplicitBufferPtrUserSGPR());
      }
    } else {
      Register Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
      Register Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);

      BuildMI(MBB, I, DL, SMovB32, Rsrc0)
        .addExternalSymbol("SCRATCH_RSRC_DWORD0")
        .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

      BuildMI(MBB, I, DL, SMovB32, Rsrc1)
        .addExternalSymbol("SCRATCH_RSRC_DWORD1")
        .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
    }

    BuildMI(MBB, I, DL, SMovB32, Rsrc2)
      .addImm(Rsrc23 & 0xffffffff)
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);

    BuildMI(MBB, I, DL, SMovB32, Rsrc3)
      .addImm(Rsrc23 >> 32)
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
  } else if (ST.isAmdHsaOrMesa(Fn)) {
    assert(PreloadedScratchRsrcReg);

    if (ScratchRsrcReg != PreloadedScratchRsrcReg) {
      BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
          .addReg(PreloadedScratchRsrcReg, RegState::Kill);
    }
  }

  // Add the scratch wave offset into the scratch RSRC.
  //
  // We only want to update the first 48 bits, which is the base address
  // pointer, without touching the adjacent 16 bits of flags. We know this add
  // cannot carry-out from bit 47, otherwise the scratch allocation would be
  // impossible to fit in the 48-bit global address space.
  //
  // TODO: Evaluate if it is better to just construct an SRD using the flat
  // scratch init and some constants rather than update the one we are passed.
  Register ScratchRsrcSub0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
  Register ScratchRsrcSub1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);

  // We cannot Kill ScratchWaveOffsetReg here because we allow it to be used in
  // the kernel body via inreg arguments.
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), ScratchRsrcSub0)
      .addReg(ScratchRsrcSub0)
      .addReg(ScratchWaveOffsetReg)
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), ScratchRsrcSub1)
      .addReg(ScratchRsrcSub1)
      .addImm(0)
      .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
}

bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const {
  switch (ID) {
  case TargetStackID::Default:
  case TargetStackID::NoAlloc:
  case TargetStackID::SGPRSpill:
    return true;
  case TargetStackID::SVEVector:
    return false;
  }
  llvm_unreachable("Invalid TargetStackID::Value");
}

// Activate all lanes, returns saved exec.
static Register buildScratchExecCopy(LivePhysRegs &LiveRegs,
                                     MachineFunction &MF,
                                     MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator MBBI,
                                     bool IsProlog) {
  Register ScratchExecCopy;
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  DebugLoc DL;

  if (LiveRegs.empty()) {
    if (IsProlog) {
      LiveRegs.init(TRI);
      LiveRegs.addLiveIns(MBB);
      if (FuncInfo->SGPRForFPSaveRestoreCopy)
        LiveRegs.removeReg(FuncInfo->SGPRForFPSaveRestoreCopy);

      if (FuncInfo->SGPRForBPSaveRestoreCopy)
        LiveRegs.removeReg(FuncInfo->SGPRForBPSaveRestoreCopy);
    } else {
      // In epilog.
      LiveRegs.init(*ST.getRegisterInfo());
      LiveRegs.addLiveOuts(MBB);
      LiveRegs.stepBackward(*MBBI);
    }
  }

  ScratchExecCopy = findScratchNonCalleeSaveRegister(
      MRI, LiveRegs, *TRI.getWaveMaskRegClass());

  if (!IsProlog)
    LiveRegs.removeReg(ScratchExecCopy);

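  // S_OR_SAVEEXEC writes the current exec mask to its destination and then
  // ORs the source operand into exec; with a -1 source this enables all
  // lanes while preserving a copy of the original mask.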
  const unsigned OrSaveExec =
      ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
  BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), ScratchExecCopy).addImm(-1);

  return ScratchExecCopy;
}

void SIFrameLowering::emitPrologue(MachineFunction &MF,
                                   MachineBasicBlock &MBB) const {
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  if (FuncInfo->isEntryFunction()) {
    emitEntryFunctionPrologue(MF, MBB);
    return;
  }

  const MachineFrameInfo &MFI = MF.getFrameInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();

  Register StackPtrReg = FuncInfo->getStackPtrOffsetReg();
  Register FramePtrReg = FuncInfo->getFrameOffsetReg();
  Register BasePtrReg =
      TRI.hasBasePointer(MF) ? TRI.getBaseRegister() : Register();
  LivePhysRegs LiveRegs;

  MachineBasicBlock::iterator MBBI = MBB.begin();
  DebugLoc DL;

  bool HasFP = false;
  bool HasBP = false;
  uint32_t NumBytes = MFI.getStackSize();
  uint32_t RoundedSize = NumBytes;
  // To avoid clobbering VGPRs in lanes that weren't active on function entry,
  // turn on all lanes before doing the spill to memory.
  Register ScratchExecCopy;

  bool HasFPSaveIndex = FuncInfo->FramePointerSaveIndex.hasValue();
  bool SpillFPToMemory = false;
  // A StackID of SGPRSpill implies that this is a spill from SGPR to VGPR.
  // Otherwise we are spilling the FP to memory.
  if (HasFPSaveIndex) {
    SpillFPToMemory = MFI.getStackID(*FuncInfo->FramePointerSaveIndex) !=
                      TargetStackID::SGPRSpill;
  }

  bool HasBPSaveIndex = FuncInfo->BasePointerSaveIndex.hasValue();
  bool SpillBPToMemory = false;
  // A StackID of SGPRSpill implies that this is a spill from SGPR to VGPR.
  // Otherwise we are spilling the BP to memory.
  if (HasBPSaveIndex) {
    SpillBPToMemory = MFI.getStackID(*FuncInfo->BasePointerSaveIndex) !=
                      TargetStackID::SGPRSpill;
  }

730 
731   // Emit the copy if we need an FP, and are using a free SGPR to save it.
732   if (FuncInfo->SGPRForFPSaveRestoreCopy) {
733     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FuncInfo->SGPRForFPSaveRestoreCopy)
734       .addReg(FramePtrReg)
735       .setMIFlag(MachineInstr::FrameSetup);
736   }
737 
738   // Emit the copy if we need a BP, and are using a free SGPR to save it.
739   if (FuncInfo->SGPRForBPSaveRestoreCopy) {
740     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY),
741             FuncInfo->SGPRForBPSaveRestoreCopy)
742         .addReg(BasePtrReg)
743         .setMIFlag(MachineInstr::FrameSetup);
744   }
745 
  // If a copy has been emitted for FP and/or BP, make the SGPRs
  // used in the copy instructions live throughout the function.
  SmallVector<MCPhysReg, 2> TempSGPRs;
  if (FuncInfo->SGPRForFPSaveRestoreCopy)
    TempSGPRs.push_back(FuncInfo->SGPRForFPSaveRestoreCopy);

  if (FuncInfo->SGPRForBPSaveRestoreCopy)
    TempSGPRs.push_back(FuncInfo->SGPRForBPSaveRestoreCopy);

  if (!TempSGPRs.empty()) {
    for (MachineBasicBlock &MBB : MF) {
      for (MCPhysReg Reg : TempSGPRs)
        MBB.addLiveIn(Reg);

      MBB.sortUniqueLiveIns();
    }
  }

  for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg :
       FuncInfo->getSGPRSpillVGPRs()) {
    if (!Reg.FI.hasValue())
      continue;

    if (!ScratchExecCopy)
      ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, true);

    buildPrologSpill(LiveRegs, MBB, MBBI, TII, Reg.VGPR,
                     FuncInfo->getScratchRSrcReg(), StackPtrReg,
                     Reg.FI.getValue());
  }

  if (HasFPSaveIndex && SpillFPToMemory) {
    assert(!MFI.isDeadObjectIndex(FuncInfo->FramePointerSaveIndex.getValue()));

    if (!ScratchExecCopy)
      ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, true);

    MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
        MRI, LiveRegs, AMDGPU::VGPR_32RegClass);

    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
        .addReg(FramePtrReg);

    buildPrologSpill(LiveRegs, MBB, MBBI, TII, TmpVGPR,
                     FuncInfo->getScratchRSrcReg(), StackPtrReg,
                     FuncInfo->FramePointerSaveIndex.getValue());
  }

  if (HasBPSaveIndex && SpillBPToMemory) {
    assert(!MFI.isDeadObjectIndex(*FuncInfo->BasePointerSaveIndex));

    if (!ScratchExecCopy)
      ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, true);

    MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
        MRI, LiveRegs, AMDGPU::VGPR_32RegClass);

    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
        .addReg(BasePtrReg);

    buildPrologSpill(LiveRegs, MBB, MBBI, TII, TmpVGPR,
                     FuncInfo->getScratchRSrcReg(), StackPtrReg,
                     *FuncInfo->BasePointerSaveIndex);
  }

  if (ScratchExecCopy) {
    // FIXME: Split block and make terminator.
    unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
    MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
    BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec)
        .addReg(ScratchExecCopy, RegState::Kill);
    LiveRegs.addReg(ScratchExecCopy);
  }

  // If the FP save goes to a VGPR lane rather than memory, write it now.
  if (HasFPSaveIndex && !SpillFPToMemory) {
    const int FI = FuncInfo->FramePointerSaveIndex.getValue();
    assert(!MFI.isDeadObjectIndex(FI));

    assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
    ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill =
        FuncInfo->getSGPRToVGPRSpills(FI);
    assert(Spill.size() == 1);

    // Save FP before setting it up.
    // FIXME: This should respect spillSGPRToVGPR.
    BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
            Spill[0].VGPR)
        .addReg(FramePtrReg)
        .addImm(Spill[0].Lane)
        .addReg(Spill[0].VGPR, RegState::Undef);
  }

  // Likewise, if the BP save goes to a VGPR lane, write it now.
  if (HasBPSaveIndex && !SpillBPToMemory) {
    const int BasePtrFI = *FuncInfo->BasePointerSaveIndex;
    assert(!MFI.isDeadObjectIndex(BasePtrFI));

    assert(MFI.getStackID(BasePtrFI) == TargetStackID::SGPRSpill);
    ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill =
        FuncInfo->getSGPRToVGPRSpills(BasePtrFI);
    assert(Spill.size() == 1);

    // Save BP before setting it up.
    // FIXME: This should respect spillSGPRToVGPR.
    BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
            Spill[0].VGPR)
        .addReg(BasePtrReg)
        .addImm(Spill[0].Lane)
        .addReg(Spill[0].VGPR, RegState::Undef);
  }

  if (TRI.needsStackRealignment(MF)) {
    HasFP = true;
    const unsigned Alignment = MFI.getMaxAlign().value();

    RoundedSize += Alignment;
    if (LiveRegs.empty()) {
      LiveRegs.init(TRI);
      LiveRegs.addLiveIns(MBB);
      LiveRegs.addReg(FuncInfo->SGPRForFPSaveRestoreCopy);
      LiveRegs.addReg(FuncInfo->SGPRForBPSaveRestoreCopy);
    }

    Register ScratchSPReg = findScratchNonCalleeSaveRegister(
        MRI, LiveRegs, AMDGPU::SReg_32_XM0RegClass);
    assert(ScratchSPReg && ScratchSPReg != FuncInfo->SGPRForFPSaveRestoreCopy &&
           ScratchSPReg != FuncInfo->SGPRForBPSaveRestoreCopy);

    // s_add_u32 tmp_reg, s32, (Alignment - 1) * WavefrontSize
    // s_and_b32 fp, tmp_reg, -Alignment * WavefrontSize
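    //
    // For example, with MaxAlign = 16 bytes on a wave64 target this is:
    //   s_add_u32 tmp, s32, 960        ; (16 - 1) * 64
    //   s_and_b32 fp, tmp, 0xfffffc00  ; -(16 * 64)
    // which rounds the swizzled SP up to the next 16-byte per-lane boundary.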
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), ScratchSPReg)
        .addReg(StackPtrReg)
        .addImm((Alignment - 1) * ST.getWavefrontSize())
        .setMIFlag(MachineInstr::FrameSetup);
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_AND_B32), FramePtrReg)
        .addReg(ScratchSPReg, RegState::Kill)
        .addImm(-Alignment * ST.getWavefrontSize())
        .setMIFlag(MachineInstr::FrameSetup);
    FuncInfo->setIsStackRealigned(true);
  } else if ((HasFP = hasFP(MF))) {
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
        .addReg(StackPtrReg)
        .setMIFlag(MachineInstr::FrameSetup);
  }

  // If we need a base pointer, set it up here. It's whatever the value of
  // the stack pointer is at this point. Any variable size objects will be
  // allocated after this, so we can still use the base pointer to reference
  // the incoming arguments.
  if ((HasBP = TRI.hasBasePointer(MF))) {
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), BasePtrReg)
        .addReg(StackPtrReg)
        .setMIFlag(MachineInstr::FrameSetup);
  }

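  // RoundedSize is in per-lane bytes; the SP advances by that amount scaled
  // to the swizzled per-wave allocation.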
  if (HasFP && RoundedSize != 0) {
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), StackPtrReg)
        .addReg(StackPtrReg)
        .addImm(RoundedSize * ST.getWavefrontSize())
        .setMIFlag(MachineInstr::FrameSetup);
  }

  assert((!HasFP || (FuncInfo->SGPRForFPSaveRestoreCopy ||
                     FuncInfo->FramePointerSaveIndex)) &&
         "Needed to save FP but didn't save it anywhere");

  assert((HasFP || (!FuncInfo->SGPRForFPSaveRestoreCopy &&
                    !FuncInfo->FramePointerSaveIndex)) &&
         "Saved FP but didn't need it");

  assert((!HasBP || (FuncInfo->SGPRForBPSaveRestoreCopy ||
                     FuncInfo->BasePointerSaveIndex)) &&
         "Needed to save BP but didn't save it anywhere");

  assert((HasBP || (!FuncInfo->SGPRForBPSaveRestoreCopy &&
                    !FuncInfo->BasePointerSaveIndex)) &&
         "Saved BP but didn't need it");
}

void SIFrameLowering::emitEpilogue(MachineFunction &MF,
                                   MachineBasicBlock &MBB) const {
  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  if (FuncInfo->isEntryFunction())
    return;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  MachineRegisterInfo &MRI = MF.getRegInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
  LivePhysRegs LiveRegs;
  DebugLoc DL;

  const MachineFrameInfo &MFI = MF.getFrameInfo();
  uint32_t NumBytes = MFI.getStackSize();
  uint32_t RoundedSize = FuncInfo->isStackRealigned()
                             ? NumBytes + MFI.getMaxAlign().value()
                             : NumBytes;
  const Register StackPtrReg = FuncInfo->getStackPtrOffsetReg();
  const Register FramePtrReg = FuncInfo->getFrameOffsetReg();
  const Register BasePtrReg =
      TRI.hasBasePointer(MF) ? TRI.getBaseRegister() : Register();

  bool HasFPSaveIndex = FuncInfo->FramePointerSaveIndex.hasValue();
  bool SpillFPToMemory = false;
  if (HasFPSaveIndex) {
    SpillFPToMemory = MFI.getStackID(*FuncInfo->FramePointerSaveIndex) !=
                      TargetStackID::SGPRSpill;
  }

  bool HasBPSaveIndex = FuncInfo->BasePointerSaveIndex.hasValue();
  bool SpillBPToMemory = false;
  if (HasBPSaveIndex) {
    SpillBPToMemory = MFI.getStackID(*FuncInfo->BasePointerSaveIndex) !=
                      TargetStackID::SGPRSpill;
  }

  if (RoundedSize != 0 && hasFP(MF)) {
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_SUB_U32), StackPtrReg)
      .addReg(StackPtrReg)
      .addImm(RoundedSize * ST.getWavefrontSize())
      .setMIFlag(MachineInstr::FrameDestroy);
  }

  if (FuncInfo->SGPRForFPSaveRestoreCopy) {
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
        .addReg(FuncInfo->SGPRForFPSaveRestoreCopy)
        .setMIFlag(MachineInstr::FrameDestroy);
  }

  if (FuncInfo->SGPRForBPSaveRestoreCopy) {
    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), BasePtrReg)
        .addReg(FuncInfo->SGPRForBPSaveRestoreCopy)
        .setMIFlag(MachineInstr::FrameDestroy);
  }

  Register ScratchExecCopy;
  if (HasFPSaveIndex) {
    const int FI = FuncInfo->FramePointerSaveIndex.getValue();
    assert(!MFI.isDeadObjectIndex(FI));
    if (SpillFPToMemory) {
      if (!ScratchExecCopy)
        ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, false);

      MCPhysReg TempVGPR = findScratchNonCalleeSaveRegister(
          MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
      buildEpilogReload(LiveRegs, MBB, MBBI, TII, TempVGPR,
                        FuncInfo->getScratchRSrcReg(), StackPtrReg, FI);
      BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), FramePtrReg)
          .addReg(TempVGPR, RegState::Kill);
    } else {
      // Reload from VGPR spill.
      assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
      ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill =
          FuncInfo->getSGPRToVGPRSpills(FI);
      assert(Spill.size() == 1);
      BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
              FramePtrReg)
          .addReg(Spill[0].VGPR)
          .addImm(Spill[0].Lane);
    }
  }

  if (HasBPSaveIndex) {
    const int BasePtrFI = *FuncInfo->BasePointerSaveIndex;
    assert(!MFI.isDeadObjectIndex(BasePtrFI));
    if (SpillBPToMemory) {
      if (!ScratchExecCopy)
        ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, false);

      MCPhysReg TempVGPR = findScratchNonCalleeSaveRegister(
          MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
      buildEpilogReload(LiveRegs, MBB, MBBI, TII, TempVGPR,
                        FuncInfo->getScratchRSrcReg(), StackPtrReg, BasePtrFI);
      BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), BasePtrReg)
          .addReg(TempVGPR, RegState::Kill);
    } else {
      // Reload from VGPR spill.
      assert(MFI.getStackID(BasePtrFI) == TargetStackID::SGPRSpill);
      ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill =
          FuncInfo->getSGPRToVGPRSpills(BasePtrFI);
      assert(Spill.size() == 1);
      BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
              BasePtrReg)
          .addReg(Spill[0].VGPR)
          .addImm(Spill[0].Lane);
    }
  }

  for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg :
       FuncInfo->getSGPRSpillVGPRs()) {
    if (!Reg.FI.hasValue())
      continue;

    if (!ScratchExecCopy)
      ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, false);

    buildEpilogReload(LiveRegs, MBB, MBBI, TII, Reg.VGPR,
                      FuncInfo->getScratchRSrcReg(), StackPtrReg,
                      Reg.FI.getValue());
  }

  if (ScratchExecCopy) {
    // FIXME: Split block and make terminator.
    unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
    MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
    BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec)
        .addReg(ScratchExecCopy, RegState::Kill);
  }
}

// Note SGPRSpill stack IDs should only be used for SGPR spilling to VGPRs, not
// memory. They should have been removed by now.
static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) {
  for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
       I != E; ++I) {
    if (!MFI.isDeadObjectIndex(I))
      return false;
  }

  return true;
}

#ifndef NDEBUG
static bool allSGPRSpillsAreDead(const MachineFrameInfo &MFI,
                                 Optional<int> FramePointerSaveIndex,
                                 Optional<int> BasePointerSaveIndex) {
  for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
       I != E; ++I) {
    if (!MFI.isDeadObjectIndex(I) &&
        MFI.getStackID(I) == TargetStackID::SGPRSpill &&
        ((FramePointerSaveIndex && I != FramePointerSaveIndex) ||
         (BasePointerSaveIndex && I != BasePointerSaveIndex))) {
      return false;
    }
  }

  return true;
}
#endif

int SIFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
                                            Register &FrameReg) const {
  const SIRegisterInfo *RI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();

  FrameReg = RI->getFrameRegister(MF);
  return MF.getFrameInfo().getObjectOffset(FI);
}

void SIFrameLowering::processFunctionBeforeFrameFinalized(
  MachineFunction &MF,
  RegScavenger *RS) const {
  MachineFrameInfo &MFI = MF.getFrameInfo();

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();

  FuncInfo->removeDeadFrameIndices(MFI);
  assert(allSGPRSpillsAreDead(MFI, None, None) &&
         "SGPR spill should have been removed in SILowerSGPRSpills");

  // FIXME: The other checks should be redundant with allStackObjectsAreDead,
  // but currently hasNonSpillStackObjects is set only from source
  // allocas. Stack temps produced from legalization are not counted currently.
  if (!allStackObjectsAreDead(MFI)) {
    assert(RS && "RegScavenger required if spilling");

    if (FuncInfo->isEntryFunction()) {
      int ScavengeFI = MFI.CreateFixedObject(
          TRI->getSpillSize(AMDGPU::SGPR_32RegClass), 0, false);
      RS->addScavengingFrameIndex(ScavengeFI);
    } else {
      int ScavengeFI = MFI.CreateStackObject(
          TRI->getSpillSize(AMDGPU::SGPR_32RegClass),
          TRI->getSpillAlign(AMDGPU::SGPR_32RegClass), false);
      RS->addScavengingFrameIndex(ScavengeFI);
    }
  }
}

// Only report VGPRs to generic code.
void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
                                           BitVector &SavedVGPRs,
                                           RegScavenger *RS) const {
  TargetFrameLowering::determineCalleeSaves(MF, SavedVGPRs, RS);
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  if (MFI->isEntryFunction())
    return;

  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  // Ignore the SGPRs the default implementation found.
  SavedVGPRs.clearBitsNotInMask(TRI->getAllVGPRRegMask());

  // hasFP only knows about stack objects that already exist. We're now
  // determining the stack slots that will be created, so we have to predict
  // them. Stack objects force FP usage with calls.
  //
  // Note a new VGPR CSR may be introduced if one is used for the spill, but we
  // don't want to report it here.
  //
  // FIXME: Is this really hasReservedCallFrame?
  const bool WillHaveFP =
      FrameInfo.hasCalls() &&
      (SavedVGPRs.any() || !allStackObjectsAreDead(FrameInfo));

  // VGPRs used for SGPR spilling need to be specially inserted in the prolog,
  // so don't allow the default insertion to handle them.
  for (auto SSpill : MFI->getSGPRSpillVGPRs())
    SavedVGPRs.reset(SSpill.VGPR);

  LivePhysRegs LiveRegs;
  LiveRegs.init(*TRI);

  if (WillHaveFP || hasFP(MF)) {
    getVGPRSpillLaneOrTempRegister(MF, LiveRegs, MFI->SGPRForFPSaveRestoreCopy,
                                   MFI->FramePointerSaveIndex, true);
  }

  if (TRI->hasBasePointer(MF)) {
    if (MFI->SGPRForFPSaveRestoreCopy)
      LiveRegs.addReg(MFI->SGPRForFPSaveRestoreCopy);
    getVGPRSpillLaneOrTempRegister(MF, LiveRegs, MFI->SGPRForBPSaveRestoreCopy,
                                   MFI->BasePointerSaveIndex, false);
  }
}

void SIFrameLowering::determineCalleeSavesSGPR(MachineFunction &MF,
                                               BitVector &SavedRegs,
                                               RegScavenger *RS) const {
  TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  if (MFI->isEntryFunction())
    return;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  // The SP is specifically managed and we don't want extra spills of it.
  SavedRegs.reset(MFI->getStackPtrOffsetReg());
  SavedRegs.clearBitsInMask(TRI->getAllVGPRRegMask());
}

bool SIFrameLowering::assignCalleeSavedSpillSlots(
    MachineFunction &MF, const TargetRegisterInfo *TRI,
    std::vector<CalleeSavedInfo> &CSI) const {
  if (CSI.empty())
    return true; // Early exit if no callee saved registers are modified!

  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  if (!FuncInfo->SGPRForFPSaveRestoreCopy &&
      !FuncInfo->SGPRForBPSaveRestoreCopy)
    return false;

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIRegisterInfo *RI = ST.getRegisterInfo();
  Register FramePtrReg = FuncInfo->getFrameOffsetReg();
  Register BasePtrReg = RI->getBaseRegister();
  unsigned NumModifiedRegs = 0;

  if (FuncInfo->SGPRForFPSaveRestoreCopy)
    NumModifiedRegs++;
  if (FuncInfo->SGPRForBPSaveRestoreCopy)
    NumModifiedRegs++;

  for (auto &CS : CSI) {
    if (CS.getReg() == FramePtrReg && FuncInfo->SGPRForFPSaveRestoreCopy) {
      CS.setDstReg(FuncInfo->SGPRForFPSaveRestoreCopy);
      // Stop once every modified register has been redirected.
      if (--NumModifiedRegs == 0)
        break;
    } else if (CS.getReg() == BasePtrReg &&
               FuncInfo->SGPRForBPSaveRestoreCopy) {
      CS.setDstReg(FuncInfo->SGPRForBPSaveRestoreCopy);
      if (--NumModifiedRegs == 0)
        break;
    }
  }

  return false;
}

MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
  MachineFunction &MF,
  MachineBasicBlock &MBB,
  MachineBasicBlock::iterator I) const {
  int64_t Amount = I->getOperand(0).getImm();
  if (Amount == 0)
    return MBB.erase(I);

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const DebugLoc &DL = I->getDebugLoc();
  unsigned Opc = I->getOpcode();
  bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
  uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;

  if (!hasReservedCallFrame(MF)) {
    Amount = alignTo(Amount, getStackAlign());
    assert(isUInt<32>(Amount) && "exceeded stack address space size");
    const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
    Register SPReg = MFI->getStackPtrOffsetReg();

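    // Amount is a per-lane byte count; the SP tracks bytes in the swizzled
    // per-wave scratch allocation, so scale by the wavefront size.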
    unsigned Op = IsDestroy ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
    BuildMI(MBB, I, DL, TII->get(Op), SPReg)
      .addReg(SPReg)
      .addImm(Amount * ST.getWavefrontSize());
  } else if (CalleePopAmount != 0) {
    llvm_unreachable("is this used?");
  }

  return MBB.erase(I);
}

bool SIFrameLowering::hasFP(const MachineFunction &MF) const {
  const MachineFrameInfo &MFI = MF.getFrameInfo();

  // For entry functions we can use an immediate offset in most cases, so the
  // presence of calls doesn't imply we need a distinct frame pointer.
  if (MFI.hasCalls() &&
      !MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
    // All offsets are unsigned, so need to be addressed in the same direction
    // as stack growth.

    // FIXME: This function is pretty broken, since it can be called before the
    // frame layout is determined or CSR spills are inserted.
    return MFI.getStackSize() != 0;
  }

  return MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken() ||
         MFI.hasStackMap() || MFI.hasPatchPoint() ||
         MF.getSubtarget<GCNSubtarget>()
             .getRegisterInfo()
             ->needsStackRealignment(MF) ||
         MF.getTarget().Options.DisableFramePointerElim(MF);
}