1 //===-- SIRegisterInfo.cpp - SI Register Information ---------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// SI implementation of the TargetRegisterInfo class.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "SIRegisterInfo.h"
15 #include "AMDGPU.h"
16 #include "AMDGPURegisterBankInfo.h"
17 #include "GCNSubtarget.h"
18 #include "MCTargetDesc/AMDGPUInstPrinter.h"
19 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
20 #include "SIMachineFunctionInfo.h"
21 #include "llvm/CodeGen/LiveIntervals.h"
22 #include "llvm/CodeGen/MachineDominators.h"
23 #include "llvm/CodeGen/RegisterScavenging.h"
24 
25 using namespace llvm;
26 
27 #define GET_REGINFO_TARGET_DESC
28 #include "AMDGPUGenRegisterInfo.inc"
29 
30 static cl::opt<bool> EnableSpillSGPRToVGPR(
31   "amdgpu-spill-sgpr-to-vgpr",
32   cl::desc("Enable spilling VGPRs to SGPRs"),
33   cl::ReallyHidden,
34   cl::init(true));
35 
36 std::array<std::vector<int16_t>, 16> SIRegisterInfo::RegSplitParts;
37 std::array<std::array<uint16_t, 32>, 9> SIRegisterInfo::SubRegFromChannelTable;
38 
// Map numbers of DWORDs to indices in SubRegFromChannelTable.
// Valid indices are shifted by 1, so a mapping of 0 means unsupported.
41 // e.g. for 8 DWORDs (256-bit), SubRegFromChannelTableWidthMap[8] = 8,
42 //      meaning index 7 in SubRegFromChannelTable.
43 static const std::array<unsigned, 17> SubRegFromChannelTableWidthMap = {
44     0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 9};
45 
46 SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST)
47     : AMDGPUGenRegisterInfo(AMDGPU::PC_REG, ST.getAMDGPUDwarfFlavour()), ST(ST),
48       SpillSGPRToVGPR(EnableSpillSGPRToVGPR), isWave32(ST.isWave32()) {
49 
50   assert(getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() == 3 &&
51          getSubRegIndexLaneMask(AMDGPU::sub31).getAsInteger() == (3ULL << 62) &&
52          (getSubRegIndexLaneMask(AMDGPU::lo16) |
53           getSubRegIndexLaneMask(AMDGPU::hi16)).getAsInteger() ==
54            getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() &&
55          "getNumCoveredRegs() will not work with generated subreg masks!");
56 
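  // Ignore M0 and the hi16 halves of the VGPRs when tracking register
  // pressure.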
57   RegPressureIgnoredUnits.resize(getNumRegUnits());
58   RegPressureIgnoredUnits.set(
59       *MCRegUnitIterator(MCRegister::from(AMDGPU::M0), this));
60   for (auto Reg : AMDGPU::VGPR_HI16RegClass)
61     RegPressureIgnoredUnits.set(*MCRegUnitIterator(Reg, this));
62 
63   // HACK: Until this is fully tablegen'd.
64   static llvm::once_flag InitializeRegSplitPartsFlag;
65 
66   static auto InitializeRegSplitPartsOnce = [this]() {
67     for (unsigned Idx = 1, E = getNumSubRegIndices() - 1; Idx < E; ++Idx) {
68       unsigned Size = getSubRegIdxSize(Idx);
69       if (Size & 31)
70         continue;
71       std::vector<int16_t> &Vec = RegSplitParts[Size / 32 - 1];
72       unsigned Pos = getSubRegIdxOffset(Idx);
73       if (Pos % Size)
74         continue;
75       Pos /= Size;
76       if (Vec.empty()) {
77         unsigned MaxNumParts = 1024 / Size; // Maximum register is 1024 bits.
78         Vec.resize(MaxNumParts);
79       }
80       Vec[Pos] = Idx;
81     }
82   };
83 
84   static llvm::once_flag InitializeSubRegFromChannelTableFlag;
85 
86   static auto InitializeSubRegFromChannelTableOnce = [this]() {
87     for (auto &Row : SubRegFromChannelTable)
88       Row.fill(AMDGPU::NoSubRegister);
89     for (uint16_t Idx = 1; Idx < getNumSubRegIndices(); ++Idx) {
90       unsigned Width = AMDGPUSubRegIdxRanges[Idx].Size / 32;
91       unsigned Offset = AMDGPUSubRegIdxRanges[Idx].Offset / 32;
92       assert(Width < SubRegFromChannelTableWidthMap.size());
93       Width = SubRegFromChannelTableWidthMap[Width];
94       if (Width == 0)
95         continue;
96       unsigned TableIdx = Width - 1;
97       assert(TableIdx < SubRegFromChannelTable.size());
98       assert(Offset < SubRegFromChannelTable[TableIdx].size());
99       SubRegFromChannelTable[TableIdx][Offset] = Idx;
100     }
101   };
102 
103   llvm::call_once(InitializeRegSplitPartsFlag, InitializeRegSplitPartsOnce);
104   llvm::call_once(InitializeSubRegFromChannelTableFlag,
105                   InitializeSubRegFromChannelTableOnce);
106 }
107 
108 void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved,
109                                            MCRegister Reg) const {
110   MCRegAliasIterator R(Reg, this, true);
111 
112   for (; R.isValid(); ++R)
113     Reserved.set(*R);
114 }
115 
// Forced to be defined here: it references CSR save lists that come from the
// generated .inc included above.
117 const MCPhysReg *SIRegisterInfo::getCalleeSavedRegs(
118   const MachineFunction *MF) const {
119   CallingConv::ID CC = MF->getFunction().getCallingConv();
120   switch (CC) {
121   case CallingConv::C:
122   case CallingConv::Fast:
123   case CallingConv::Cold:
124   case CallingConv::AMDGPU_Gfx:
125     return CSR_AMDGPU_HighRegs_SaveList;
126   default: {
127     // Dummy to not crash RegisterClassInfo.
128     static const MCPhysReg NoCalleeSavedReg = AMDGPU::NoRegister;
129     return &NoCalleeSavedReg;
130   }
131   }
132 }
133 
134 const MCPhysReg *
135 SIRegisterInfo::getCalleeSavedRegsViaCopy(const MachineFunction *MF) const {
136   return nullptr;
137 }
138 
139 const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
140                                                      CallingConv::ID CC) const {
141   switch (CC) {
142   case CallingConv::C:
143   case CallingConv::Fast:
144   case CallingConv::Cold:
145   case CallingConv::AMDGPU_Gfx:
146     return CSR_AMDGPU_HighRegs_RegMask;
147   default:
148     return nullptr;
149   }
150 }
151 
152 const uint32_t *SIRegisterInfo::getNoPreservedMask() const {
153   return CSR_AMDGPU_NoRegs_RegMask;
154 }
155 
156 Register SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
157   const SIFrameLowering *TFI =
158       MF.getSubtarget<GCNSubtarget>().getFrameLowering();
159   const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
160   // During ISel lowering we always reserve the stack pointer in entry
161   // functions, but never actually want to reference it when accessing our own
162   // frame. If we need a frame pointer we use it, but otherwise we can just use
163   // an immediate "0" which we represent by returning NoRegister.
164   if (FuncInfo->isEntryFunction()) {
165     return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg() : Register();
166   }
167   return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg()
168                         : FuncInfo->getStackPtrOffsetReg();
169 }
170 
171 bool SIRegisterInfo::hasBasePointer(const MachineFunction &MF) const {
172   // When we need stack realignment, we can't reference off of the
173   // stack pointer, so we reserve a base pointer.
174   const MachineFrameInfo &MFI = MF.getFrameInfo();
175   return MFI.getNumFixedObjects() && needsStackRealignment(MF);
176 }
177 
178 Register SIRegisterInfo::getBaseRegister() const { return AMDGPU::SGPR34; }
179 
180 const uint32_t *SIRegisterInfo::getAllVGPRRegMask() const {
181   return CSR_AMDGPU_AllVGPRs_RegMask;
182 }
183 
184 const uint32_t *SIRegisterInfo::getAllAllocatableSRegMask() const {
185   return CSR_AMDGPU_AllAllocatableSRegs_RegMask;
186 }
187 
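// Returns the sub-register index covering NumRegs consecutive 32-bit
// channels starting at Channel, e.g. Channel = 2 with NumRegs = 2 yields
// sub2_sub3. Backed by SubRegFromChannelTable, built in the constructor.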
188 unsigned SIRegisterInfo::getSubRegFromChannel(unsigned Channel,
189                                               unsigned NumRegs) {
190   assert(NumRegs < SubRegFromChannelTableWidthMap.size());
191   unsigned NumRegIndex = SubRegFromChannelTableWidthMap[NumRegs];
192   assert(NumRegIndex && "Not implemented");
193   assert(Channel < SubRegFromChannelTable[NumRegIndex - 1].size());
194   return SubRegFromChannelTable[NumRegIndex - 1][Channel];
195 }
196 
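// Returns the highest 4-aligned SGPR quad that fits below the wave's SGPR
// limit, used to hold the private segment (scratch) buffer resource
// descriptor.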
197 MCRegister SIRegisterInfo::reservedPrivateSegmentBufferReg(
198   const MachineFunction &MF) const {
199   unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), 4) - 4;
200   MCRegister BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
201   return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SGPR_128RegClass);
202 }
203 
204 BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
205   BitVector Reserved(getNumRegs());
206   Reserved.set(AMDGPU::MODE);
207 
  // EXEC_LO and EXEC_HI could be allocated and used as regular registers, but
  // this seems likely to result in bugs, so mark them as reserved.
210   reserveRegisterTuples(Reserved, AMDGPU::EXEC);
211   reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR);
212 
  // M0 has to be reserved so that LLVM accepts it as a live-in into a block.
214   reserveRegisterTuples(Reserved, AMDGPU::M0);
215 
216   // Reserve src_vccz, src_execz, src_scc.
217   reserveRegisterTuples(Reserved, AMDGPU::SRC_VCCZ);
218   reserveRegisterTuples(Reserved, AMDGPU::SRC_EXECZ);
219   reserveRegisterTuples(Reserved, AMDGPU::SRC_SCC);
220 
221   // Reserve the memory aperture registers.
222   reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_BASE);
223   reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_LIMIT);
224   reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_BASE);
225   reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_LIMIT);
226 
227   // Reserve src_pops_exiting_wave_id - support is not implemented in Codegen.
228   reserveRegisterTuples(Reserved, AMDGPU::SRC_POPS_EXITING_WAVE_ID);
229 
230   // Reserve xnack_mask registers - support is not implemented in Codegen.
231   reserveRegisterTuples(Reserved, AMDGPU::XNACK_MASK);
232 
233   // Reserve lds_direct register - support is not implemented in Codegen.
234   reserveRegisterTuples(Reserved, AMDGPU::LDS_DIRECT);
235 
236   // Reserve Trap Handler registers - support is not implemented in Codegen.
237   reserveRegisterTuples(Reserved, AMDGPU::TBA);
238   reserveRegisterTuples(Reserved, AMDGPU::TMA);
239   reserveRegisterTuples(Reserved, AMDGPU::TTMP0_TTMP1);
240   reserveRegisterTuples(Reserved, AMDGPU::TTMP2_TTMP3);
241   reserveRegisterTuples(Reserved, AMDGPU::TTMP4_TTMP5);
242   reserveRegisterTuples(Reserved, AMDGPU::TTMP6_TTMP7);
243   reserveRegisterTuples(Reserved, AMDGPU::TTMP8_TTMP9);
244   reserveRegisterTuples(Reserved, AMDGPU::TTMP10_TTMP11);
245   reserveRegisterTuples(Reserved, AMDGPU::TTMP12_TTMP13);
246   reserveRegisterTuples(Reserved, AMDGPU::TTMP14_TTMP15);
247 
248   // Reserve null register - it shall never be allocated
249   reserveRegisterTuples(Reserved, AMDGPU::SGPR_NULL);
250 
251   // Disallow vcc_hi allocation in wave32. It may be allocated but most likely
252   // will result in bugs.
253   if (isWave32) {
254     Reserved.set(AMDGPU::VCC);
255     Reserved.set(AMDGPU::VCC_HI);
256   }
257 
258   unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
259   unsigned TotalNumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
260   for (unsigned i = MaxNumSGPRs; i < TotalNumSGPRs; ++i) {
261     unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i);
262     reserveRegisterTuples(Reserved, Reg);
263   }
264 
265   unsigned MaxNumVGPRs = ST.getMaxNumVGPRs(MF);
266   unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
267   for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i) {
268     unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i);
269     reserveRegisterTuples(Reserved, Reg);
270     Reg = AMDGPU::AGPR_32RegClass.getRegister(i);
271     reserveRegisterTuples(Reserved, Reg);
272   }
273 
274   for (auto Reg : AMDGPU::SReg_32RegClass) {
275     Reserved.set(getSubReg(Reg, AMDGPU::hi16));
276     Register Low = getSubReg(Reg, AMDGPU::lo16);
277     // This is to prevent BB vcc liveness errors.
278     if (!AMDGPU::SGPR_LO16RegClass.contains(Low))
279       Reserved.set(Low);
280   }
281 
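  // Also reserve the hi16 halves of the AGPRs; they are never allocated
  // separately.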
282   for (auto Reg : AMDGPU::AGPR_32RegClass) {
283     Reserved.set(getSubReg(Reg, AMDGPU::hi16));
284   }
285 
  // Reserve all remaining AGPRs if there are no instructions to use them.
287   if (!ST.hasMAIInsts()) {
288     for (unsigned i = 0; i < MaxNumVGPRs; ++i) {
289       unsigned Reg = AMDGPU::AGPR_32RegClass.getRegister(i);
290       reserveRegisterTuples(Reserved, Reg);
291     }
292   }
293 
294   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
295 
296   Register ScratchRSrcReg = MFI->getScratchRSrcReg();
297   if (ScratchRSrcReg != AMDGPU::NoRegister) {
    // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we
    // need to spill.
300     // TODO: May need to reserve a VGPR if doing LDS spilling.
301     reserveRegisterTuples(Reserved, ScratchRSrcReg);
302   }
303 
304   // We have to assume the SP is needed in case there are calls in the function,
305   // which is detected after the function is lowered. If we aren't really going
306   // to need SP, don't bother reserving it.
307   MCRegister StackPtrReg = MFI->getStackPtrOffsetReg();
308 
309   if (StackPtrReg) {
310     reserveRegisterTuples(Reserved, StackPtrReg);
311     assert(!isSubRegister(ScratchRSrcReg, StackPtrReg));
312   }
313 
314   MCRegister FrameReg = MFI->getFrameOffsetReg();
315   if (FrameReg) {
316     reserveRegisterTuples(Reserved, FrameReg);
317     assert(!isSubRegister(ScratchRSrcReg, FrameReg));
318   }
319 
320   if (hasBasePointer(MF)) {
321     MCRegister BasePtrReg = getBaseRegister();
322     reserveRegisterTuples(Reserved, BasePtrReg);
323     assert(!isSubRegister(ScratchRSrcReg, BasePtrReg));
324   }
325 
326   for (MCRegister Reg : MFI->WWMReservedRegs) {
327     reserveRegisterTuples(Reserved, Reg);
328   }
329 
330   // FIXME: Stop using reserved registers for this.
331   for (MCPhysReg Reg : MFI->getAGPRSpillVGPRs())
332     reserveRegisterTuples(Reserved, Reg);
333 
334   for (MCPhysReg Reg : MFI->getVGPRSpillAGPRs())
335     reserveRegisterTuples(Reserved, Reg);
336 
337   for (auto SSpill : MFI->getSGPRSpillVGPRs())
338     reserveRegisterTuples(Reserved, SSpill.VGPR);
339 
340   return Reserved;
341 }
342 
343 bool SIRegisterInfo::canRealignStack(const MachineFunction &MF) const {
344   const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  // In entry functions the stack base address is 0, so the frame can't
  // possibly need any more alignment.
347 
348   // FIXME: Should be able to specify the entry frame alignment per calling
349   // convention instead.
350   if (Info->isEntryFunction())
351     return false;
352 
353   return TargetRegisterInfo::canRealignStack(MF);
354 }
355 
356 bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const {
357   const SIMachineFunctionInfo *Info = Fn.getInfo<SIMachineFunctionInfo>();
358   if (Info->isEntryFunction()) {
359     const MachineFrameInfo &MFI = Fn.getFrameInfo();
360     return MFI.hasStackObjects() || MFI.hasCalls();
361   }
362 
363   // May need scavenger for dealing with callee saved registers.
364   return true;
365 }
366 
367 bool SIRegisterInfo::requiresFrameIndexScavenging(
368   const MachineFunction &MF) const {
369   // Do not use frame virtual registers. They used to be used for SGPRs, but
370   // once we reach PrologEpilogInserter, we can no longer spill SGPRs. If the
371   // scavenger fails, we can increment/decrement the necessary SGPRs to avoid a
372   // spill.
373   return false;
374 }
375 
376 bool SIRegisterInfo::requiresFrameIndexReplacementScavenging(
377   const MachineFunction &MF) const {
378   const MachineFrameInfo &MFI = MF.getFrameInfo();
379   return MFI.hasStackObjects();
380 }
381 
382 bool SIRegisterInfo::requiresVirtualBaseRegisters(
383   const MachineFunction &) const {
384   // There are no special dedicated stack or frame pointers.
385   return true;
386 }
387 
388 int64_t SIRegisterInfo::getScratchInstrOffset(const MachineInstr *MI) const {
389   assert(SIInstrInfo::isMUBUF(*MI) || SIInstrInfo::isFLATScratch(*MI));
390 
391   int OffIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
392                                           AMDGPU::OpName::offset);
393   return MI->getOperand(OffIdx).getImm();
394 }
395 
396 int64_t SIRegisterInfo::getFrameIndexInstrOffset(const MachineInstr *MI,
397                                                  int Idx) const {
398   if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
399     return 0;
400 
401   assert((Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
402                                             AMDGPU::OpName::vaddr) ||
403          (Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
404                                             AMDGPU::OpName::saddr))) &&
405          "Should never see frame index on non-address operand");
406 
407   return getScratchInstrOffset(MI);
408 }
409 
410 bool SIRegisterInfo::needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
411   if (!MI->mayLoadOrStore())
412     return false;
413 
414   int64_t FullOffset = Offset + getScratchInstrOffset(MI);
415 
416   if (SIInstrInfo::isMUBUF(*MI))
417     return !SIInstrInfo::isLegalMUBUFImmOffset(FullOffset);
418 
419   const SIInstrInfo *TII = ST.getInstrInfo();
420   return !TII->isLegalFLATOffset(FullOffset, AMDGPUAS::PRIVATE_ADDRESS, true);
421 }
422 
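// Materialize a base register holding FrameIdx + Offset so that otherwise
// out-of-range scratch offsets can be resolved against it. The base register
// is an SGPR when flat scratch is enabled and a VGPR otherwise.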
423 Register SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
424                                                       int FrameIdx,
425                                                       int64_t Offset) const {
426   MachineBasicBlock::iterator Ins = MBB->begin();
427   DebugLoc DL; // Defaults to "unknown"
428 
429   if (Ins != MBB->end())
430     DL = Ins->getDebugLoc();
431 
432   MachineFunction *MF = MBB->getParent();
433   const SIInstrInfo *TII = ST.getInstrInfo();
434   MachineRegisterInfo &MRI = MF->getRegInfo();
435   unsigned MovOpc = ST.enableFlatScratch() ? AMDGPU::S_MOV_B32
436                                            : AMDGPU::V_MOV_B32_e32;
437 
438   Register BaseReg = MRI.createVirtualRegister(
439       ST.enableFlatScratch() ? &AMDGPU::SReg_32_XEXEC_HIRegClass
440                              : &AMDGPU::VGPR_32RegClass);
441 
442   if (Offset == 0) {
443     BuildMI(*MBB, Ins, DL, TII->get(MovOpc), BaseReg)
444       .addFrameIndex(FrameIdx);
445     return BaseReg;
446   }
447 
448   Register OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
449 
450   Register FIReg = MRI.createVirtualRegister(
451       ST.enableFlatScratch() ? &AMDGPU::SReg_32_XM0RegClass
452                              : &AMDGPU::VGPR_32RegClass);
453 
454   BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
455     .addImm(Offset);
456   BuildMI(*MBB, Ins, DL, TII->get(MovOpc), FIReg)
457     .addFrameIndex(FrameIdx);
458 
  if (ST.enableFlatScratch()) {
460     BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_ADD_U32), BaseReg)
461         .addReg(OffsetReg, RegState::Kill)
462         .addReg(FIReg);
463     return BaseReg;
464   }
465 
466   TII->getAddNoCarry(*MBB, Ins, DL, BaseReg)
467     .addReg(OffsetReg, RegState::Kill)
468     .addReg(FIReg)
469     .addImm(0); // clamp bit
470 
471   return BaseReg;
472 }
473 
474 void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg,
475                                        int64_t Offset) const {
476   const SIInstrInfo *TII = ST.getInstrInfo();
477   bool IsFlat = TII->isFLATScratch(MI);
478 
479 #ifndef NDEBUG
480   // FIXME: Is it possible to be storing a frame index to itself?
481   bool SeenFI = false;
482   for (const MachineOperand &MO: MI.operands()) {
483     if (MO.isFI()) {
484       if (SeenFI)
485         llvm_unreachable("should not see multiple frame indices");
486 
487       SeenFI = true;
488     }
489   }
490 #endif
491 
492   MachineOperand *FIOp =
493       TII->getNamedOperand(MI, IsFlat ? AMDGPU::OpName::saddr
494                                       : AMDGPU::OpName::vaddr);
495 
496   MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset);
497   int64_t NewOffset = OffsetOp->getImm() + Offset;
498 
499   assert(FIOp && FIOp->isFI() && "frame index must be address operand");
500   assert(TII->isMUBUF(MI) || TII->isFLATScratch(MI));
501 
502   if (IsFlat) {
503     assert(TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS, true) &&
504            "offset should be legal");
505     FIOp->ChangeToRegister(BaseReg, false);
506     OffsetOp->setImm(NewOffset);
507     return;
508   }
509 
510 #ifndef NDEBUG
511   MachineOperand *SOffset = TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
512   assert(SOffset->isImm() && SOffset->getImm() == 0);
513 #endif
514 
515   assert(SIInstrInfo::isLegalMUBUFImmOffset(NewOffset) &&
516          "offset should be legal");
517 
518   FIOp->ChangeToRegister(BaseReg, false);
519   OffsetOp->setImm(NewOffset);
520 }
521 
522 bool SIRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
523                                         Register BaseReg,
524                                         int64_t Offset) const {
525   if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
526     return false;
527 
528   int64_t NewOffset = Offset + getScratchInstrOffset(MI);
529 
530   if (SIInstrInfo::isMUBUF(*MI))
531     return SIInstrInfo::isLegalMUBUFImmOffset(NewOffset);
532 
533   const SIInstrInfo *TII = ST.getInstrInfo();
534   return TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS, true);
535 }
536 
537 const TargetRegisterClass *SIRegisterInfo::getPointerRegClass(
538   const MachineFunction &MF, unsigned Kind) const {
539   // This is inaccurate. It depends on the instruction and address space. The
540   // only place where we should hit this is for dealing with frame indexes /
541   // private accesses, so this is correct in that case.
542   return &AMDGPU::VGPR_32RegClass;
543 }
544 
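// Returns the number of 32-bit sub-registers covered by an SI_SPILL_* pseudo,
// e.g. 4 for SI_SPILL_V128_SAVE.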
545 static unsigned getNumSubRegsForSpillOp(unsigned Op) {
547   switch (Op) {
548   case AMDGPU::SI_SPILL_S1024_SAVE:
549   case AMDGPU::SI_SPILL_S1024_RESTORE:
550   case AMDGPU::SI_SPILL_V1024_SAVE:
551   case AMDGPU::SI_SPILL_V1024_RESTORE:
552   case AMDGPU::SI_SPILL_A1024_SAVE:
553   case AMDGPU::SI_SPILL_A1024_RESTORE:
554     return 32;
555   case AMDGPU::SI_SPILL_S512_SAVE:
556   case AMDGPU::SI_SPILL_S512_RESTORE:
557   case AMDGPU::SI_SPILL_V512_SAVE:
558   case AMDGPU::SI_SPILL_V512_RESTORE:
559   case AMDGPU::SI_SPILL_A512_SAVE:
560   case AMDGPU::SI_SPILL_A512_RESTORE:
561     return 16;
562   case AMDGPU::SI_SPILL_S256_SAVE:
563   case AMDGPU::SI_SPILL_S256_RESTORE:
564   case AMDGPU::SI_SPILL_V256_SAVE:
565   case AMDGPU::SI_SPILL_V256_RESTORE:
566   case AMDGPU::SI_SPILL_A256_SAVE:
567   case AMDGPU::SI_SPILL_A256_RESTORE:
568     return 8;
569   case AMDGPU::SI_SPILL_S192_SAVE:
570   case AMDGPU::SI_SPILL_S192_RESTORE:
571   case AMDGPU::SI_SPILL_V192_SAVE:
572   case AMDGPU::SI_SPILL_V192_RESTORE:
573   case AMDGPU::SI_SPILL_A192_SAVE:
574   case AMDGPU::SI_SPILL_A192_RESTORE:
575     return 6;
576   case AMDGPU::SI_SPILL_S160_SAVE:
577   case AMDGPU::SI_SPILL_S160_RESTORE:
578   case AMDGPU::SI_SPILL_V160_SAVE:
579   case AMDGPU::SI_SPILL_V160_RESTORE:
580   case AMDGPU::SI_SPILL_A160_SAVE:
581   case AMDGPU::SI_SPILL_A160_RESTORE:
582     return 5;
583   case AMDGPU::SI_SPILL_S128_SAVE:
584   case AMDGPU::SI_SPILL_S128_RESTORE:
585   case AMDGPU::SI_SPILL_V128_SAVE:
586   case AMDGPU::SI_SPILL_V128_RESTORE:
587   case AMDGPU::SI_SPILL_A128_SAVE:
588   case AMDGPU::SI_SPILL_A128_RESTORE:
589     return 4;
590   case AMDGPU::SI_SPILL_S96_SAVE:
591   case AMDGPU::SI_SPILL_S96_RESTORE:
592   case AMDGPU::SI_SPILL_V96_SAVE:
593   case AMDGPU::SI_SPILL_V96_RESTORE:
594   case AMDGPU::SI_SPILL_A96_SAVE:
595   case AMDGPU::SI_SPILL_A96_RESTORE:
596     return 3;
597   case AMDGPU::SI_SPILL_S64_SAVE:
598   case AMDGPU::SI_SPILL_S64_RESTORE:
599   case AMDGPU::SI_SPILL_V64_SAVE:
600   case AMDGPU::SI_SPILL_V64_RESTORE:
601   case AMDGPU::SI_SPILL_A64_SAVE:
602   case AMDGPU::SI_SPILL_A64_RESTORE:
603     return 2;
604   case AMDGPU::SI_SPILL_S32_SAVE:
605   case AMDGPU::SI_SPILL_S32_RESTORE:
606   case AMDGPU::SI_SPILL_V32_SAVE:
607   case AMDGPU::SI_SPILL_V32_RESTORE:
608   case AMDGPU::SI_SPILL_A32_SAVE:
609   case AMDGPU::SI_SPILL_A32_RESTORE:
610     return 1;
611   default: llvm_unreachable("Invalid spill opcode");
612   }
613 }
614 
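// Map an OFFEN MUBUF store opcode to its OFFSET (immediate-offset-only)
// variant, or return -1 if there is no such variant.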
615 static int getOffsetMUBUFStore(unsigned Opc) {
616   switch (Opc) {
617   case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
618     return AMDGPU::BUFFER_STORE_DWORD_OFFSET;
619   case AMDGPU::BUFFER_STORE_BYTE_OFFEN:
620     return AMDGPU::BUFFER_STORE_BYTE_OFFSET;
621   case AMDGPU::BUFFER_STORE_SHORT_OFFEN:
622     return AMDGPU::BUFFER_STORE_SHORT_OFFSET;
623   case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN:
624     return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET;
625   case AMDGPU::BUFFER_STORE_DWORDX4_OFFEN:
626     return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET;
627   case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN:
628     return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET;
629   case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN:
630     return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET;
631   default:
632     return -1;
633   }
634 }
635 
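// Map an OFFEN MUBUF load opcode to its OFFSET (immediate-offset-only)
// variant, or return -1 if there is no such variant.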
636 static int getOffsetMUBUFLoad(unsigned Opc) {
637   switch (Opc) {
638   case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
639     return AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
640   case AMDGPU::BUFFER_LOAD_UBYTE_OFFEN:
641     return AMDGPU::BUFFER_LOAD_UBYTE_OFFSET;
642   case AMDGPU::BUFFER_LOAD_SBYTE_OFFEN:
643     return AMDGPU::BUFFER_LOAD_SBYTE_OFFSET;
644   case AMDGPU::BUFFER_LOAD_USHORT_OFFEN:
645     return AMDGPU::BUFFER_LOAD_USHORT_OFFSET;
646   case AMDGPU::BUFFER_LOAD_SSHORT_OFFEN:
647     return AMDGPU::BUFFER_LOAD_SSHORT_OFFSET;
648   case AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN:
649     return AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
650   case AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN:
651     return AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET;
652   case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN:
653     return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET;
654   case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN:
655     return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET;
656   case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN:
657     return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET;
658   case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN:
659     return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET;
660   case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN:
661     return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET;
662   case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN:
663     return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET;
664   default:
665     return -1;
666   }
667 }
668 
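// Try to satisfy the spill or reload of a single 32-bit lane of a spill slot
// by copying between ValueReg and the VGPR/AGPR assigned to (Index, Lane) via
// v_accvgpr_write/v_accvgpr_read. Returns a null MachineInstrBuilder if no
// register was assigned for this slot.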
669 static MachineInstrBuilder spillVGPRtoAGPR(const GCNSubtarget &ST,
670                                            MachineBasicBlock::iterator MI,
671                                            int Index,
672                                            unsigned Lane,
673                                            unsigned ValueReg,
674                                            bool IsKill) {
675   MachineBasicBlock *MBB = MI->getParent();
676   MachineFunction *MF = MI->getParent()->getParent();
677   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
678   const SIInstrInfo *TII = ST.getInstrInfo();
679 
680   MCPhysReg Reg = MFI->getVGPRToAGPRSpill(Index, Lane);
681 
682   if (Reg == AMDGPU::NoRegister)
683     return MachineInstrBuilder();
684 
685   bool IsStore = MI->mayStore();
686   MachineRegisterInfo &MRI = MF->getRegInfo();
687   auto *TRI = static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
688 
689   unsigned Dst = IsStore ? Reg : ValueReg;
690   unsigned Src = IsStore ? ValueReg : Reg;
691   unsigned Opc = (IsStore ^ TRI->isVGPR(MRI, Reg)) ? AMDGPU::V_ACCVGPR_WRITE_B32_e64
692                                                    : AMDGPU::V_ACCVGPR_READ_B32_e64;
693 
694   auto MIB = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(Opc), Dst)
695                .addReg(Src, getKillRegState(IsKill));
696   MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse);
697   return MIB;
698 }
699 
700 // This differs from buildSpillLoadStore by only scavenging a VGPR. It does not
701 // need to handle the case where an SGPR may need to be spilled while spilling.
702 static bool buildMUBUFOffsetLoadStore(const GCNSubtarget &ST,
703                                       MachineFrameInfo &MFI,
704                                       MachineBasicBlock::iterator MI,
705                                       int Index,
706                                       int64_t Offset) {
707   const SIInstrInfo *TII = ST.getInstrInfo();
708   MachineBasicBlock *MBB = MI->getParent();
709   const DebugLoc &DL = MI->getDebugLoc();
710   bool IsStore = MI->mayStore();
711 
712   unsigned Opc = MI->getOpcode();
713   int LoadStoreOp = IsStore ?
714     getOffsetMUBUFStore(Opc) : getOffsetMUBUFLoad(Opc);
715   if (LoadStoreOp == -1)
716     return false;
717 
718   const MachineOperand *Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata);
719   if (spillVGPRtoAGPR(ST, MI, Index, 0, Reg->getReg(), false).getInstr())
720     return true;
721 
722   MachineInstrBuilder NewMI =
723       BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
724           .add(*Reg)
725           .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc))
726           .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::soffset))
727           .addImm(Offset)
728           .addImm(0) // glc
729           .addImm(0) // slc
730           .addImm(0) // tfe
731           .addImm(0) // dlc
732           .addImm(0) // swz
733           .cloneMemRefs(*MI);
734 
735   const MachineOperand *VDataIn = TII->getNamedOperand(*MI,
736                                                        AMDGPU::OpName::vdata_in);
737   if (VDataIn)
738     NewMI.add(*VDataIn);
739   return true;
740 }
741 
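// Pick the flat scratch SADDR load/store opcode matching EltSize, switching
// to the ST (no address register) form when the original opcode takes neither
// a vaddr nor an saddr operand.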
742 static unsigned getFlatScratchSpillOpcode(const SIInstrInfo *TII,
743                                           unsigned LoadStoreOp,
744                                           unsigned EltSize) {
745   bool IsStore = TII->get(LoadStoreOp).mayStore();
746   bool UseST =
747     AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::vaddr) < 0 &&
748     AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::saddr) < 0;
749 
750   switch (EltSize) {
751   case 4:
752     LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
753                           : AMDGPU::SCRATCH_LOAD_DWORD_SADDR;
754     break;
755   case 8:
756     LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX2_SADDR
757                           : AMDGPU::SCRATCH_LOAD_DWORDX2_SADDR;
758     break;
759   case 12:
760     LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX3_SADDR
761                           : AMDGPU::SCRATCH_LOAD_DWORDX3_SADDR;
762     break;
763   case 16:
764     LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX4_SADDR
765                           : AMDGPU::SCRATCH_LOAD_DWORDX4_SADDR;
766     break;
767   default:
768     llvm_unreachable("Unexpected spill load/store size!");
769   }
770 
771   if (UseST)
772     LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp);
773 
774   return LoadStoreOp;
775 }
776 
777 void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
778                                          unsigned LoadStoreOp,
779                                          int Index,
780                                          Register ValueReg,
781                                          bool IsKill,
782                                          MCRegister ScratchOffsetReg,
783                                          int64_t InstOffset,
784                                          MachineMemOperand *MMO,
785                                          RegScavenger *RS) const {
786   MachineBasicBlock *MBB = MI->getParent();
787   MachineFunction *MF = MI->getParent()->getParent();
788   const SIInstrInfo *TII = ST.getInstrInfo();
789   const MachineFrameInfo &MFI = MF->getFrameInfo();
790   const SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();
791 
792   const MCInstrDesc *Desc = &TII->get(LoadStoreOp);
793   const DebugLoc &DL = MI->getDebugLoc();
794   bool IsStore = Desc->mayStore();
795   bool IsFlat = TII->isFLATScratch(LoadStoreOp);
796 
797   bool Scavenged = false;
798   MCRegister SOffset = ScratchOffsetReg;
799 
800   const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg);
801   const bool IsAGPR = hasAGPRs(RC);
802   const unsigned RegWidth = AMDGPU::getRegBitWidth(RC->getID()) / 8;
803 
804   // Always use 4 byte operations for AGPRs because we need to scavenge
805   // a temporary VGPR.
806   unsigned EltSize = (IsFlat && !IsAGPR) ? std::min(RegWidth, 16u) : 4u;
807   unsigned NumSubRegs = RegWidth / EltSize;
808   unsigned Size = NumSubRegs * EltSize;
809   unsigned RemSize = RegWidth - Size;
810   unsigned NumRemSubRegs = RemSize ? 1 : 0;
811   int64_t Offset = InstOffset + MFI.getObjectOffset(Index);
812   int64_t MaxOffset = Offset + Size + RemSize - EltSize;
813   int64_t ScratchOffsetRegDelta = 0;
814 
815   if (IsFlat && EltSize > 4) {
816     LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
817     Desc = &TII->get(LoadStoreOp);
818   }
819 
820   Align Alignment = MFI.getObjectAlign(Index);
821   const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo();
822 
823   assert((IsFlat || ((Offset % EltSize) == 0)) &&
824          "unexpected VGPR spill offset");
825 
826   bool IsOffsetLegal = IsFlat
827       ? TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS, true)
828       : SIInstrInfo::isLegalMUBUFImmOffset(MaxOffset);
829   if (!IsOffsetLegal || (IsFlat && !SOffset && !ST.hasFlatScratchSTMode())) {
830     SOffset = MCRegister();
831 
832     // We currently only support spilling VGPRs to EltSize boundaries, meaning
833     // we can simplify the adjustment of Offset here to just scale with
834     // WavefrontSize.
835     if (!IsFlat)
836       Offset *= ST.getWavefrontSize();
837 
    // We don't have access to the register scavenger if this function is
    // called during PEI::scavengeFrameVirtualRegs().
840     if (RS)
841       SOffset = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, MI, 0, false);
842 
843     if (!SOffset) {
      // There are no free SGPRs, and we are in the process of spilling VGPRs
      // too. Since we need a VGPR in order to spill SGPRs (this is true on
      // SI/CI, and on VI it is true until we implement spilling using scalar
      // stores), we have no way to free up an SGPR. Our solution here is to
      // add the offset directly to the ScratchOffset or StackPtrOffset
      // register, and then subtract the offset after the spill to return the
      // register to its original value.
851       if (!ScratchOffsetReg)
852         ScratchOffsetReg = FuncInfo->getStackPtrOffsetReg();
853       SOffset = ScratchOffsetReg;
854       ScratchOffsetRegDelta = Offset;
855     } else {
856       Scavenged = true;
857     }
858 
859     if (!SOffset)
860       report_fatal_error("could not scavenge SGPR to spill in entry function");
861 
862     if (ScratchOffsetReg == AMDGPU::NoRegister) {
863       BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), SOffset)
864           .addImm(Offset);
865     } else {
866       BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), SOffset)
867           .addReg(ScratchOffsetReg)
868           .addImm(Offset);
869     }
870 
871     Offset = 0;
872   }
873 
874   if (IsFlat && SOffset == AMDGPU::NoRegister) {
875     assert(AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::vaddr) < 0
876            && "Unexpected vaddr for flat scratch with a FI operand");
877 
878     assert(ST.hasFlatScratchSTMode());
879     LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp);
880     Desc = &TII->get(LoadStoreOp);
881   }
882 
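  // Temporary VGPR used to bounce AGPR values to and from memory; scavenged
  // lazily on first use.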
883   Register TmpReg;
884 
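  // Process ValueReg one EltSize chunk at a time, copying to/from AGPRs where
  // lanes are assigned and emitting a memory access for the rest. A trailing
  // remainder chunk (RemSize), if any, is handled with a narrower flat
  // scratch opcode.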
885   for (unsigned i = 0, e = NumSubRegs + NumRemSubRegs, RegOffset = 0; i != e;
886        ++i, RegOffset += EltSize) {
887     if (i == NumSubRegs) {
888       EltSize = RemSize;
889       LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
890     }
891     Desc = &TII->get(LoadStoreOp);
892 
893     unsigned NumRegs = EltSize / 4;
894     Register SubReg = e == 1
895             ? ValueReg
896             : Register(getSubReg(ValueReg,
897                                  getSubRegFromChannel(RegOffset / 4, NumRegs)));
898 
899     unsigned SOffsetRegState = 0;
900     unsigned SrcDstRegState = getDefRegState(!IsStore);
901     if (i + 1 == e) {
902       SOffsetRegState |= getKillRegState(Scavenged);
903       // The last implicit use carries the "Kill" flag.
904       SrcDstRegState |= getKillRegState(IsKill);
905     }
906 
907     // Make sure the whole register is defined if there are undef components by
908     // adding an implicit def of the super-reg on the first instruction.
909     bool NeedSuperRegDef = e > 1 && IsStore && i == 0;
910     bool NeedSuperRegImpOperand = e > 1;
911 
912     unsigned Lane = RegOffset / 4;
913     unsigned LaneE = (RegOffset + EltSize) / 4;
914     for ( ; Lane != LaneE; ++Lane) {
915       bool IsSubReg = e > 1 || EltSize > 4;
916       Register Sub = IsSubReg
917              ? Register(getSubReg(ValueReg, getSubRegFromChannel(Lane)))
918              : ValueReg;
919       auto MIB = spillVGPRtoAGPR(ST, MI, Index, Lane, Sub, IsKill);
920       if (!MIB.getInstr())
921         break;
922       if (NeedSuperRegDef || (IsSubReg && IsStore && Lane == 0)) {
923         MIB.addReg(ValueReg, RegState::ImplicitDefine);
924         NeedSuperRegDef = false;
925       }
926       if (IsSubReg || NeedSuperRegImpOperand) {
927         NeedSuperRegImpOperand = true;
928         unsigned State = SrcDstRegState;
929         if (Lane + 1 != LaneE)
930           State &= ~RegState::Kill;
931         MIB.addReg(ValueReg, RegState::Implicit | State);
932       }
933     }
934 
935     if (Lane == LaneE) // Fully spilled into AGPRs.
936       continue;
937 
    // Offset in bytes from the beginning of ValueReg to the portion we still
    // need to spill. It may differ from RegOffset if part of the current
    // SubReg has already been spilled into AGPRs by the loop above.
941     unsigned RemRegOffset = Lane * 4;
942     unsigned RemEltSize = EltSize - (RemRegOffset - RegOffset);
943     if (RemEltSize != EltSize) { // Partially spilled to AGPRs
944       assert(IsFlat && EltSize > 4);
945 
946       unsigned NumRegs = RemEltSize / 4;
947       SubReg = Register(getSubReg(ValueReg,
948                         getSubRegFromChannel(RemRegOffset / 4, NumRegs)));
949       unsigned Opc = getFlatScratchSpillOpcode(TII, LoadStoreOp, RemEltSize);
950       Desc = &TII->get(Opc);
951     }
952 
953     unsigned FinalReg = SubReg;
954 
955     if (IsAGPR) {
956       assert(EltSize == 4);
957 
958       if (!TmpReg) {
959         assert(RS && "Needs to have RegScavenger to spill an AGPR!");
960         // FIXME: change to scavengeRegisterBackwards()
961         TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
962         RS->setRegUsed(TmpReg);
963       }
964       if (IsStore) {
965         auto AccRead = BuildMI(*MBB, MI, DL,
966                               TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64), TmpReg)
967           .addReg(SubReg, getKillRegState(IsKill));
968         if (NeedSuperRegDef)
969           AccRead.addReg(ValueReg, RegState::ImplicitDefine);
970         AccRead->setAsmPrinterFlag(MachineInstr::ReloadReuse);
971       }
972       SubReg = TmpReg;
973     }
974 
975     MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(RemRegOffset);
976     MachineMemOperand *NewMMO =
977         MF->getMachineMemOperand(PInfo, MMO->getFlags(), RemEltSize,
978                                  commonAlignment(Alignment, RemRegOffset));
979 
980     auto MIB = BuildMI(*MBB, MI, DL, *Desc)
981                   .addReg(SubReg,
982                           getDefRegState(!IsStore) | getKillRegState(IsKill));
983     if (!IsFlat)
984       MIB.addReg(FuncInfo->getScratchRSrcReg());
985 
986     if (SOffset == AMDGPU::NoRegister) {
987       if (!IsFlat)
988         MIB.addImm(0);
989     } else {
990       MIB.addReg(SOffset, SOffsetRegState);
991     }
992     MIB.addImm(Offset + RemRegOffset)
993         .addImm(0) // glc
994         .addImm(0) // slc
995         .addImm(0); // tfe for MUBUF or dlc for FLAT
996     if (!IsFlat)
997       MIB.addImm(0) // dlc
998          .addImm(0); // swz
999     MIB.addMemOperand(NewMMO);
1000 
1001     if (!IsAGPR && NeedSuperRegDef)
1002       MIB.addReg(ValueReg, RegState::ImplicitDefine);
1003 
1004     if (!IsStore && TmpReg != AMDGPU::NoRegister) {
1005       MIB = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64),
1006                     FinalReg)
1007         .addReg(TmpReg, RegState::Kill);
1008       MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse);
1009     }
1010 
1011     if (NeedSuperRegImpOperand)
1012       MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState);
1013   }
1014 
1015   if (ScratchOffsetRegDelta != 0) {
1016     // Subtract the offset we added to the ScratchOffset register.
1017     BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), SOffset)
1018         .addReg(SOffset)
1019         .addImm(ScratchOffsetRegDelta);
1020   }
1021 }
1022 
1023 // Generate a VMEM access which loads or stores the VGPR containing an SGPR
1024 // spill such that all the lanes set in VGPRLanes are loaded or stored.
1025 // This generates exec mask manipulation and will use SGPRs available in MI
1026 // or VGPR lanes in the VGPR to save and restore the exec mask.
1027 void SIRegisterInfo::buildSGPRSpillLoadStore(MachineBasicBlock::iterator MI,
1028                                              int Index, int Offset,
1029                                              unsigned EltSize, Register VGPR,
1030                                              int64_t VGPRLanes,
1031                                              RegScavenger *RS,
1032                                              bool IsLoad) const {
1033   MachineBasicBlock *MBB = MI->getParent();
1034   MachineFunction *MF = MBB->getParent();
1035   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1036   const SIInstrInfo *TII = ST.getInstrInfo();
1037 
1038   Register SuperReg = MI->getOperand(0).getReg();
1039   const TargetRegisterClass *RC = getPhysRegClass(SuperReg);
1040   ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
1041   unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
1042   unsigned FirstPart = Offset * 32;
1043   unsigned ExecLane = 0;
1044 
1045   bool IsKill = MI->getOperand(0).isKill();
1046   const DebugLoc &DL = MI->getDebugLoc();
1047 
1048   // Cannot handle load/store to EXEC
1049   assert(SuperReg != AMDGPU::EXEC_LO && SuperReg != AMDGPU::EXEC_HI &&
1050          SuperReg != AMDGPU::EXEC && "exec should never spill");
1051 
1052   // On Wave32 only handle EXEC_LO.
  // On Wave64 only update EXEC_HI if there is sufficient space for a copy.
1054   bool OnlyExecLo = isWave32 || NumSubRegs == 1 || SuperReg == AMDGPU::EXEC_HI;
1055 
1056   unsigned ExecMovOpc = OnlyExecLo ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
1057   Register ExecReg = OnlyExecLo ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
1058   Register SavedExecReg;
1059 
1060   // Backup EXEC
1061   if (OnlyExecLo) {
1062     SavedExecReg =
1063         NumSubRegs == 1
1064             ? SuperReg
1065             : Register(getSubReg(SuperReg, SplitParts[FirstPart + ExecLane]));
1066   } else {
1067     // If src/dst is an odd size it is possible subreg0 is not aligned.
1068     for (; ExecLane < (NumSubRegs - 1); ++ExecLane) {
1069       SavedExecReg = getMatchingSuperReg(
1070           getSubReg(SuperReg, SplitParts[FirstPart + ExecLane]), AMDGPU::sub0,
1071           &AMDGPU::SReg_64_XEXECRegClass);
1072       if (SavedExecReg)
1073         break;
1074     }
1075   }
1076   assert(SavedExecReg);
1077   BuildMI(*MBB, MI, DL, TII->get(ExecMovOpc), SavedExecReg).addReg(ExecReg);
1078 
1079   // Setup EXEC
1080   BuildMI(*MBB, MI, DL, TII->get(ExecMovOpc), ExecReg).addImm(VGPRLanes);
1081 
1082   // Load/store VGPR
1083   MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1084   assert(FrameInfo.getStackID(Index) != TargetStackID::SGPRSpill);
1085 
1086   Register FrameReg = FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(*MF)
1087                           ? getBaseRegister()
1088                           : getFrameRegister(*MF);
1089 
1090   Align Alignment = FrameInfo.getObjectAlign(Index);
1091   MachinePointerInfo PtrInfo =
1092       MachinePointerInfo::getFixedStack(*MF, Index);
1093   MachineMemOperand *MMO = MF->getMachineMemOperand(
1094       PtrInfo, IsLoad ? MachineMemOperand::MOLoad : MachineMemOperand::MOStore,
1095       EltSize, Alignment);
1096 
1097   if (IsLoad) {
1098     unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
1099                                           : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
1100     buildSpillLoadStore(MI, Opc,
1101           Index,
1102           VGPR, false,
1103           FrameReg,
1104           Offset * EltSize, MMO,
1105           RS);
1106   } else {
1107     unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
1108                                           : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
1109     buildSpillLoadStore(MI, Opc, Index, VGPR,
1110                         IsKill, FrameReg,
1111                         Offset * EltSize, MMO, RS);
1112     // This only ever adds one VGPR spill
1113     MFI->addToSpilledVGPRs(1);
1114   }
1115 
1116   // Restore EXEC
1117   BuildMI(*MBB, MI, DL, TII->get(ExecMovOpc), ExecReg)
1118       .addReg(SavedExecReg, getKillRegState(IsLoad || IsKill));
1119 
1120   // Restore clobbered SGPRs
1121   if (IsLoad) {
1122     // Nothing to do; register will be overwritten
1123   } else if (!IsKill) {
1124     // Restore SGPRs from appropriate VGPR lanes
1125     if (!OnlyExecLo) {
1126       BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32),
1127               getSubReg(SuperReg, SplitParts[FirstPart + ExecLane + 1]))
1128           .addReg(VGPR)
1129           .addImm(ExecLane + 1);
1130     }
1131     BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32),
1132             NumSubRegs == 1 ? SavedExecReg
1133                             : Register(getSubReg(
1134                                   SuperReg, SplitParts[FirstPart + ExecLane])))
1135         .addReg(VGPR, RegState::Kill)
1136         .addImm(ExecLane);
1137   }
1138 }
1139 
1140 bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
1141                                int Index,
1142                                RegScavenger *RS,
1143                                bool OnlyToVGPR) const {
1144   MachineBasicBlock *MBB = MI->getParent();
1145   MachineFunction *MF = MBB->getParent();
1146   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1147 
1148   ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills
1149     = MFI->getSGPRToVGPRSpills(Index);
1150   bool SpillToVGPR = !VGPRSpills.empty();
1151   if (OnlyToVGPR && !SpillToVGPR)
1152     return false;
1153 
1154   const SIInstrInfo *TII = ST.getInstrInfo();
1155 
1156   Register SuperReg = MI->getOperand(0).getReg();
1157   bool IsKill = MI->getOperand(0).isKill();
1158   const DebugLoc &DL = MI->getDebugLoc();
1159 
1160   assert(SpillToVGPR || (SuperReg != MFI->getStackPtrOffsetReg() &&
1161                          SuperReg != MFI->getFrameOffsetReg()));
1162 
1163   assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
1164   assert(SuperReg != AMDGPU::EXEC_LO && SuperReg != AMDGPU::EXEC_HI &&
1165          SuperReg != AMDGPU::EXEC && "exec should never spill");
1166 
1167   unsigned EltSize = 4;
1168   const TargetRegisterClass *RC = getPhysRegClass(SuperReg);
1169 
1170   ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
1171   unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
1172 
1173   if (SpillToVGPR) {
1174     for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
1175       Register SubReg = NumSubRegs == 1
1176                             ? SuperReg
1177                             : Register(getSubReg(SuperReg, SplitParts[i]));
1178       SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i];
1179 
1180       bool UseKill = IsKill && i == NumSubRegs - 1;
1181 
1182       // Mark the "old value of vgpr" input undef only if this is the first sgpr
1183       // spill to this specific vgpr in the first basic block.
1184       auto MIB =
1185           BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_WRITELANE_B32), Spill.VGPR)
1186               .addReg(SubReg, getKillRegState(UseKill))
1187               .addImm(Spill.Lane)
1188               .addReg(Spill.VGPR);
1189 
1190       if (i == 0 && NumSubRegs > 1) {
1191         // We may be spilling a super-register which is only partially defined,
1192         // and need to ensure later spills think the value is defined.
1193         MIB.addReg(SuperReg, RegState::ImplicitDefine);
1194       }
1195 
1196       if (NumSubRegs > 1)
1197         MIB.addReg(SuperReg, getKillRegState(UseKill) | RegState::Implicit);
1198 
1199       // FIXME: Since this spills to another register instead of an actual
1200       // frame index, we should delete the frame index when all references to
1201       // it are fixed.
1202     }
1203   } else {
    // Scavenge a temporary VGPR to use. It only needs to be scavenged once
    // for any number of spilled subregs.
1206     Register TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
1207     RS->setRegUsed(TmpVGPR);
1208 
1209     // SubReg carries the "Kill" flag when SubReg == SuperReg.
1210     unsigned SubKillState = getKillRegState((NumSubRegs == 1) && IsKill);
1211 
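    // Pack up to 32 SGPRs at a time into lanes of TmpVGPR and write the VGPR
    // out to the stack slot; buildSGPRSpillLoadStore handles the required
    // EXEC mask manipulation.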
1212     unsigned PerVGPR = 32;
1213     unsigned NumVGPRs = (NumSubRegs + (PerVGPR - 1)) / PerVGPR;
1214     int64_t VGPRLanes = (1LL << std::min(PerVGPR, NumSubRegs)) - 1LL;
1215 
1216     for (unsigned Offset = 0; Offset < NumVGPRs; ++Offset) {
1217       unsigned TmpVGPRFlags = RegState::Undef;
1218 
1219       // Write sub registers into the VGPR
1220       for (unsigned i = Offset * PerVGPR,
1221                     e = std::min((Offset + 1) * PerVGPR, NumSubRegs);
1222            i < e; ++i) {
1223         Register SubReg = NumSubRegs == 1
1224                               ? SuperReg
1225                               : Register(getSubReg(SuperReg, SplitParts[i]));
1226 
1227         MachineInstrBuilder WriteLane =
1228             BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_WRITELANE_B32), TmpVGPR)
1229                 .addReg(SubReg, SubKillState)
1230                 .addImm(i % PerVGPR)
1231                 .addReg(TmpVGPR, TmpVGPRFlags);
1232         TmpVGPRFlags = 0;
1233 
1234         // There could be undef components of a spilled super register.
1235         // TODO: Can we detect this and skip the spill?
1236         if (NumSubRegs > 1) {
1237           // The last implicit use of the SuperReg carries the "Kill" flag.
1238           unsigned SuperKillState = 0;
1239           if (i + 1 == NumSubRegs)
1240             SuperKillState |= getKillRegState(IsKill);
1241           WriteLane.addReg(SuperReg, RegState::Implicit | SuperKillState);
1242         }
1243       }
1244 
1245       // Write out VGPR
1246       buildSGPRSpillLoadStore(MI, Index, Offset, EltSize, TmpVGPR, VGPRLanes,
1247                               RS, false);
1248     }
1249   }
1250 
1251   MI->eraseFromParent();
1252   MFI->addToSpilledSGPRs(NumSubRegs);
1253   return true;
1254 }
1255 
1256 bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
1257                                  int Index,
1258                                  RegScavenger *RS,
1259                                  bool OnlyToVGPR) const {
1260   MachineFunction *MF = MI->getParent()->getParent();
1261   MachineBasicBlock *MBB = MI->getParent();
1262   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1263 
1264   ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills
1265     = MFI->getSGPRToVGPRSpills(Index);
1266   bool SpillToVGPR = !VGPRSpills.empty();
1267   if (OnlyToVGPR && !SpillToVGPR)
1268     return false;
1269 
1270   const SIInstrInfo *TII = ST.getInstrInfo();
1271   const DebugLoc &DL = MI->getDebugLoc();
1272 
1273   Register SuperReg = MI->getOperand(0).getReg();
1274 
1275   assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
1276   assert(SuperReg != AMDGPU::EXEC_LO && SuperReg != AMDGPU::EXEC_HI &&
1277          SuperReg != AMDGPU::EXEC && "exec should never spill");
1278 
1279   unsigned EltSize = 4;
1280 
1281   const TargetRegisterClass *RC = getPhysRegClass(SuperReg);
1282 
1283   ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
1284   unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
1285 
1286   if (SpillToVGPR) {
1287     for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
1288       Register SubReg = NumSubRegs == 1
1289                             ? SuperReg
1290                             : Register(getSubReg(SuperReg, SplitParts[i]));
1291 
1292       SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i];
1293       auto MIB = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32), SubReg)
1294                      .addReg(Spill.VGPR)
1295                      .addImm(Spill.Lane);
1296       if (NumSubRegs > 1 && i == 0)
1297         MIB.addReg(SuperReg, RegState::ImplicitDefine);
1298     }
1299   } else {
1300     Register TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
1301     RS->setRegUsed(TmpVGPR);
1302 
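    // Reload the stack slot into TmpVGPR, up to 32 SGPR lanes at a time, and
    // unpack the lanes back into the SGPR super-register with v_readlane.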
1303     unsigned PerVGPR = 32;
1304     unsigned NumVGPRs = (NumSubRegs + (PerVGPR - 1)) / PerVGPR;
1305     int64_t VGPRLanes = (1LL << std::min(PerVGPR, NumSubRegs)) - 1LL;
1306 
1307     for (unsigned Offset = 0; Offset < NumVGPRs; ++Offset) {
1308       // Load in VGPR data
1309       buildSGPRSpillLoadStore(MI, Index, Offset, EltSize, TmpVGPR, VGPRLanes,
1310                               RS, true);
1311 
1312       // Unpack lanes
1313       for (unsigned i = Offset * PerVGPR,
1314                     e = std::min((Offset + 1) * PerVGPR, NumSubRegs);
1315            i < e; ++i) {
1316         Register SubReg = NumSubRegs == 1
1317                               ? SuperReg
1318                               : Register(getSubReg(SuperReg, SplitParts[i]));
1319 
1320         bool LastSubReg = (i + 1 == e);
1321         auto MIB =
1322             BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32), SubReg)
1323                 .addReg(TmpVGPR, getKillRegState(LastSubReg))
1324                 .addImm(i);
1325         if (NumSubRegs > 1 && i == 0)
1326           MIB.addReg(SuperReg, RegState::ImplicitDefine);
1327       }
1328     }
1329   }
1330 
1331   MI->eraseFromParent();
1332   return true;
1333 }
1334 
1335 /// Special case of eliminateFrameIndex. Returns true if the SGPR was spilled to
1336 /// a VGPR and the stack slot can be safely eliminated when all other users are
1337 /// handled.
1338 bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex(
1339   MachineBasicBlock::iterator MI,
1340   int FI,
1341   RegScavenger *RS) const {
1342   switch (MI->getOpcode()) {
1343   case AMDGPU::SI_SPILL_S1024_SAVE:
1344   case AMDGPU::SI_SPILL_S512_SAVE:
1345   case AMDGPU::SI_SPILL_S256_SAVE:
1346   case AMDGPU::SI_SPILL_S192_SAVE:
1347   case AMDGPU::SI_SPILL_S160_SAVE:
1348   case AMDGPU::SI_SPILL_S128_SAVE:
1349   case AMDGPU::SI_SPILL_S96_SAVE:
1350   case AMDGPU::SI_SPILL_S64_SAVE:
1351   case AMDGPU::SI_SPILL_S32_SAVE:
1352     return spillSGPR(MI, FI, RS, true);
1353   case AMDGPU::SI_SPILL_S1024_RESTORE:
1354   case AMDGPU::SI_SPILL_S512_RESTORE:
1355   case AMDGPU::SI_SPILL_S256_RESTORE:
1356   case AMDGPU::SI_SPILL_S192_RESTORE:
1357   case AMDGPU::SI_SPILL_S160_RESTORE:
1358   case AMDGPU::SI_SPILL_S128_RESTORE:
1359   case AMDGPU::SI_SPILL_S96_RESTORE:
1360   case AMDGPU::SI_SPILL_S64_RESTORE:
1361   case AMDGPU::SI_SPILL_S32_RESTORE:
1362     return restoreSGPR(MI, FI, RS, true);
1363   default:
1364     llvm_unreachable("not an SGPR spill instruction");
1365   }
1366 }
1367 
1368 void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
1369                                         int SPAdj, unsigned FIOperandNum,
1370                                         RegScavenger *RS) const {
1371   MachineFunction *MF = MI->getParent()->getParent();
1372   MachineBasicBlock *MBB = MI->getParent();
1373   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
1374   MachineFrameInfo &FrameInfo = MF->getFrameInfo();
1375   const SIInstrInfo *TII = ST.getInstrInfo();
1376   DebugLoc DL = MI->getDebugLoc();
1377 
1378   assert(SPAdj == 0 && "unhandled SP adjustment in call sequence?");
1379 
1380   MachineOperand &FIOp = MI->getOperand(FIOperandNum);
1381   int Index = MI->getOperand(FIOperandNum).getIndex();
1382 
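  // Fixed frame objects are addressed through the base pointer when one is in
  // use; everything else is addressed relative to the frame register.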
1383   Register FrameReg = FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(*MF)
1384                           ? getBaseRegister()
1385                           : getFrameRegister(*MF);
1386 
1387   switch (MI->getOpcode()) {
1388     // SGPR register spill
1389     case AMDGPU::SI_SPILL_S1024_SAVE:
1390     case AMDGPU::SI_SPILL_S512_SAVE:
1391     case AMDGPU::SI_SPILL_S256_SAVE:
1392     case AMDGPU::SI_SPILL_S192_SAVE:
1393     case AMDGPU::SI_SPILL_S160_SAVE:
1394     case AMDGPU::SI_SPILL_S128_SAVE:
1395     case AMDGPU::SI_SPILL_S96_SAVE:
1396     case AMDGPU::SI_SPILL_S64_SAVE:
1397     case AMDGPU::SI_SPILL_S32_SAVE: {
1398       spillSGPR(MI, Index, RS);
1399       break;
1400     }
1401 
1402     // SGPR register restore
1403     case AMDGPU::SI_SPILL_S1024_RESTORE:
1404     case AMDGPU::SI_SPILL_S512_RESTORE:
1405     case AMDGPU::SI_SPILL_S256_RESTORE:
1406     case AMDGPU::SI_SPILL_S192_RESTORE:
1407     case AMDGPU::SI_SPILL_S160_RESTORE:
1408     case AMDGPU::SI_SPILL_S128_RESTORE:
1409     case AMDGPU::SI_SPILL_S96_RESTORE:
1410     case AMDGPU::SI_SPILL_S64_RESTORE:
1411     case AMDGPU::SI_SPILL_S32_RESTORE: {
1412       restoreSGPR(MI, Index, RS);
1413       break;
1414     }
1415 
1416     // VGPR register spill
1417     case AMDGPU::SI_SPILL_V1024_SAVE:
1418     case AMDGPU::SI_SPILL_V512_SAVE:
1419     case AMDGPU::SI_SPILL_V256_SAVE:
1420     case AMDGPU::SI_SPILL_V192_SAVE:
1421     case AMDGPU::SI_SPILL_V160_SAVE:
1422     case AMDGPU::SI_SPILL_V128_SAVE:
1423     case AMDGPU::SI_SPILL_V96_SAVE:
1424     case AMDGPU::SI_SPILL_V64_SAVE:
1425     case AMDGPU::SI_SPILL_V32_SAVE:
1426     case AMDGPU::SI_SPILL_A1024_SAVE:
1427     case AMDGPU::SI_SPILL_A512_SAVE:
1428     case AMDGPU::SI_SPILL_A256_SAVE:
1429     case AMDGPU::SI_SPILL_A192_SAVE:
1430     case AMDGPU::SI_SPILL_A160_SAVE:
1431     case AMDGPU::SI_SPILL_A128_SAVE:
1432     case AMDGPU::SI_SPILL_A96_SAVE:
1433     case AMDGPU::SI_SPILL_A64_SAVE:
1434     case AMDGPU::SI_SPILL_A32_SAVE: {
1435       const MachineOperand *VData = TII->getNamedOperand(*MI,
1436                                                          AMDGPU::OpName::vdata);
1437       assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
1438              MFI->getStackPtrOffsetReg());
1439 
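      // With flat scratch the spill is a scratch store with an SGPR address;
      // otherwise use a MUBUF store with an immediate offset.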
1440       unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
1441                                             : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
1442       buildSpillLoadStore(MI, Opc,
1443             Index,
1444             VData->getReg(), VData->isKill(),
1445             FrameReg,
1446             TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
1447             *MI->memoperands_begin(),
1448             RS);
1449       MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(MI->getOpcode()));
1450       MI->eraseFromParent();
1451       break;
1452     }
1453     case AMDGPU::SI_SPILL_V32_RESTORE:
1454     case AMDGPU::SI_SPILL_V64_RESTORE:
1455     case AMDGPU::SI_SPILL_V96_RESTORE:
1456     case AMDGPU::SI_SPILL_V128_RESTORE:
1457     case AMDGPU::SI_SPILL_V160_RESTORE:
1458     case AMDGPU::SI_SPILL_V192_RESTORE:
1459     case AMDGPU::SI_SPILL_V256_RESTORE:
1460     case AMDGPU::SI_SPILL_V512_RESTORE:
1461     case AMDGPU::SI_SPILL_V1024_RESTORE:
1462     case AMDGPU::SI_SPILL_A32_RESTORE:
1463     case AMDGPU::SI_SPILL_A64_RESTORE:
1464     case AMDGPU::SI_SPILL_A96_RESTORE:
1465     case AMDGPU::SI_SPILL_A128_RESTORE:
1466     case AMDGPU::SI_SPILL_A160_RESTORE:
1467     case AMDGPU::SI_SPILL_A192_RESTORE:
1468     case AMDGPU::SI_SPILL_A256_RESTORE:
1469     case AMDGPU::SI_SPILL_A512_RESTORE:
1470     case AMDGPU::SI_SPILL_A1024_RESTORE: {
1471       const MachineOperand *VData = TII->getNamedOperand(*MI,
1472                                                          AMDGPU::OpName::vdata);
1473       assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
1474              MFI->getStackPtrOffsetReg());
1475 
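      // Mirror of the spill case above: scratch load with an SGPR address, or
      // a MUBUF load with an immediate offset.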
1476       unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
1477                                             : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
1478       buildSpillLoadStore(MI, Opc,
1479             Index,
1480             VData->getReg(), VData->isKill(),
1481             FrameReg,
1482             TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
1483             *MI->memoperands_begin(),
1484             RS);
1485       MI->eraseFromParent();
1486       break;
1487     }
1488 
1489     default: {
1490       const DebugLoc &DL = MI->getDebugLoc();
1491 
1492       int64_t Offset = FrameInfo.getObjectOffset(Index);
1493       if (ST.enableFlatScratch()) {
1494         if (TII->isFLATScratch(*MI)) {
1495           assert((int16_t)FIOperandNum ==
1496                  AMDGPU::getNamedOperandIdx(MI->getOpcode(),
1497                                             AMDGPU::OpName::saddr));
1498 
          // The offset is always swizzled; just replace it.
1500           if (FrameReg)
1501             FIOp.ChangeToRegister(FrameReg, false);
1502 
1503           if (!Offset)
1504             return;
1505 
1506           MachineOperand *OffsetOp =
1507             TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
1508           int64_t NewOffset = Offset + OffsetOp->getImm();
1509           if (TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
1510                                      true)) {
1511             OffsetOp->setImm(NewOffset);
1512             if (FrameReg)
1513               return;
1514             Offset = 0;
1515           }
1516 
1517           assert(!TII->getNamedOperand(*MI, AMDGPU::OpName::vaddr) &&
1518                  "Unexpected vaddr for flat scratch with a FI operand");
1519 
          // On GFX10 we have ST mode, which uses no registers for the address.
          // Otherwise we need to materialize 0 into an SGPR.
1522           if (!Offset && ST.hasFlatScratchSTMode()) {
1523             unsigned Opc = MI->getOpcode();
1524             unsigned NewOpc = AMDGPU::getFlatScratchInstSTfromSS(Opc);
1525             MI->RemoveOperand(
1526                 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr));
1527             MI->setDesc(TII->get(NewOpc));
1528             return;
1529           }
1530         }
1531 
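        // Without a frame register the frame offset alone may already be a
        // legal immediate operand.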
1532         if (!FrameReg) {
1533           FIOp.ChangeToImmediate(Offset);
1534           if (TII->isImmOperandLegal(*MI, FIOperandNum, FIOp))
1535             return;
1536         }
1537 
        // We need to use a register here. Check whether we can use an SGPR or
        // need a VGPR.
1540         FIOp.ChangeToRegister(AMDGPU::M0, false);
1541         bool UseSGPR = TII->isOperandLegal(*MI, FIOperandNum, &FIOp);
1542 
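        // No offset to add; if the operand accepts an SGPR, the frame register
        // can be used directly.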
1543         if (!Offset && FrameReg && UseSGPR) {
1544           FIOp.setReg(FrameReg);
1545           return;
1546         }
1547 
1548         const TargetRegisterClass *RC = UseSGPR ? &AMDGPU::SReg_32_XM0RegClass
1549                                                 : &AMDGPU::VGPR_32RegClass;
1550 
1551         Register TmpReg = RS->scavengeRegister(RC, MI, 0, !UseSGPR);
1552         FIOp.setReg(TmpReg);
1553         FIOp.setIsKill(true);
1554 
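        // Only one of the frame register and the offset is present, so a
        // single mov of whichever one exists into the scavenged register is
        // enough.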
1555         if ((!FrameReg || !Offset) && TmpReg) {
1556           unsigned Opc = UseSGPR ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
1557           auto MIB = BuildMI(*MBB, MI, DL, TII->get(Opc), TmpReg);
1558           if (FrameReg)
1559             MIB.addReg(FrameReg);
1560           else
1561             MIB.addImm(Offset);
1562 
1563           return;
1564         }
1565 
1566         Register TmpSReg =
1567             UseSGPR ? TmpReg
1568                     : RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0,
1569                                            !UseSGPR);
1570 
1571         // TODO: for flat scratch another attempt can be made with a VGPR index
1572         //       if no SGPRs can be scavenged.
1573         if ((!TmpSReg && !FrameReg) || (!TmpReg && !UseSGPR))
1574           report_fatal_error("Cannot scavenge register in FI elimination!");
1575 
1576         if (!TmpSReg) {
1577           // Use frame register and restore it after.
1578           TmpSReg = FrameReg;
1579           FIOp.setReg(FrameReg);
1580           FIOp.setIsKill(false);
1581         }
1582 
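        // Compute FrameReg + Offset into the SGPR, then copy it into a VGPR if
        // the operand requires one.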
1583         BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), TmpSReg)
1584           .addReg(FrameReg)
1585           .addImm(Offset);
1586 
1587         if (!UseSGPR)
1588           BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
1589             .addReg(TmpSReg, RegState::Kill);
1590 
1591         if (TmpSReg == FrameReg) {
1592           // Undo frame register modification.
1593           BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_SUB_U32),
1594                   FrameReg)
1595             .addReg(FrameReg)
1596             .addImm(Offset);
1597         }
1598 
1599         return;
1600       }
1601 
1602       bool IsMUBUF = TII->isMUBUF(*MI);
1603 
1604       if (!IsMUBUF && !MFI->isEntryFunction()) {
1605         // Convert to a swizzled stack address by scaling by the wave size.
1606         //
1607         // In an entry function/kernel the offset is already swizzled.
1608 
1609         bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32;
1610         Register ResultReg =
1611             IsCopy ? MI->getOperand(0).getReg()
1612                    : RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
1613 
1614         int64_t Offset = FrameInfo.getObjectOffset(Index);
1615         if (Offset == 0) {
1616           // XXX - This never happens because of emergency scavenging slot at 0?
1617           BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ResultReg)
1618             .addImm(ST.getWavefrontSizeLog2())
1619             .addReg(FrameReg);
1620         } else {
1621           if (auto MIB = TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)) {
1622             // Reuse ResultReg in intermediate step.
1623             Register ScaledReg = ResultReg;
1624 
1625             BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
1626                     ScaledReg)
1627               .addImm(ST.getWavefrontSizeLog2())
1628               .addReg(FrameReg);
1629 
1630             const bool IsVOP2 = MIB->getOpcode() == AMDGPU::V_ADD_U32_e32;
1631 
1632             // TODO: Fold if use instruction is another add of a constant.
            if (IsVOP2 ||
                AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) {
1634               // FIXME: This can fail
1635               MIB.addImm(Offset);
1636               MIB.addReg(ScaledReg, RegState::Kill);
1637               if (!IsVOP2)
1638                 MIB.addImm(0); // clamp bit
1639             } else {
1640               assert(MIB->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 &&
1641                      "Need to reuse carry out register");
1642 
1643               // Use scavenged unused carry out as offset register.
1644               Register ConstOffsetReg;
1645               if (!isWave32)
1646                 ConstOffsetReg = getSubReg(MIB.getReg(1), AMDGPU::sub0);
1647               else
1648                 ConstOffsetReg = MIB.getReg(1);
1649 
              BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::S_MOV_B32),
                      ConstOffsetReg)
                .addImm(Offset);
1652               MIB.addReg(ConstOffsetReg, RegState::Kill);
1653               MIB.addReg(ScaledReg, RegState::Kill);
1654               MIB.addImm(0); // clamp bit
1655             }
1656           } else {
1657             // We have to produce a carry out, and there isn't a free SGPR pair
1658             // for it. We can keep the whole computation on the SALU to avoid
1659             // clobbering an additional register at the cost of an extra mov.
1660 
1661             // We may have 1 free scratch SGPR even though a carry out is
1662             // unavailable. Only one additional mov is needed.
1663             Register TmpScaledReg =
1664                 RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, false);
1665             Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : FrameReg;
1666 
1667             BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), ScaledReg)
1668               .addReg(FrameReg)
1669               .addImm(ST.getWavefrontSizeLog2());
1670             BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), ScaledReg)
1671               .addReg(ScaledReg, RegState::Kill)
1672               .addImm(Offset);
1673             BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), ResultReg)
1674               .addReg(ScaledReg, RegState::Kill);
1675 
1676             // If there were truly no free SGPRs, we need to undo everything.
1677             if (!TmpScaledReg.isValid()) {
1678               BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), ScaledReg)
1679                 .addReg(ScaledReg, RegState::Kill)
1680                 .addImm(Offset);
1681               BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHL_B32), ScaledReg)
1682                 .addReg(FrameReg)
1683                 .addImm(ST.getWavefrontSizeLog2());
1684             }
1685           }
1686         }
1687 
1688         // Don't introduce an extra copy if we're just materializing in a mov.
1689         if (IsCopy)
1690           MI->eraseFromParent();
1691         else
1692           FIOp.ChangeToRegister(ResultReg, false, false, true);
1693         return;
1694       }
1695 
1696       if (IsMUBUF) {
1697         // Disable offen so we don't need a 0 vgpr base.
1698         assert(static_cast<int>(FIOperandNum) ==
1699                AMDGPU::getNamedOperandIdx(MI->getOpcode(),
1700                                           AMDGPU::OpName::vaddr));
1701 
1702         auto &SOffset = *TII->getNamedOperand(*MI, AMDGPU::OpName::soffset);
1703         assert((SOffset.isImm() && SOffset.getImm() == 0));
1704 
1705         if (FrameReg != AMDGPU::NoRegister)
1706           SOffset.ChangeToRegister(FrameReg, false);
1707 
1708         int64_t Offset = FrameInfo.getObjectOffset(Index);
1709         int64_t OldImm
1710           = TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm();
1711         int64_t NewOffset = OldImm + Offset;
1712 
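        // Fold the frame offset into the instruction's immediate offset if the
        // result still fits in the MUBUF offset field.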
1713         if (SIInstrInfo::isLegalMUBUFImmOffset(NewOffset) &&
1714             buildMUBUFOffsetLoadStore(ST, FrameInfo, MI, Index, NewOffset)) {
1715           MI->eraseFromParent();
1716           return;
1717         }
1718       }
1719 
1720       // If the offset is simply too big, don't convert to a scratch wave offset
1721       // relative index.
1722 
1723       FIOp.ChangeToImmediate(Offset);
1724       if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) {
1725         Register TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
1726         BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
1727           .addImm(Offset);
1728         FIOp.ChangeToRegister(TmpReg, false, false, true);
1729       }
1730     }
1731   }
1732 }
1733 
1734 StringRef SIRegisterInfo::getRegAsmName(MCRegister Reg) const {
1735   return AMDGPUInstPrinter::getRegisterName(Reg);
1736 }
1737 
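// The *ClassForBitWidth helpers below return the narrowest register class of
// the given kind that covers the requested width, or nullptr for widths beyond
// the largest (1024-bit) class.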
1738 const TargetRegisterClass *
1739 SIRegisterInfo::getVGPRClassForBitWidth(unsigned BitWidth) {
1740   if (BitWidth == 1)
1741     return &AMDGPU::VReg_1RegClass;
1742   if (BitWidth <= 16)
1743     return &AMDGPU::VGPR_LO16RegClass;
1744   if (BitWidth <= 32)
1745     return &AMDGPU::VGPR_32RegClass;
1746   if (BitWidth <= 64)
1747     return &AMDGPU::VReg_64RegClass;
1748   if (BitWidth <= 96)
1749     return &AMDGPU::VReg_96RegClass;
1750   if (BitWidth <= 128)
1751     return &AMDGPU::VReg_128RegClass;
1752   if (BitWidth <= 160)
1753     return &AMDGPU::VReg_160RegClass;
1754   if (BitWidth <= 192)
1755     return &AMDGPU::VReg_192RegClass;
1756   if (BitWidth <= 256)
1757     return &AMDGPU::VReg_256RegClass;
1758   if (BitWidth <= 512)
1759     return &AMDGPU::VReg_512RegClass;
1760   if (BitWidth <= 1024)
1761     return &AMDGPU::VReg_1024RegClass;
1762 
1763   return nullptr;
1764 }
1765 
1766 const TargetRegisterClass *
1767 SIRegisterInfo::getAGPRClassForBitWidth(unsigned BitWidth) {
1768   if (BitWidth <= 16)
1769     return &AMDGPU::AGPR_LO16RegClass;
1770   if (BitWidth <= 32)
1771     return &AMDGPU::AGPR_32RegClass;
1772   if (BitWidth <= 64)
1773     return &AMDGPU::AReg_64RegClass;
1774   if (BitWidth <= 96)
1775     return &AMDGPU::AReg_96RegClass;
1776   if (BitWidth <= 128)
1777     return &AMDGPU::AReg_128RegClass;
1778   if (BitWidth <= 160)
1779     return &AMDGPU::AReg_160RegClass;
1780   if (BitWidth <= 192)
1781     return &AMDGPU::AReg_192RegClass;
1782   if (BitWidth <= 256)
1783     return &AMDGPU::AReg_256RegClass;
1784   if (BitWidth <= 512)
1785     return &AMDGPU::AReg_512RegClass;
1786   if (BitWidth <= 1024)
1787     return &AMDGPU::AReg_1024RegClass;
1788 
1789   return nullptr;
1790 }
1791 
1792 const TargetRegisterClass *
1793 SIRegisterInfo::getSGPRClassForBitWidth(unsigned BitWidth) {
1794   if (BitWidth <= 16)
1795     return &AMDGPU::SGPR_LO16RegClass;
1796   if (BitWidth <= 32)
1797     return &AMDGPU::SReg_32RegClass;
1798   if (BitWidth <= 64)
1799     return &AMDGPU::SReg_64RegClass;
1800   if (BitWidth <= 96)
1801     return &AMDGPU::SGPR_96RegClass;
1802   if (BitWidth <= 128)
1803     return &AMDGPU::SGPR_128RegClass;
1804   if (BitWidth <= 160)
1805     return &AMDGPU::SGPR_160RegClass;
1806   if (BitWidth <= 192)
1807     return &AMDGPU::SGPR_192RegClass;
1808   if (BitWidth <= 256)
1809     return &AMDGPU::SGPR_256RegClass;
1810   if (BitWidth <= 512)
1811     return &AMDGPU::SGPR_512RegClass;
1812   if (BitWidth <= 1024)
1813     return &AMDGPU::SGPR_1024RegClass;
1814 
1815   return nullptr;
1816 }
1817 
1818 // FIXME: This is very slow. It might be worth creating a map from physreg to
1819 // register class.
1820 const TargetRegisterClass *
1821 SIRegisterInfo::getPhysRegClass(MCRegister Reg) const {
1822   static const TargetRegisterClass *const BaseClasses[] = {
1823     &AMDGPU::VGPR_LO16RegClass,
1824     &AMDGPU::VGPR_HI16RegClass,
1825     &AMDGPU::SReg_LO16RegClass,
1826     &AMDGPU::AGPR_LO16RegClass,
1827     &AMDGPU::VGPR_32RegClass,
1828     &AMDGPU::SReg_32RegClass,
1829     &AMDGPU::AGPR_32RegClass,
1830     &AMDGPU::VReg_64RegClass,
1831     &AMDGPU::SReg_64RegClass,
1832     &AMDGPU::AReg_64RegClass,
1833     &AMDGPU::VReg_96RegClass,
1834     &AMDGPU::SReg_96RegClass,
1835     &AMDGPU::AReg_96RegClass,
1836     &AMDGPU::VReg_128RegClass,
1837     &AMDGPU::SReg_128RegClass,
1838     &AMDGPU::AReg_128RegClass,
1839     &AMDGPU::VReg_160RegClass,
1840     &AMDGPU::SReg_160RegClass,
1841     &AMDGPU::AReg_160RegClass,
1842     &AMDGPU::VReg_192RegClass,
1843     &AMDGPU::SReg_192RegClass,
1844     &AMDGPU::AReg_192RegClass,
1845     &AMDGPU::VReg_256RegClass,
1846     &AMDGPU::SReg_256RegClass,
1847     &AMDGPU::AReg_256RegClass,
1848     &AMDGPU::VReg_512RegClass,
1849     &AMDGPU::SReg_512RegClass,
1850     &AMDGPU::AReg_512RegClass,
1851     &AMDGPU::SReg_1024RegClass,
1852     &AMDGPU::VReg_1024RegClass,
1853     &AMDGPU::AReg_1024RegClass,
1854     &AMDGPU::SCC_CLASSRegClass,
1855     &AMDGPU::Pseudo_SReg_32RegClass,
1856     &AMDGPU::Pseudo_SReg_128RegClass,
1857   };
1858 
1859   for (const TargetRegisterClass *BaseClass : BaseClasses) {
1860     if (BaseClass->contains(Reg)) {
1861       return BaseClass;
1862     }
1863   }
1864   return nullptr;
1865 }
1866 
1867 bool SIRegisterInfo::isSGPRReg(const MachineRegisterInfo &MRI,
1868                                Register Reg) const {
1869   const TargetRegisterClass *RC;
1870   if (Reg.isVirtual())
1871     RC = MRI.getRegClass(Reg);
1872   else
1873     RC = getPhysRegClass(Reg);
1874   return isSGPRClass(RC);
1875 }
1876 
// TODO: It might be helpful to have some target-specific flags in
1878 // TargetRegisterClass to mark which classes are VGPRs to make this trivial.
1879 bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const {
1880   unsigned Size = getRegSizeInBits(*RC);
1881   if (Size == 16) {
1882     return getCommonSubClass(&AMDGPU::VGPR_LO16RegClass, RC) != nullptr ||
1883            getCommonSubClass(&AMDGPU::VGPR_HI16RegClass, RC) != nullptr;
1884   }
1885   const TargetRegisterClass *VRC = getVGPRClassForBitWidth(Size);
1886   if (!VRC) {
1887     assert(Size < 32 && "Invalid register class size");
1888     return false;
1889   }
1890   return getCommonSubClass(VRC, RC) != nullptr;
1891 }
1892 
1893 bool SIRegisterInfo::hasAGPRs(const TargetRegisterClass *RC) const {
1894   unsigned Size = getRegSizeInBits(*RC);
1895   if (Size < 16)
1896     return false;
1897   const TargetRegisterClass *ARC = getAGPRClassForBitWidth(Size);
1898   if (!ARC) {
1899     assert(getVGPRClassForBitWidth(Size) && "Invalid register class size");
1900     return false;
1901   }
1902   return getCommonSubClass(ARC, RC) != nullptr;
1903 }
1904 
1905 const TargetRegisterClass *
1906 SIRegisterInfo::getEquivalentVGPRClass(const TargetRegisterClass *SRC) const {
1907   unsigned Size = getRegSizeInBits(*SRC);
1908   const TargetRegisterClass *VRC = getVGPRClassForBitWidth(Size);
1909   assert(VRC && "Invalid register class size");
1910   return VRC;
1911 }
1912 
1913 const TargetRegisterClass *
1914 SIRegisterInfo::getEquivalentAGPRClass(const TargetRegisterClass *SRC) const {
1915   unsigned Size = getRegSizeInBits(*SRC);
1916   const TargetRegisterClass *ARC = getAGPRClassForBitWidth(Size);
1917   assert(ARC && "Invalid register class size");
1918   return ARC;
1919 }
1920 
1921 const TargetRegisterClass *
1922 SIRegisterInfo::getEquivalentSGPRClass(const TargetRegisterClass *VRC) const {
1923   unsigned Size = getRegSizeInBits(*VRC);
1924   if (Size == 32)
1925     return &AMDGPU::SGPR_32RegClass;
1926   const TargetRegisterClass *SRC = getSGPRClassForBitWidth(Size);
1927   assert(SRC && "Invalid register class size");
1928   return SRC;
1929 }
1930 
1931 const TargetRegisterClass *SIRegisterInfo::getSubRegClass(
1932                          const TargetRegisterClass *RC, unsigned SubIdx) const {
1933   if (SubIdx == AMDGPU::NoSubRegister)
1934     return RC;
1935 
1936   // We can assume that each lane corresponds to one 32-bit register.
1937   unsigned Size = getNumChannelsFromSubReg(SubIdx) * 32;
1938   if (isSGPRClass(RC)) {
1939     if (Size == 32)
1940       RC = &AMDGPU::SGPR_32RegClass;
1941     else
1942       RC = getSGPRClassForBitWidth(Size);
1943   } else if (hasAGPRs(RC)) {
1944     RC = getAGPRClassForBitWidth(Size);
1945   } else {
1946     RC = getVGPRClassForBitWidth(Size);
1947   }
1948   assert(RC && "Invalid sub-register class size");
1949   return RC;
1950 }
1951 
1952 bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const {
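  // MAI (AC) operands normally accept inline constants, but not on subtargets
  // with the MFMA inline literal hardware bug.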
1953   if (OpType >= AMDGPU::OPERAND_REG_INLINE_AC_FIRST &&
1954       OpType <= AMDGPU::OPERAND_REG_INLINE_AC_LAST)
1955     return !ST.hasMFMAInlineLiteralBug();
1956 
1957   return OpType >= AMDGPU::OPERAND_SRC_FIRST &&
1958          OpType <= AMDGPU::OPERAND_SRC_LAST;
1959 }
1960 
1961 bool SIRegisterInfo::shouldRewriteCopySrc(
1962   const TargetRegisterClass *DefRC,
1963   unsigned DefSubReg,
1964   const TargetRegisterClass *SrcRC,
1965   unsigned SrcSubReg) const {
1966   // We want to prefer the smallest register class possible, so we don't want to
1967   // stop and rewrite on anything that looks like a subregister
1968   // extract. Operations mostly don't care about the super register class, so we
1969   // only want to stop on the most basic of copies between the same register
1970   // class.
1971   //
1972   // e.g. if we have something like
1973   // %0 = ...
1974   // %1 = ...
  // %2 = REG_SEQUENCE %0, sub0, %1, sub1
  // %3 = COPY %2.sub0
1977   //
1978   // We want to look through the COPY to find:
1979   //  => %3 = COPY %0
1980 
1981   // Plain copy.
1982   return getCommonSubClass(DefRC, SrcRC) != nullptr;
1983 }
1984 
1985 bool SIRegisterInfo::opCanUseLiteralConstant(unsigned OpType) const {
1986   // TODO: 64-bit operands have extending behavior from 32-bit literal.
1987   return OpType >= AMDGPU::OPERAND_REG_IMM_FIRST &&
1988          OpType <= AMDGPU::OPERAND_REG_IMM_LAST;
1989 }
1990 
/// Returns the lowest register that is not used at any point in the function.
/// If all registers are used, then this function will return
/// AMDGPU::NoRegister. If \p ReserveHighestVGPR = true, then return the
/// highest unused register.
1995 MCRegister SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI,
1996                                               const TargetRegisterClass *RC,
1997                                               const MachineFunction &MF,
1998                                               bool ReserveHighestVGPR) const {
1999   if (ReserveHighestVGPR) {
2000     for (MCRegister Reg : reverse(*RC))
2001       if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))
2002         return Reg;
2003   } else {
2004     for (MCRegister Reg : *RC)
2005       if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))
2006         return Reg;
2007   }
2008   return MCRegister();
2009 }
2010 
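// Return the sub-register indices that split a register of class RC into
// contiguous EltSize-byte pieces, e.g. sub0..sub3 for a 128-bit class split
// into dwords.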
ArrayRef<int16_t>
SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC,
                                 unsigned EltSize) const {
2013   const unsigned RegBitWidth = AMDGPU::getRegBitWidth(*RC->MC);
2014   assert(RegBitWidth >= 32 && RegBitWidth <= 1024);
2015 
2016   const unsigned RegDWORDs = RegBitWidth / 32;
2017   const unsigned EltDWORDs = EltSize / 4;
2018   assert(RegSplitParts.size() + 1 >= EltDWORDs);
2019 
2020   const std::vector<int16_t> &Parts = RegSplitParts[EltDWORDs - 1];
2021   const unsigned NumParts = RegDWORDs / EltDWORDs;
2022 
2023   return makeArrayRef(Parts.data(), NumParts);
2024 }
2025 
2026 const TargetRegisterClass*
2027 SIRegisterInfo::getRegClassForReg(const MachineRegisterInfo &MRI,
2028                                   Register Reg) const {
2029   return Reg.isVirtual() ? MRI.getRegClass(Reg) : getPhysRegClass(Reg);
2030 }
2031 
2032 bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI,
2033                             Register Reg) const {
2034   const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg);
2035   // Registers without classes are unaddressable, SGPR-like registers.
2036   return RC && hasVGPRs(RC);
2037 }
2038 
2039 bool SIRegisterInfo::isAGPR(const MachineRegisterInfo &MRI,
2040                             Register Reg) const {
2041   const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg);
2042 
2043   // Registers without classes are unaddressable, SGPR-like registers.
2044   return RC && hasAGPRs(RC);
2045 }
2046 
2047 bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI,
2048                                     const TargetRegisterClass *SrcRC,
2049                                     unsigned SubReg,
2050                                     const TargetRegisterClass *DstRC,
2051                                     unsigned DstSubReg,
2052                                     const TargetRegisterClass *NewRC,
2053                                     LiveIntervals &LIS) const {
2054   unsigned SrcSize = getRegSizeInBits(*SrcRC);
2055   unsigned DstSize = getRegSizeInBits(*DstRC);
2056   unsigned NewSize = getRegSizeInBits(*NewRC);
2057 
  // Do not increase the size of registers beyond a dword; we would need to
  // allocate adjacent registers and constrain regalloc more than needed.
2060 
2061   // Always allow dword coalescing.
2062   if (SrcSize <= 32 || DstSize <= 32)
2063     return true;
2064 
2065   return NewSize <= DstSize || NewSize <= SrcSize;
2066 }
2067 
2068 unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
2069                                              MachineFunction &MF) const {
2070   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
2071 
2072   unsigned Occupancy = ST.getOccupancyWithLocalMemSize(MFI->getLDSSize(),
2073                                                        MF.getFunction());
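  // The pressure limit for these classes is the number of registers available
  // at the computed occupancy, capped by any per-function limit.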
2074   switch (RC->getID()) {
2075   default:
2076     return AMDGPUGenRegisterInfo::getRegPressureLimit(RC, MF);
2077   case AMDGPU::VGPR_32RegClassID:
2078   case AMDGPU::VGPR_LO16RegClassID:
2079   case AMDGPU::VGPR_HI16RegClassID:
2080     return std::min(ST.getMaxNumVGPRs(Occupancy), ST.getMaxNumVGPRs(MF));
2081   case AMDGPU::SGPR_32RegClassID:
2082   case AMDGPU::SGPR_LO16RegClassID:
2083     return std::min(ST.getMaxNumSGPRs(Occupancy, true), ST.getMaxNumSGPRs(MF));
2084   }
2085 }
2086 
2087 unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF,
2088                                                 unsigned Idx) const {
2089   if (Idx == AMDGPU::RegisterPressureSets::VGPR_32 ||
2090       Idx == AMDGPU::RegisterPressureSets::AGPR_32)
2091     return getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
2092                                const_cast<MachineFunction &>(MF));
2093 
2094   if (Idx == AMDGPU::RegisterPressureSets::SReg_32)
2095     return getRegPressureLimit(&AMDGPU::SGPR_32RegClass,
2096                                const_cast<MachineFunction &>(MF));
2097 
2098   llvm_unreachable("Unexpected register pressure set!");
2099 }
2100 
2101 const int *SIRegisterInfo::getRegUnitPressureSets(unsigned RegUnit) const {
2102   static const int Empty[] = { -1 };
2103 
2104   if (RegPressureIgnoredUnits[RegUnit])
2105     return Empty;
2106 
2107   return AMDGPUGenRegisterInfo::getRegUnitPressureSets(RegUnit);
2108 }
2109 
2110 MCRegister SIRegisterInfo::getReturnAddressReg(const MachineFunction &MF) const {
  // Not a callee-saved register.
2112   return AMDGPU::SGPR30_SGPR31;
2113 }
2114 
2115 const TargetRegisterClass *
2116 SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size,
2117                                          const RegisterBank &RB,
2118                                          const MachineRegisterInfo &MRI) const {
2119   switch (RB.getID()) {
2120   case AMDGPU::VGPRRegBankID:
2121     return getVGPRClassForBitWidth(std::max(32u, Size));
2122   case AMDGPU::VCCRegBankID:
2123     assert(Size == 1);
2124     return isWave32 ? &AMDGPU::SReg_32_XM0_XEXECRegClass
2125                     : &AMDGPU::SReg_64_XEXECRegClass;
2126   case AMDGPU::SGPRRegBankID:
2127     return getSGPRClassForBitWidth(std::max(32u, Size));
2128   case AMDGPU::AGPRRegBankID:
2129     return getAGPRClassForBitWidth(std::max(32u, Size));
2130   default:
2131     llvm_unreachable("unknown register bank");
2132   }
2133 }
2134 
2135 const TargetRegisterClass *
2136 SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO,
2137                                          const MachineRegisterInfo &MRI) const {
2138   const RegClassOrRegBank &RCOrRB = MRI.getRegClassOrRegBank(MO.getReg());
2139   if (const RegisterBank *RB = RCOrRB.dyn_cast<const RegisterBank*>())
2140     return getRegClassForTypeOnBank(MRI.getType(MO.getReg()), *RB, MRI);
2141 
2142   const TargetRegisterClass *RC = RCOrRB.get<const TargetRegisterClass*>();
2143   return getAllocatableClass(RC);
2144 }
2145 
2146 MCRegister SIRegisterInfo::getVCC() const {
2147   return isWave32 ? AMDGPU::VCC_LO : AMDGPU::VCC;
2148 }
2149 
2150 const TargetRegisterClass *
2151 SIRegisterInfo::getRegClass(unsigned RCID) const {
2152   switch ((int)RCID) {
2153   case AMDGPU::SReg_1RegClassID:
2154     return getBoolRC();
2155   case AMDGPU::SReg_1_XEXECRegClassID:
2156     return isWave32 ? &AMDGPU::SReg_32_XM0_XEXECRegClass
2157       : &AMDGPU::SReg_64_XEXECRegClass;
2158   case -1:
2159     return nullptr;
2160   default:
2161     return AMDGPUGenRegisterInfo::getRegClass(RCID);
2162   }
2163 }
2164 
2165 // Find reaching register definition
2166 MachineInstr *SIRegisterInfo::findReachingDef(Register Reg, unsigned SubReg,
2167                                               MachineInstr &Use,
2168                                               MachineRegisterInfo &MRI,
2169                                               LiveIntervals *LIS) const {
2170   auto &MDT = LIS->getAnalysis<MachineDominatorTree>();
2171   SlotIndex UseIdx = LIS->getInstructionIndex(Use);
2172   SlotIndex DefIdx;
2173 
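  // For virtual registers, take the value number live at the use, honoring
  // subranges when only some lanes are read. For physical registers, walk the
  // register units and keep the def closest to the use.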
2174   if (Reg.isVirtual()) {
2175     if (!LIS->hasInterval(Reg))
2176       return nullptr;
2177     LiveInterval &LI = LIS->getInterval(Reg);
2178     LaneBitmask SubLanes = SubReg ? getSubRegIndexLaneMask(SubReg)
2179                                   : MRI.getMaxLaneMaskForVReg(Reg);
2180     VNInfo *V = nullptr;
2181     if (LI.hasSubRanges()) {
2182       for (auto &S : LI.subranges()) {
2183         if ((S.LaneMask & SubLanes) == SubLanes) {
2184           V = S.getVNInfoAt(UseIdx);
2185           break;
2186         }
2187       }
2188     } else {
2189       V = LI.getVNInfoAt(UseIdx);
2190     }
2191     if (!V)
2192       return nullptr;
2193     DefIdx = V->def;
2194   } else {
2195     // Find last def.
2196     for (MCRegUnitIterator Units(Reg.asMCReg(), this); Units.isValid();
2197          ++Units) {
2198       LiveRange &LR = LIS->getRegUnit(*Units);
2199       if (VNInfo *V = LR.getVNInfoAt(UseIdx)) {
2200         if (!DefIdx.isValid() ||
2201             MDT.dominates(LIS->getInstructionFromIndex(DefIdx),
2202                           LIS->getInstructionFromIndex(V->def)))
2203           DefIdx = V->def;
2204       } else {
2205         return nullptr;
2206       }
2207     }
2208   }
2209 
2210   MachineInstr *Def = LIS->getInstructionFromIndex(DefIdx);
2211 
2212   if (!Def || !MDT.dominates(Def, &Use))
2213     return nullptr;
2214 
2215   assert(Def->modifiesRegister(Reg, this));
2216 
2217   return Def;
2218 }
2219 
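// Map a 16-bit register to the 32-bit register containing it, or NoRegister if
// there is none.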
2220 MCPhysReg SIRegisterInfo::get32BitRegister(MCPhysReg Reg) const {
2221   assert(getRegSizeInBits(*getPhysRegClass(Reg)) <= 32);
2222 
2223   for (const TargetRegisterClass &RC : { AMDGPU::VGPR_32RegClass,
2224                                          AMDGPU::SReg_32RegClass,
2225                                          AMDGPU::AGPR_32RegClass } ) {
2226     if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::lo16, &RC))
2227       return Super;
2228   }
2229   if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::hi16,
2230                                             &AMDGPU::VGPR_32RegClass)) {
    return Super;
2232   }
2233 
2234   return AMDGPU::NoRegister;
2235 }
2236 
2237 bool SIRegisterInfo::isConstantPhysReg(MCRegister PhysReg) const {
2238   switch (PhysReg) {
2239   case AMDGPU::SGPR_NULL:
2240   case AMDGPU::SRC_SHARED_BASE:
2241   case AMDGPU::SRC_PRIVATE_BASE:
2242   case AMDGPU::SRC_SHARED_LIMIT:
2243   case AMDGPU::SRC_PRIVATE_LIMIT:
2244     return true;
2245   default:
2246     return false;
2247   }
2248 }
2249 
2250 ArrayRef<MCPhysReg>
2251 SIRegisterInfo::getAllSGPR128(const MachineFunction &MF) const {
2252   return makeArrayRef(AMDGPU::SGPR_128RegClass.begin(),
2253                       ST.getMaxNumSGPRs(MF) / 4);
2254 }
2255 
2256 ArrayRef<MCPhysReg>
2257 SIRegisterInfo::getAllSGPR64(const MachineFunction &MF) const {
2258   return makeArrayRef(AMDGPU::SGPR_64RegClass.begin(),
2259                       ST.getMaxNumSGPRs(MF) / 2);
2260 }
2261 
2262 ArrayRef<MCPhysReg>
2263 SIRegisterInfo::getAllSGPR32(const MachineFunction &MF) const {
2264   return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(), ST.getMaxNumSGPRs(MF));
2265 }
2266