//===-- SIRegisterInfo.cpp - SI Register Information ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// SI implementation of the TargetRegisterInfo class.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPURegisterBankInfo.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUInstPrinter.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/LiveRegUnits.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"

using namespace llvm;

#define GET_REGINFO_TARGET_DESC
#include "AMDGPUGenRegisterInfo.inc"

static cl::opt<bool> EnableSpillSGPRToVGPR(
    "amdgpu-spill-sgpr-to-vgpr",
    cl::desc("Enable spilling SGPRs to VGPRs"),
    cl::ReallyHidden,
    cl::init(true));

std::array<std::vector<int16_t>, 16> SIRegisterInfo::RegSplitParts;
std::array<std::array<uint16_t, 32>, 9> SIRegisterInfo::SubRegFromChannelTable;

// Map numbers of DWORDs to indexes in SubRegFromChannelTable.
// Valid indexes are shifted 1, such that a 0 mapping means unsupported.
// e.g. for 8 DWORDs (256-bit), SubRegFromChannelTableWidthMap[8] = 8,
// meaning index 7 in SubRegFromChannelTable.
static const std::array<unsigned, 17> SubRegFromChannelTableWidthMap = {
    0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 0, 0, 0, 0, 0, 0, 9};

namespace llvm {

// A temporary struct to spill SGPRs.
// This is mostly to spill SGPRs to memory. Spilling SGPRs into VGPR lanes
// emits just v_writelane and v_readlane.
//
// When spilling to memory, the SGPRs are written into VGPR lanes and the VGPR
// is saved to scratch (or the other way around for loads).
// For this, a VGPR is required where the needed lanes can be clobbered. The
// RegScavenger can provide a VGPR where currently active lanes can be
// clobbered, but we still need to save inactive lanes.
// The high-level steps are:
// - Try to scavenge SGPR(s) to save exec
// - Try to scavenge VGPR
// - Save the needed lanes of TmpVGPR (all lanes, or only the inactive ones)
// - Spill/Restore SGPRs using TmpVGPR
// - Restore TmpVGPR
//
// To save all lanes of TmpVGPR, exec needs to be saved and modified. If we
// cannot scavenge temporary SGPRs to save exec, we use the following code:
//   buffer_store_dword TmpVGPR ; only if active lanes need to be saved
//   s_not exec, exec
//   buffer_store_dword TmpVGPR ; save inactive lanes
//   s_not exec, exec
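//
// A typical use (a sketch, not tied to any one caller): construct the builder
// for a spill pseudo, call prepare(), move the SGPRs through TmpVGPR lanes
// with v_writelane/v_readlane, store or reload TmpVGPR via readWriteTmpVGPR(),
// then call restore() to put TmpVGPR and exec back.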
struct SGPRSpillBuilder {
  struct PerVGPRData {
    unsigned PerVGPR;
    unsigned NumVGPRs;
    int64_t VGPRLanes;
  };

  // The SGPR to save
  Register SuperReg;
  MachineBasicBlock::iterator MI;
  ArrayRef<int16_t> SplitParts;
  unsigned NumSubRegs;
  bool IsKill;
  const DebugLoc &DL;

  /* When spilling to stack */
  // The SGPRs are written into this VGPR, which is then written to scratch
  // (or vice versa for loads).
  Register TmpVGPR = AMDGPU::NoRegister;
  // Temporary spill slot to save TmpVGPR to.
  int TmpVGPRIndex = 0;
  // If TmpVGPR is live before the spill or if it is scavenged.
  bool TmpVGPRLive = false;
  // Scavenged SGPR to save EXEC.
  Register SavedExecReg = AMDGPU::NoRegister;
  // Stack index to write the SGPRs to.
  int Index;
  unsigned EltSize = 4;

  RegScavenger *RS;
  MachineBasicBlock *MBB;
  MachineFunction &MF;
  SIMachineFunctionInfo &MFI;
  const SIInstrInfo &TII;
  const SIRegisterInfo &TRI;
  bool IsWave32;
  Register ExecReg;
  unsigned MovOpc;
  unsigned NotOpc;

  SGPRSpillBuilder(const SIRegisterInfo &TRI, const SIInstrInfo &TII,
                   bool IsWave32, MachineBasicBlock::iterator MI, int Index,
                   RegScavenger *RS)
      : SGPRSpillBuilder(TRI, TII, IsWave32, MI, MI->getOperand(0).getReg(),
                         MI->getOperand(0).isKill(), Index, RS) {}

  SGPRSpillBuilder(const SIRegisterInfo &TRI, const SIInstrInfo &TII,
                   bool IsWave32, MachineBasicBlock::iterator MI, Register Reg,
                   bool IsKill, int Index, RegScavenger *RS)
      : SuperReg(Reg), MI(MI), IsKill(IsKill), DL(MI->getDebugLoc()),
        Index(Index), RS(RS), MBB(MI->getParent()), MF(*MBB->getParent()),
        MFI(*MF.getInfo<SIMachineFunctionInfo>()), TII(TII), TRI(TRI),
        IsWave32(IsWave32) {
    const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(SuperReg);
    SplitParts = TRI.getRegSplitParts(RC, EltSize);
    NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();

    if (IsWave32) {
      ExecReg = AMDGPU::EXEC_LO;
      MovOpc = AMDGPU::S_MOV_B32;
      NotOpc = AMDGPU::S_NOT_B32;
    } else {
      ExecReg = AMDGPU::EXEC;
      MovOpc = AMDGPU::S_MOV_B64;
      NotOpc = AMDGPU::S_NOT_B64;
    }

    assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
    assert(SuperReg != AMDGPU::EXEC_LO && SuperReg != AMDGPU::EXEC_HI &&
           SuperReg != AMDGPU::EXEC && "exec should never spill");
  }

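  // For example (a sketch, wave64, spilling a 512-bit SGPR tuple = 16
  // subregs): PerVGPR = 64, NumVGPRs = (16 + 63) / 64 = 1, and VGPRLanes =
  // 0xffff, i.e. all 16 SGPRs fit into the low 16 lanes of a single VGPR.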
  PerVGPRData getPerVGPRData() {
    PerVGPRData Data;
    Data.PerVGPR = IsWave32 ? 32 : 64;
    Data.NumVGPRs = (NumSubRegs + (Data.PerVGPR - 1)) / Data.PerVGPR;
    Data.VGPRLanes = (1LL << std::min(Data.PerVGPR, NumSubRegs)) - 1LL;
    return Data;
  }

  // Tries to scavenge SGPRs to save EXEC and a VGPR. Uses v0 if no VGPR is
  // free.
  // Writes these instructions if an SGPR can be scavenged:
  //   s_mov_b64 s[6:7], exec   ; Save exec
  //   s_mov_b64 exec, 3        ; Wanted lanemask
  //   buffer_store_dword v1    ; Write scavenged VGPR to emergency slot
  //
  // Writes these instructions if no SGPR can be scavenged:
  //   buffer_store_dword v0    ; Only if no free VGPR was found
  //   s_not_b64 exec, exec
  //   buffer_store_dword v0    ; Save inactive lanes
  //                            ; exec stays inverted, it is flipped back in
  //                            ; restore.
  void prepare() {
    // Scavenged temporary VGPR to use. It must be scavenged once for any
    // number of spilled subregs.
    // FIXME: The liveness analysis is limited and does not tell if a register
    // is in use in lanes that are currently inactive. We can never be sure if
    // a register is actually in use in another lane, so we need to save all
    // used lanes of the chosen VGPR.
    assert(RS && "Cannot spill SGPR to memory without RegScavenger");
    TmpVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false,
                                            0, false);

    // Reserve temporary stack slot
    TmpVGPRIndex = MFI.getScavengeFI(MF.getFrameInfo(), TRI);
    if (TmpVGPR) {
      // Found a register that is dead in the currently active lanes, we only
      // need to spill inactive lanes.
      TmpVGPRLive = false;
    } else {
      // Pick v0 because it doesn't make a difference.
      TmpVGPR = AMDGPU::VGPR0;
      TmpVGPRLive = true;
    }

    if (TmpVGPRLive) {
      // We need to inform the scavenger that this index is already in use
      // until we're done with the custom emergency spill.
      RS->assignRegToScavengingIndex(TmpVGPRIndex, TmpVGPR);
    }

    // We may end up recursively calling the scavenger, and don't want to
    // re-use the same register.
    RS->setRegUsed(TmpVGPR);

    // Try to scavenge SGPRs to save exec
    assert(!SavedExecReg && "Exec is already saved, refuse to save again");
    const TargetRegisterClass &RC =
        IsWave32 ? AMDGPU::SGPR_32RegClass : AMDGPU::SGPR_64RegClass;
    RS->setRegUsed(SuperReg);
    SavedExecReg = RS->scavengeRegisterBackwards(RC, MI, false, 0, false);

    int64_t VGPRLanes = getPerVGPRData().VGPRLanes;

    if (SavedExecReg) {
      RS->setRegUsed(SavedExecReg);
      // Set exec to needed lanes
      BuildMI(*MBB, MI, DL, TII.get(MovOpc), SavedExecReg).addReg(ExecReg);
      auto I =
          BuildMI(*MBB, MI, DL, TII.get(MovOpc), ExecReg).addImm(VGPRLanes);
      if (!TmpVGPRLive)
        I.addReg(TmpVGPR, RegState::ImplicitDefine);
      // Spill needed lanes
      TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false);
    } else {
      // The modify and restore of exec clobber SCC, which we would have to
      // save and restore. FIXME: We probably would need to reserve a register
      // for this.
      if (RS->isRegUsed(AMDGPU::SCC))
        MI->emitError("unhandled SGPR spill to memory");

      // Spill active lanes
      if (TmpVGPRLive)
        TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false,
                                    /*IsKill*/ false);
      // Spill inactive lanes
      auto I = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
      if (!TmpVGPRLive)
        I.addReg(TmpVGPR, RegState::ImplicitDefine);
      I->getOperand(2).setIsDead(); // Mark SCC as dead.
      TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ false);
    }
  }

  // Writes these instructions if an SGPR can be scavenged:
  //   buffer_load_dword v1     ; Reload scavenged VGPR from emergency slot
  //   s_waitcnt vmcnt(0)       ; If a free VGPR was found
  //   s_mov_b64 exec, s[6:7]   ; Restore exec
  //
  // Writes these instructions if no SGPR can be scavenged:
  //   buffer_load_dword v0     ; Restore inactive lanes
  //   s_waitcnt vmcnt(0)       ; If a free VGPR was found
  //   s_not_b64 exec, exec
  //   buffer_load_dword v0     ; Only if no free VGPR was found
  void restore() {
    if (SavedExecReg) {
      // Restore used lanes
      TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true,
                                  /*IsKill*/ false);
      // Restore exec
      auto I = BuildMI(*MBB, MI, DL, TII.get(MovOpc), ExecReg)
                   .addReg(SavedExecReg, RegState::Kill);
      // Add an implicit use of the load so it is not dead.
      // FIXME: This inserts an unnecessary waitcnt
      if (!TmpVGPRLive) {
        I.addReg(TmpVGPR, RegState::ImplicitKill);
      }
    } else {
      // Restore inactive lanes
      TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true,
                                  /*IsKill*/ false);
      auto I = BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
      if (!TmpVGPRLive)
        I.addReg(TmpVGPR, RegState::ImplicitKill);
      I->getOperand(2).setIsDead(); // Mark SCC as dead.

      // Restore active lanes
      if (TmpVGPRLive)
        TRI.buildVGPRSpillLoadStore(*this, TmpVGPRIndex, 0, /*IsLoad*/ true);
    }

    // Inform the scavenger where we're releasing our custom scavenged
    // register.
    if (TmpVGPRLive) {
      MachineBasicBlock::iterator RestorePt = std::prev(MI);
      RS->assignRegToScavengingIndex(TmpVGPRIndex, TmpVGPR, &*RestorePt);
    }
  }

  // Write TmpVGPR to memory or read TmpVGPR from memory.
  // Either using a single buffer_load/store if exec is set to the needed mask
  // or using
  //   buffer_load
  //   s_not exec, exec
  //   buffer_load
  //   s_not exec, exec
  void readWriteTmpVGPR(unsigned Offset, bool IsLoad) {
    if (SavedExecReg) {
      // Spill needed lanes
      TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad);
    } else {
      // The modify and restore of exec clobber SCC, which we would have to
      // save and restore. FIXME: We probably would need to reserve a register
      // for this.
      if (RS->isRegUsed(AMDGPU::SCC))
        MI->emitError("unhandled SGPR spill to memory");

      // Spill active lanes
      TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad,
                                  /*IsKill*/ false);
      // Spill inactive lanes
      auto Not0 =
          BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
      Not0->getOperand(2).setIsDead(); // Mark SCC as dead.
      TRI.buildVGPRSpillLoadStore(*this, Index, Offset, IsLoad);
      auto Not1 =
          BuildMI(*MBB, MI, DL, TII.get(NotOpc), ExecReg).addReg(ExecReg);
      Not1->getOperand(2).setIsDead(); // Mark SCC as dead.
    }
  }

  void setMI(MachineBasicBlock *NewMBB, MachineBasicBlock::iterator NewMI) {
    assert(MBB->getParent() == &MF);
    MI = NewMI;
    MBB = NewMBB;
  }
};

} // namespace llvm

SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST)
    : AMDGPUGenRegisterInfo(AMDGPU::PC_REG, ST.getAMDGPUDwarfFlavour()), ST(ST),
      SpillSGPRToVGPR(EnableSpillSGPRToVGPR), isWave32(ST.isWave32()) {

  assert(getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() == 3 &&
         getSubRegIndexLaneMask(AMDGPU::sub31).getAsInteger() == (3ULL << 62) &&
         (getSubRegIndexLaneMask(AMDGPU::lo16) |
          getSubRegIndexLaneMask(AMDGPU::hi16)).getAsInteger() ==
             getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() &&
         "getNumCoveredRegs() will not work with generated subreg masks!");

  RegPressureIgnoredUnits.resize(getNumRegUnits());
  RegPressureIgnoredUnits.set(*regunits(MCRegister::from(AMDGPU::M0)).begin());
  for (auto Reg : AMDGPU::VGPR_16RegClass) {
    if (AMDGPU::isHi(Reg, *this))
      RegPressureIgnoredUnits.set(*regunits(Reg).begin());
  }

  // HACK: Until this is fully tablegen'd.
  static llvm::once_flag InitializeRegSplitPartsFlag;

  static auto InitializeRegSplitPartsOnce = [this]() {
    for (unsigned Idx = 1, E = getNumSubRegIndices() - 1; Idx < E; ++Idx) {
      unsigned Size = getSubRegIdxSize(Idx);
      if (Size & 31)
        continue;
      std::vector<int16_t> &Vec = RegSplitParts[Size / 32 - 1];
      unsigned Pos = getSubRegIdxOffset(Idx);
      if (Pos % Size)
        continue;
      Pos /= Size;
      if (Vec.empty()) {
        unsigned MaxNumParts = 1024 / Size; // Maximum register is 1024 bits.
        Vec.resize(MaxNumParts);
      }
      Vec[Pos] = Idx;
    }
  };
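  // After this runs, RegSplitParts[N - 1] holds the subreg indices that split
  // a register into N-DWORD pieces; e.g. (a sketch) RegSplitParts[1] would
  // begin {sub0_sub1, sub2_sub3, ...}.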

  static llvm::once_flag InitializeSubRegFromChannelTableFlag;

  static auto InitializeSubRegFromChannelTableOnce = [this]() {
    for (auto &Row : SubRegFromChannelTable)
      Row.fill(AMDGPU::NoSubRegister);
    for (unsigned Idx = 1; Idx < getNumSubRegIndices(); ++Idx) {
      unsigned Width = AMDGPUSubRegIdxRanges[Idx].Size / 32;
      unsigned Offset = AMDGPUSubRegIdxRanges[Idx].Offset / 32;
      assert(Width < SubRegFromChannelTableWidthMap.size());
      Width = SubRegFromChannelTableWidthMap[Width];
      if (Width == 0)
        continue;
      unsigned TableIdx = Width - 1;
      assert(TableIdx < SubRegFromChannelTable.size());
      assert(Offset < SubRegFromChannelTable[TableIdx].size());
      SubRegFromChannelTable[TableIdx][Offset] = Idx;
    }
  };

  llvm::call_once(InitializeRegSplitPartsFlag, InitializeRegSplitPartsOnce);
  llvm::call_once(InitializeSubRegFromChannelTableFlag,
                  InitializeSubRegFromChannelTableOnce);
}

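// Reserve Reg and everything that overlaps it; e.g. (a sketch) reserving
// VGPR0 also reserves the aliasing tuples VGPR0_VGPR1, VGPR0_VGPR1_VGPR2, and
// so on, because MCRegAliasIterator visits every overlapping register.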
void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved,
                                           MCRegister Reg) const {
  for (MCRegAliasIterator R(Reg, this, true); R.isValid(); ++R)
    Reserved.set(*R);
}

// Forced to be here by one .inc
const MCPhysReg *SIRegisterInfo::getCalleeSavedRegs(
    const MachineFunction *MF) const {
  CallingConv::ID CC = MF->getFunction().getCallingConv();
  switch (CC) {
  case CallingConv::C:
  case CallingConv::Fast:
  case CallingConv::Cold:
    return ST.hasGFX90AInsts() ? CSR_AMDGPU_GFX90AInsts_SaveList
                               : CSR_AMDGPU_SaveList;
  case CallingConv::AMDGPU_Gfx:
    return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_GFX90AInsts_SaveList
                               : CSR_AMDGPU_SI_Gfx_SaveList;
  case CallingConv::AMDGPU_CS_ChainPreserve:
    return CSR_AMDGPU_CS_ChainPreserve_SaveList;
  default: {
    // Dummy to not crash RegisterClassInfo.
    static const MCPhysReg NoCalleeSavedReg = AMDGPU::NoRegister;
    return &NoCalleeSavedReg;
  }
  }
}

const MCPhysReg *
SIRegisterInfo::getCalleeSavedRegsViaCopy(const MachineFunction *MF) const {
  return nullptr;
}

const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
                                                     CallingConv::ID CC) const {
  switch (CC) {
  case CallingConv::C:
  case CallingConv::Fast:
  case CallingConv::Cold:
    return ST.hasGFX90AInsts() ? CSR_AMDGPU_GFX90AInsts_RegMask
                               : CSR_AMDGPU_RegMask;
  case CallingConv::AMDGPU_Gfx:
    return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_GFX90AInsts_RegMask
                               : CSR_AMDGPU_SI_Gfx_RegMask;
  case CallingConv::AMDGPU_CS_Chain:
  case CallingConv::AMDGPU_CS_ChainPreserve:
    // Calls to these functions never return, so we can pretend everything is
    // preserved.
    return AMDGPU_AllVGPRs_RegMask;
  default:
    return nullptr;
  }
}

const uint32_t *SIRegisterInfo::getNoPreservedMask() const {
  return CSR_AMDGPU_NoRegs_RegMask;
}

bool SIRegisterInfo::isChainScratchRegister(Register VGPR) {
  return VGPR >= AMDGPU::VGPR0 && VGPR < AMDGPU::VGPR8;
}

const TargetRegisterClass *
SIRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC,
                                          const MachineFunction &MF) const {
  // FIXME: Should have a helper function like getEquivalentVGPRClass to get
  // the equivalent AV class. If one is used here, the verifier will crash
  // after RegBankSelect in the GISel flow. The aligned regclasses are not
  // fully given until instruction selection.
  if (ST.hasMAIInsts() && (isVGPRClass(RC) || isAGPRClass(RC))) {
    if (RC == &AMDGPU::VGPR_32RegClass || RC == &AMDGPU::AGPR_32RegClass)
      return &AMDGPU::AV_32RegClass;
    if (RC == &AMDGPU::VReg_64RegClass || RC == &AMDGPU::AReg_64RegClass)
      return &AMDGPU::AV_64RegClass;
    if (RC == &AMDGPU::VReg_64_Align2RegClass ||
        RC == &AMDGPU::AReg_64_Align2RegClass)
      return &AMDGPU::AV_64_Align2RegClass;
    if (RC == &AMDGPU::VReg_96RegClass || RC == &AMDGPU::AReg_96RegClass)
      return &AMDGPU::AV_96RegClass;
    if (RC == &AMDGPU::VReg_96_Align2RegClass ||
        RC == &AMDGPU::AReg_96_Align2RegClass)
      return &AMDGPU::AV_96_Align2RegClass;
    if (RC == &AMDGPU::VReg_128RegClass || RC == &AMDGPU::AReg_128RegClass)
      return &AMDGPU::AV_128RegClass;
    if (RC == &AMDGPU::VReg_128_Align2RegClass ||
        RC == &AMDGPU::AReg_128_Align2RegClass)
      return &AMDGPU::AV_128_Align2RegClass;
    if (RC == &AMDGPU::VReg_160RegClass || RC == &AMDGPU::AReg_160RegClass)
      return &AMDGPU::AV_160RegClass;
    if (RC == &AMDGPU::VReg_160_Align2RegClass ||
        RC == &AMDGPU::AReg_160_Align2RegClass)
      return &AMDGPU::AV_160_Align2RegClass;
    if (RC == &AMDGPU::VReg_192RegClass || RC == &AMDGPU::AReg_192RegClass)
      return &AMDGPU::AV_192RegClass;
    if (RC == &AMDGPU::VReg_192_Align2RegClass ||
        RC == &AMDGPU::AReg_192_Align2RegClass)
      return &AMDGPU::AV_192_Align2RegClass;
    if (RC == &AMDGPU::VReg_256RegClass || RC == &AMDGPU::AReg_256RegClass)
      return &AMDGPU::AV_256RegClass;
    if (RC == &AMDGPU::VReg_256_Align2RegClass ||
        RC == &AMDGPU::AReg_256_Align2RegClass)
      return &AMDGPU::AV_256_Align2RegClass;
    if (RC == &AMDGPU::VReg_512RegClass || RC == &AMDGPU::AReg_512RegClass)
      return &AMDGPU::AV_512RegClass;
    if (RC == &AMDGPU::VReg_512_Align2RegClass ||
        RC == &AMDGPU::AReg_512_Align2RegClass)
      return &AMDGPU::AV_512_Align2RegClass;
    if (RC == &AMDGPU::VReg_1024RegClass || RC == &AMDGPU::AReg_1024RegClass)
      return &AMDGPU::AV_1024RegClass;
    if (RC == &AMDGPU::VReg_1024_Align2RegClass ||
        RC == &AMDGPU::AReg_1024_Align2RegClass)
      return &AMDGPU::AV_1024_Align2RegClass;
  }

  return TargetRegisterInfo::getLargestLegalSuperClass(RC, MF);
}

Register SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
  const SIFrameLowering *TFI = ST.getFrameLowering();
  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
  // During ISel lowering we always reserve the stack pointer in entry and
  // chain functions, but never actually want to reference it when accessing
  // our own frame. If we need a frame pointer we use it, but otherwise we can
  // just use an immediate "0" which we represent by returning NoRegister.
  if (FuncInfo->isBottomOfStack()) {
    return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg() : Register();
  }
  return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg()
                        : FuncInfo->getStackPtrOffsetReg();
}

bool SIRegisterInfo::hasBasePointer(const MachineFunction &MF) const {
  // When we need stack realignment, we can't reference off of the
  // stack pointer, so we reserve a base pointer.
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  return MFI.getNumFixedObjects() && shouldRealignStack(MF);
}

Register SIRegisterInfo::getBaseRegister() const { return AMDGPU::SGPR34; }

const uint32_t *SIRegisterInfo::getAllVGPRRegMask() const {
  return AMDGPU_AllVGPRs_RegMask;
}

const uint32_t *SIRegisterInfo::getAllAGPRRegMask() const {
  return AMDGPU_AllAGPRs_RegMask;
}

const uint32_t *SIRegisterInfo::getAllVectorRegMask() const {
  return AMDGPU_AllVectorRegs_RegMask;
}

const uint32_t *SIRegisterInfo::getAllAllocatableSRegMask() const {
  return AMDGPU_AllAllocatableSRegs_RegMask;
}

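// e.g. (a sketch) getSubRegFromChannel(/*Channel=*/2, /*NumRegs=*/2) looks up
// the two-DWORD row of the table and returns the subreg index covering
// channels 2-3 (sub2_sub3).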
unsigned SIRegisterInfo::getSubRegFromChannel(unsigned Channel,
                                              unsigned NumRegs) {
  assert(NumRegs < SubRegFromChannelTableWidthMap.size());
  unsigned NumRegIndex = SubRegFromChannelTableWidthMap[NumRegs];
  assert(NumRegIndex && "Not implemented");
  assert(Channel < SubRegFromChannelTable[NumRegIndex - 1].size());
  return SubRegFromChannelTable[NumRegIndex - 1][Channel];
}

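// e.g. (a sketch) with MaxNumSGPRs = 102 and Align = 4: BaseIdx =
// alignDown(102, 4) - 4 = 96, so for an SGPR_128 class this returns the
// aligned tuple starting at SGPR96 (s[96:99]).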
MCRegister
SIRegisterInfo::getAlignedHighSGPRForRC(const MachineFunction &MF,
                                        const unsigned Align,
                                        const TargetRegisterClass *RC) const {
  unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), Align) - Align;
  MCRegister BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
  return getMatchingSuperReg(BaseReg, AMDGPU::sub0, RC);
}

MCRegister SIRegisterInfo::reservedPrivateSegmentBufferReg(
    const MachineFunction &MF) const {
  return getAlignedHighSGPRForRC(MF, /*Align=*/4, &AMDGPU::SGPR_128RegClass);
}

BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
  BitVector Reserved(getNumRegs());
  Reserved.set(AMDGPU::MODE);

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  // Reserve special purpose registers.
  //
  // EXEC_LO and EXEC_HI could be allocated and used as regular registers, but
  // this seems likely to result in bugs, so I'm marking them as reserved.
  reserveRegisterTuples(Reserved, AMDGPU::EXEC);
  reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR);

  // M0 has to be reserved so that llvm accepts it as a live-in into a block.
  reserveRegisterTuples(Reserved, AMDGPU::M0);

  // Reserve src_vccz, src_execz, src_scc.
  reserveRegisterTuples(Reserved, AMDGPU::SRC_VCCZ);
  reserveRegisterTuples(Reserved, AMDGPU::SRC_EXECZ);
  reserveRegisterTuples(Reserved, AMDGPU::SRC_SCC);

  // Reserve the memory aperture registers
  reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_BASE);
  reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_LIMIT);
  reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_BASE);
  reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_LIMIT);

  // Reserve src_pops_exiting_wave_id - support is not implemented in Codegen.
  reserveRegisterTuples(Reserved, AMDGPU::SRC_POPS_EXITING_WAVE_ID);

  // Reserve xnack_mask registers - support is not implemented in Codegen.
  reserveRegisterTuples(Reserved, AMDGPU::XNACK_MASK);

  // Reserve lds_direct register - support is not implemented in Codegen.
  reserveRegisterTuples(Reserved, AMDGPU::LDS_DIRECT);

  // Reserve Trap Handler registers - support is not implemented in Codegen.
  reserveRegisterTuples(Reserved, AMDGPU::TBA);
  reserveRegisterTuples(Reserved, AMDGPU::TMA);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP0_TTMP1);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP2_TTMP3);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP4_TTMP5);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP6_TTMP7);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP8_TTMP9);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP10_TTMP11);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP12_TTMP13);
  reserveRegisterTuples(Reserved, AMDGPU::TTMP14_TTMP15);

  // Reserve null register - it shall never be allocated
  reserveRegisterTuples(Reserved, AMDGPU::SGPR_NULL64);

  // Disallow vcc_hi allocation in wave32. It may be allocated but most likely
  // will result in bugs.
  if (isWave32) {
    Reserved.set(AMDGPU::VCC);
    Reserved.set(AMDGPU::VCC_HI);
  }

  // Reserve SGPRs.
  //
  unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
  unsigned TotalNumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
  for (unsigned i = MaxNumSGPRs; i < TotalNumSGPRs; ++i) {
    unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i);
    reserveRegisterTuples(Reserved, Reg);
  }

  Register ScratchRSrcReg = MFI->getScratchRSrcReg();
  if (ScratchRSrcReg != AMDGPU::NoRegister) {
    // Reserve 4 SGPRs for the scratch buffer resource descriptor in case we
    // need to spill.
    // TODO: May need to reserve a VGPR if doing LDS spilling.
    reserveRegisterTuples(Reserved, ScratchRSrcReg);
  }

  Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
  if (LongBranchReservedReg)
    reserveRegisterTuples(Reserved, LongBranchReservedReg);

  // We have to assume the SP is needed in case there are calls in the
  // function, which is detected after the function is lowered. If we aren't
  // really going to need SP, don't bother reserving it.
  MCRegister StackPtrReg = MFI->getStackPtrOffsetReg();
  if (StackPtrReg) {
    reserveRegisterTuples(Reserved, StackPtrReg);
    assert(!isSubRegister(ScratchRSrcReg, StackPtrReg));
  }

  MCRegister FrameReg = MFI->getFrameOffsetReg();
  if (FrameReg) {
    reserveRegisterTuples(Reserved, FrameReg);
    assert(!isSubRegister(ScratchRSrcReg, FrameReg));
  }

  if (hasBasePointer(MF)) {
    MCRegister BasePtrReg = getBaseRegister();
    reserveRegisterTuples(Reserved, BasePtrReg);
    assert(!isSubRegister(ScratchRSrcReg, BasePtrReg));
  }

  // FIXME: Use same reserved register introduced in D149775
  // SGPR used to preserve EXEC MASK around WWM spill/copy instructions.
  Register ExecCopyReg = MFI->getSGPRForEXECCopy();
  if (ExecCopyReg)
    reserveRegisterTuples(Reserved, ExecCopyReg);

  // Reserve VGPRs/AGPRs.
  //
  unsigned MaxNumVGPRs = ST.getMaxNumVGPRs(MF);
  unsigned MaxNumAGPRs = MaxNumVGPRs;
  unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();

  // On GFX90A, the number of VGPRs and AGPRs need not be equal. Theoretically,
  // a wave may have up to 512 total vector registers combining together both
  // VGPRs and AGPRs. Hence, in an entry function without calls and without
  // AGPRs used within it, it is possible to use the whole vector register
  // budget for VGPRs.
  //
  // TODO: it shall be possible to estimate maximum AGPR/VGPR pressure and
  // split the register file accordingly.
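  //
  // e.g. (a sketch) with a combined budget of 128 registers: a GFX90A
  // function that uses AGPRs ends up with 64 VGPRs + 64 AGPRs below, while
  // one that does not gets all 128 as VGPRs and no AGPRs.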
  if (ST.hasGFX90AInsts()) {
    if (MFI->usesAGPRs(MF)) {
      MaxNumVGPRs /= 2;
      MaxNumAGPRs = MaxNumVGPRs;
    } else {
      if (MaxNumVGPRs > TotalNumVGPRs) {
        MaxNumAGPRs = MaxNumVGPRs - TotalNumVGPRs;
        MaxNumVGPRs = TotalNumVGPRs;
      } else
        MaxNumAGPRs = 0;
    }
  }

  for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i) {
    unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i);
    reserveRegisterTuples(Reserved, Reg);
  }

  if (ST.hasMAIInsts()) {
    for (unsigned i = MaxNumAGPRs; i < TotalNumVGPRs; ++i) {
      unsigned Reg = AMDGPU::AGPR_32RegClass.getRegister(i);
      reserveRegisterTuples(Reserved, Reg);
    }
  } else {
    // Reserve all the AGPRs if there are no instructions to use them.
    for (MCRegister Reg : AMDGPU::AGPR_32RegClass)
      reserveRegisterTuples(Reserved, Reg);
  }

  // On GFX908, in order to guarantee copying between AGPRs, we need a scratch
  // VGPR available at all times.
  if (ST.hasMAIInsts() && !ST.hasGFX90AInsts()) {
    reserveRegisterTuples(Reserved, MFI->getVGPRForAGPRCopy());
  }

  for (Register Reg : MFI->getWWMReservedRegs())
    reserveRegisterTuples(Reserved, Reg);

  // FIXME: Stop using reserved registers for this.
  for (MCPhysReg Reg : MFI->getAGPRSpillVGPRs())
    reserveRegisterTuples(Reserved, Reg);

  for (MCPhysReg Reg : MFI->getVGPRSpillAGPRs())
    reserveRegisterTuples(Reserved, Reg);

  return Reserved;
}

bool SIRegisterInfo::isAsmClobberable(const MachineFunction &MF,
                                      MCRegister PhysReg) const {
  return !MF.getRegInfo().isReserved(PhysReg);
}

bool SIRegisterInfo::shouldRealignStack(const MachineFunction &MF) const {
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  // On entry or in chain functions, the base address is 0, so it can't
  // possibly need any more alignment.

  // FIXME: Should be able to specify the entry frame alignment per calling
  // convention instead.
  if (Info->isBottomOfStack())
    return false;

  return TargetRegisterInfo::shouldRealignStack(MF);
}

bool SIRegisterInfo::requiresRegisterScavenging(
    const MachineFunction &Fn) const {
  const SIMachineFunctionInfo *Info = Fn.getInfo<SIMachineFunctionInfo>();
  if (Info->isEntryFunction()) {
    const MachineFrameInfo &MFI = Fn.getFrameInfo();
    return MFI.hasStackObjects() || MFI.hasCalls();
  }

  // May need scavenger for dealing with callee saved registers.
  return true;
}

bool SIRegisterInfo::requiresFrameIndexScavenging(
    const MachineFunction &MF) const {
  // Do not use frame virtual registers. They used to be used for SGPRs, but
  // once we reach PrologEpilogInserter, we can no longer spill SGPRs. If the
  // scavenger fails, we can increment/decrement the necessary SGPRs to avoid
  // a spill.
  return false;
}

bool SIRegisterInfo::requiresFrameIndexReplacementScavenging(
    const MachineFunction &MF) const {
  const MachineFrameInfo &MFI = MF.getFrameInfo();
  return MFI.hasStackObjects();
}

bool SIRegisterInfo::requiresVirtualBaseRegisters(
    const MachineFunction &) const {
  // There are no special dedicated stack or frame pointers.
  return true;
}

int64_t SIRegisterInfo::getScratchInstrOffset(const MachineInstr *MI) const {
  assert(SIInstrInfo::isMUBUF(*MI) || SIInstrInfo::isFLATScratch(*MI));

  int OffIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                          AMDGPU::OpName::offset);
  return MI->getOperand(OffIdx).getImm();
}

int64_t SIRegisterInfo::getFrameIndexInstrOffset(const MachineInstr *MI,
                                                 int Idx) const {
  if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
    return 0;

  assert((Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                            AMDGPU::OpName::vaddr) ||
          (Idx == AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                             AMDGPU::OpName::saddr))) &&
         "Should never see frame index on non-address operand");

  return getScratchInstrOffset(MI);
}

bool SIRegisterInfo::needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
  if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
    return false;

  int64_t FullOffset = Offset + getScratchInstrOffset(MI);

  const SIInstrInfo *TII = ST.getInstrInfo();
  if (SIInstrInfo::isMUBUF(*MI))
    return !TII->isLegalMUBUFImmOffset(FullOffset);

  return !TII->isLegalFLATOffset(FullOffset, AMDGPUAS::PRIVATE_ADDRESS,
                                 SIInstrFlags::FlatScratch);
}

Register SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
                                                      int FrameIdx,
                                                      int64_t Offset) const {
  MachineBasicBlock::iterator Ins = MBB->begin();
  DebugLoc DL; // Defaults to "unknown"

  if (Ins != MBB->end())
    DL = Ins->getDebugLoc();

  MachineFunction *MF = MBB->getParent();
  const SIInstrInfo *TII = ST.getInstrInfo();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  unsigned MovOpc = ST.enableFlatScratch() ? AMDGPU::S_MOV_B32
                                           : AMDGPU::V_MOV_B32_e32;

  Register BaseReg = MRI.createVirtualRegister(
      ST.enableFlatScratch() ? &AMDGPU::SReg_32_XEXEC_HIRegClass
                             : &AMDGPU::VGPR_32RegClass);

  if (Offset == 0) {
    BuildMI(*MBB, Ins, DL, TII->get(MovOpc), BaseReg)
        .addFrameIndex(FrameIdx);
    return BaseReg;
  }

  Register OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  Register FIReg = MRI.createVirtualRegister(
      ST.enableFlatScratch() ? &AMDGPU::SReg_32_XM0RegClass
                             : &AMDGPU::VGPR_32RegClass);

  BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
      .addImm(Offset);
  BuildMI(*MBB, Ins, DL, TII->get(MovOpc), FIReg)
      .addFrameIndex(FrameIdx);

  if (ST.enableFlatScratch()) {
    BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_ADD_I32), BaseReg)
        .addReg(OffsetReg, RegState::Kill)
        .addReg(FIReg);
    return BaseReg;
  }

  TII->getAddNoCarry(*MBB, Ins, DL, BaseReg)
      .addReg(OffsetReg, RegState::Kill)
      .addReg(FIReg)
      .addImm(0); // clamp bit

  return BaseReg;
}

void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg,
                                       int64_t Offset) const {
  const SIInstrInfo *TII = ST.getInstrInfo();
  bool IsFlat = TII->isFLATScratch(MI);

#ifndef NDEBUG
  // FIXME: Is it possible to be storing a frame index to itself?
  bool SeenFI = false;
  for (const MachineOperand &MO: MI.operands()) {
    if (MO.isFI()) {
      if (SeenFI)
        llvm_unreachable("should not see multiple frame indices");

      SeenFI = true;
    }
  }
#endif

  MachineOperand *FIOp =
      TII->getNamedOperand(MI, IsFlat ? AMDGPU::OpName::saddr
                                      : AMDGPU::OpName::vaddr);

  MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset);
  int64_t NewOffset = OffsetOp->getImm() + Offset;

  assert(FIOp && FIOp->isFI() && "frame index must be address operand");
  assert(TII->isMUBUF(MI) || TII->isFLATScratch(MI));

  if (IsFlat) {
    assert(TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
                                  SIInstrFlags::FlatScratch) &&
           "offset should be legal");
    FIOp->ChangeToRegister(BaseReg, false);
    OffsetOp->setImm(NewOffset);
    return;
  }

#ifndef NDEBUG
  MachineOperand *SOffset = TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
  assert(SOffset->isImm() && SOffset->getImm() == 0);
#endif

  assert(TII->isLegalMUBUFImmOffset(NewOffset) && "offset should be legal");

  FIOp->ChangeToRegister(BaseReg, false);
  OffsetOp->setImm(NewOffset);
}

bool SIRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
                                        Register BaseReg,
                                        int64_t Offset) const {
  if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isFLATScratch(*MI))
    return false;

  int64_t NewOffset = Offset + getScratchInstrOffset(MI);

  const SIInstrInfo *TII = ST.getInstrInfo();
  if (SIInstrInfo::isMUBUF(*MI))
    return TII->isLegalMUBUFImmOffset(NewOffset);

  return TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
                                SIInstrFlags::FlatScratch);
}

const TargetRegisterClass *SIRegisterInfo::getPointerRegClass(
    const MachineFunction &MF, unsigned Kind) const {
  // This is inaccurate. It depends on the instruction and address space. The
  // only place where we should hit this is for dealing with frame indexes /
  // private accesses, so this is correct in that case.
  return &AMDGPU::VGPR_32RegClass;
}

const TargetRegisterClass *
SIRegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const {
  if (isAGPRClass(RC) && !ST.hasGFX90AInsts())
    return getEquivalentVGPRClass(RC);
  if (RC == &AMDGPU::SCC_CLASSRegClass)
    return getWaveMaskRegClass();

  return RC;
}

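// Number of 32-bit subregisters a spill pseudo moves, i.e. the spilled
// register's bit width divided by 32; e.g. SI_SPILL_S256_SAVE covers 8
// DWORDs.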
static unsigned getNumSubRegsForSpillOp(unsigned Op) {
  switch (Op) {
  case AMDGPU::SI_SPILL_S1024_SAVE:
  case AMDGPU::SI_SPILL_S1024_RESTORE:
  case AMDGPU::SI_SPILL_V1024_SAVE:
  case AMDGPU::SI_SPILL_V1024_RESTORE:
  case AMDGPU::SI_SPILL_A1024_SAVE:
  case AMDGPU::SI_SPILL_A1024_RESTORE:
  case AMDGPU::SI_SPILL_AV1024_SAVE:
  case AMDGPU::SI_SPILL_AV1024_RESTORE:
    return 32;
  case AMDGPU::SI_SPILL_S512_SAVE:
  case AMDGPU::SI_SPILL_S512_RESTORE:
  case AMDGPU::SI_SPILL_V512_SAVE:
  case AMDGPU::SI_SPILL_V512_RESTORE:
  case AMDGPU::SI_SPILL_A512_SAVE:
  case AMDGPU::SI_SPILL_A512_RESTORE:
  case AMDGPU::SI_SPILL_AV512_SAVE:
  case AMDGPU::SI_SPILL_AV512_RESTORE:
    return 16;
  case AMDGPU::SI_SPILL_S384_SAVE:
  case AMDGPU::SI_SPILL_S384_RESTORE:
  case AMDGPU::SI_SPILL_V384_SAVE:
  case AMDGPU::SI_SPILL_V384_RESTORE:
  case AMDGPU::SI_SPILL_A384_SAVE:
  case AMDGPU::SI_SPILL_A384_RESTORE:
  case AMDGPU::SI_SPILL_AV384_SAVE:
  case AMDGPU::SI_SPILL_AV384_RESTORE:
    return 12;
  case AMDGPU::SI_SPILL_S352_SAVE:
  case AMDGPU::SI_SPILL_S352_RESTORE:
  case AMDGPU::SI_SPILL_V352_SAVE:
  case AMDGPU::SI_SPILL_V352_RESTORE:
  case AMDGPU::SI_SPILL_A352_SAVE:
  case AMDGPU::SI_SPILL_A352_RESTORE:
  case AMDGPU::SI_SPILL_AV352_SAVE:
  case AMDGPU::SI_SPILL_AV352_RESTORE:
    return 11;
  case AMDGPU::SI_SPILL_S320_SAVE:
  case AMDGPU::SI_SPILL_S320_RESTORE:
  case AMDGPU::SI_SPILL_V320_SAVE:
  case AMDGPU::SI_SPILL_V320_RESTORE:
  case AMDGPU::SI_SPILL_A320_SAVE:
  case AMDGPU::SI_SPILL_A320_RESTORE:
  case AMDGPU::SI_SPILL_AV320_SAVE:
  case AMDGPU::SI_SPILL_AV320_RESTORE:
    return 10;
  case AMDGPU::SI_SPILL_S288_SAVE:
  case AMDGPU::SI_SPILL_S288_RESTORE:
  case AMDGPU::SI_SPILL_V288_SAVE:
  case AMDGPU::SI_SPILL_V288_RESTORE:
  case AMDGPU::SI_SPILL_A288_SAVE:
  case AMDGPU::SI_SPILL_A288_RESTORE:
  case AMDGPU::SI_SPILL_AV288_SAVE:
  case AMDGPU::SI_SPILL_AV288_RESTORE:
    return 9;
  case AMDGPU::SI_SPILL_S256_SAVE:
  case AMDGPU::SI_SPILL_S256_RESTORE:
  case AMDGPU::SI_SPILL_V256_SAVE:
  case AMDGPU::SI_SPILL_V256_RESTORE:
  case AMDGPU::SI_SPILL_A256_SAVE:
  case AMDGPU::SI_SPILL_A256_RESTORE:
  case AMDGPU::SI_SPILL_AV256_SAVE:
  case AMDGPU::SI_SPILL_AV256_RESTORE:
    return 8;
  case AMDGPU::SI_SPILL_S224_SAVE:
  case AMDGPU::SI_SPILL_S224_RESTORE:
  case AMDGPU::SI_SPILL_V224_SAVE:
  case AMDGPU::SI_SPILL_V224_RESTORE:
  case AMDGPU::SI_SPILL_A224_SAVE:
  case AMDGPU::SI_SPILL_A224_RESTORE:
  case AMDGPU::SI_SPILL_AV224_SAVE:
  case AMDGPU::SI_SPILL_AV224_RESTORE:
    return 7;
  case AMDGPU::SI_SPILL_S192_SAVE:
  case AMDGPU::SI_SPILL_S192_RESTORE:
  case AMDGPU::SI_SPILL_V192_SAVE:
  case AMDGPU::SI_SPILL_V192_RESTORE:
  case AMDGPU::SI_SPILL_A192_SAVE:
  case AMDGPU::SI_SPILL_A192_RESTORE:
  case AMDGPU::SI_SPILL_AV192_SAVE:
  case AMDGPU::SI_SPILL_AV192_RESTORE:
    return 6;
  case AMDGPU::SI_SPILL_S160_SAVE:
  case AMDGPU::SI_SPILL_S160_RESTORE:
  case AMDGPU::SI_SPILL_V160_SAVE:
  case AMDGPU::SI_SPILL_V160_RESTORE:
  case AMDGPU::SI_SPILL_A160_SAVE:
  case AMDGPU::SI_SPILL_A160_RESTORE:
  case AMDGPU::SI_SPILL_AV160_SAVE:
  case AMDGPU::SI_SPILL_AV160_RESTORE:
    return 5;
  case AMDGPU::SI_SPILL_S128_SAVE:
  case AMDGPU::SI_SPILL_S128_RESTORE:
  case AMDGPU::SI_SPILL_V128_SAVE:
  case AMDGPU::SI_SPILL_V128_RESTORE:
  case AMDGPU::SI_SPILL_A128_SAVE:
  case AMDGPU::SI_SPILL_A128_RESTORE:
  case AMDGPU::SI_SPILL_AV128_SAVE:
  case AMDGPU::SI_SPILL_AV128_RESTORE:
    return 4;
  case AMDGPU::SI_SPILL_S96_SAVE:
  case AMDGPU::SI_SPILL_S96_RESTORE:
  case AMDGPU::SI_SPILL_V96_SAVE:
  case AMDGPU::SI_SPILL_V96_RESTORE:
  case AMDGPU::SI_SPILL_A96_SAVE:
  case AMDGPU::SI_SPILL_A96_RESTORE:
  case AMDGPU::SI_SPILL_AV96_SAVE:
  case AMDGPU::SI_SPILL_AV96_RESTORE:
    return 3;
  case AMDGPU::SI_SPILL_S64_SAVE:
  case AMDGPU::SI_SPILL_S64_RESTORE:
  case AMDGPU::SI_SPILL_V64_SAVE:
  case AMDGPU::SI_SPILL_V64_RESTORE:
  case AMDGPU::SI_SPILL_A64_SAVE:
  case AMDGPU::SI_SPILL_A64_RESTORE:
  case AMDGPU::SI_SPILL_AV64_SAVE:
  case AMDGPU::SI_SPILL_AV64_RESTORE:
    return 2;
  case AMDGPU::SI_SPILL_S32_SAVE:
  case AMDGPU::SI_SPILL_S32_RESTORE:
  case AMDGPU::SI_SPILL_V32_SAVE:
  case AMDGPU::SI_SPILL_V32_RESTORE:
  case AMDGPU::SI_SPILL_A32_SAVE:
  case AMDGPU::SI_SPILL_A32_RESTORE:
  case AMDGPU::SI_SPILL_AV32_SAVE:
  case AMDGPU::SI_SPILL_AV32_RESTORE:
  case AMDGPU::SI_SPILL_WWM_V32_SAVE:
  case AMDGPU::SI_SPILL_WWM_V32_RESTORE:
  case AMDGPU::SI_SPILL_WWM_AV32_SAVE:
  case AMDGPU::SI_SPILL_WWM_AV32_RESTORE:
    return 1;
  default: llvm_unreachable("Invalid spill opcode");
  }
}

static int getOffsetMUBUFStore(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
    return AMDGPU::BUFFER_STORE_DWORD_OFFSET;
  case AMDGPU::BUFFER_STORE_BYTE_OFFEN:
    return AMDGPU::BUFFER_STORE_BYTE_OFFSET;
  case AMDGPU::BUFFER_STORE_SHORT_OFFEN:
    return AMDGPU::BUFFER_STORE_SHORT_OFFSET;
  case AMDGPU::BUFFER_STORE_DWORDX2_OFFEN:
    return AMDGPU::BUFFER_STORE_DWORDX2_OFFSET;
  case AMDGPU::BUFFER_STORE_DWORDX3_OFFEN:
    return AMDGPU::BUFFER_STORE_DWORDX3_OFFSET;
  case AMDGPU::BUFFER_STORE_DWORDX4_OFFEN:
    return AMDGPU::BUFFER_STORE_DWORDX4_OFFSET;
  case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN:
    return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET;
  case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN:
    return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET;
  default:
    return -1;
  }
}

static int getOffsetMUBUFLoad(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
    return AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
  case AMDGPU::BUFFER_LOAD_UBYTE_OFFEN:
    return AMDGPU::BUFFER_LOAD_UBYTE_OFFSET;
  case AMDGPU::BUFFER_LOAD_SBYTE_OFFEN:
    return AMDGPU::BUFFER_LOAD_SBYTE_OFFSET;
  case AMDGPU::BUFFER_LOAD_USHORT_OFFEN:
    return AMDGPU::BUFFER_LOAD_USHORT_OFFSET;
  case AMDGPU::BUFFER_LOAD_SSHORT_OFFEN:
    return AMDGPU::BUFFER_LOAD_SSHORT_OFFSET;
  case AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN:
    return AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET;
  case AMDGPU::BUFFER_LOAD_DWORDX3_OFFEN:
    return AMDGPU::BUFFER_LOAD_DWORDX3_OFFSET;
  case AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN:
    return AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET;
  case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN:
    return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET;
  case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN:
    return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET;
  case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN:
    return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET;
  case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN:
    return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET;
  case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN:
    return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET;
  case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN:
    return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET;
  default:
    return -1;
  }
}

static int getOffenMUBUFStore(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
    return AMDGPU::BUFFER_STORE_DWORD_OFFEN;
  case AMDGPU::BUFFER_STORE_BYTE_OFFSET:
    return AMDGPU::BUFFER_STORE_BYTE_OFFEN;
  case AMDGPU::BUFFER_STORE_SHORT_OFFSET:
    return AMDGPU::BUFFER_STORE_SHORT_OFFEN;
  case AMDGPU::BUFFER_STORE_DWORDX2_OFFSET:
    return AMDGPU::BUFFER_STORE_DWORDX2_OFFEN;
  case AMDGPU::BUFFER_STORE_DWORDX3_OFFSET:
    return AMDGPU::BUFFER_STORE_DWORDX3_OFFEN;
  case AMDGPU::BUFFER_STORE_DWORDX4_OFFSET:
    return AMDGPU::BUFFER_STORE_DWORDX4_OFFEN;
  case AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFSET:
    return AMDGPU::BUFFER_STORE_SHORT_D16_HI_OFFEN;
  case AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFSET:
    return AMDGPU::BUFFER_STORE_BYTE_D16_HI_OFFEN;
  default:
    return -1;
  }
}

static int getOffenMUBUFLoad(unsigned Opc) {
  switch (Opc) {
  case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
    return AMDGPU::BUFFER_LOAD_DWORD_OFFEN;
  case AMDGPU::BUFFER_LOAD_UBYTE_OFFSET:
    return AMDGPU::BUFFER_LOAD_UBYTE_OFFEN;
  case AMDGPU::BUFFER_LOAD_SBYTE_OFFSET:
    return AMDGPU::BUFFER_LOAD_SBYTE_OFFEN;
  case AMDGPU::BUFFER_LOAD_USHORT_OFFSET:
    return AMDGPU::BUFFER_LOAD_USHORT_OFFEN;
  case AMDGPU::BUFFER_LOAD_SSHORT_OFFSET:
    return AMDGPU::BUFFER_LOAD_SSHORT_OFFEN;
  case AMDGPU::BUFFER_LOAD_DWORDX2_OFFSET:
    return AMDGPU::BUFFER_LOAD_DWORDX2_OFFEN;
  case AMDGPU::BUFFER_LOAD_DWORDX3_OFFSET:
    return AMDGPU::BUFFER_LOAD_DWORDX3_OFFEN;
  case AMDGPU::BUFFER_LOAD_DWORDX4_OFFSET:
    return AMDGPU::BUFFER_LOAD_DWORDX4_OFFEN;
  case AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFSET:
    return AMDGPU::BUFFER_LOAD_UBYTE_D16_OFFEN;
  case AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFSET:
    return AMDGPU::BUFFER_LOAD_UBYTE_D16_HI_OFFEN;
  case AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFSET:
    return AMDGPU::BUFFER_LOAD_SBYTE_D16_OFFEN;
  case AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFSET:
    return AMDGPU::BUFFER_LOAD_SBYTE_D16_HI_OFFEN;
  case AMDGPU::BUFFER_LOAD_SHORT_D16_OFFSET:
    return AMDGPU::BUFFER_LOAD_SHORT_D16_OFFEN;
  case AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFSET:
    return AMDGPU::BUFFER_LOAD_SHORT_D16_HI_OFFEN;
  default:
    return -1;
  }
}

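// e.g. (a sketch) when frame index FI has an AGPR assigned for lane 0,
// spilling v0 to FI becomes a "v_accvgpr_write_b32 a<n>, v0" instead of a
// memory access; the restore direction uses v_accvgpr_read_b32.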
static MachineInstrBuilder spillVGPRtoAGPR(const GCNSubtarget &ST,
                                           MachineBasicBlock &MBB,
                                           MachineBasicBlock::iterator MI,
                                           int Index, unsigned Lane,
                                           unsigned ValueReg, bool IsKill) {
  MachineFunction *MF = MBB.getParent();
  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
  const SIInstrInfo *TII = ST.getInstrInfo();

  MCPhysReg Reg = MFI->getVGPRToAGPRSpill(Index, Lane);

  if (Reg == AMDGPU::NoRegister)
    return MachineInstrBuilder();

  bool IsStore = MI->mayStore();
  MachineRegisterInfo &MRI = MF->getRegInfo();
  auto *TRI = static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());

  unsigned Dst = IsStore ? Reg : ValueReg;
  unsigned Src = IsStore ? ValueReg : Reg;
  bool IsVGPR = TRI->isVGPR(MRI, Reg);
  DebugLoc DL = MI->getDebugLoc();
  if (IsVGPR == TRI->isVGPR(MRI, ValueReg)) {
    // The spiller may restore a spilled register to its superclass during
    // regalloc. That can result in AGPR spills restored to VGPRs or the other
    // way around, leaving src and dst with identical register classes at this
    // point; a plain copy is all that is needed in such cases.
    auto CopyMIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::COPY), Dst)
                       .addReg(Src, getKillRegState(IsKill));
    CopyMIB->setAsmPrinterFlag(MachineInstr::ReloadReuse);
    return CopyMIB;
  }
  unsigned Opc = (IsStore ^ IsVGPR) ? AMDGPU::V_ACCVGPR_WRITE_B32_e64
                                    : AMDGPU::V_ACCVGPR_READ_B32_e64;

  auto MIB = BuildMI(MBB, MI, DL, TII->get(Opc), Dst)
                 .addReg(Src, getKillRegState(IsKill));
  MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse);
  return MIB;
}

// This differs from buildSpillLoadStore by only scavenging a VGPR. It does not
// need to handle the case where an SGPR may need to be spilled while spilling.
static bool buildMUBUFOffsetLoadStore(const GCNSubtarget &ST,
                                      MachineFrameInfo &MFI,
                                      MachineBasicBlock::iterator MI,
                                      int Index,
                                      int64_t Offset) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  MachineBasicBlock *MBB = MI->getParent();
  const DebugLoc &DL = MI->getDebugLoc();
  bool IsStore = MI->mayStore();

  unsigned Opc = MI->getOpcode();
  int LoadStoreOp = IsStore ?
    getOffsetMUBUFStore(Opc) : getOffsetMUBUFLoad(Opc);
  if (LoadStoreOp == -1)
    return false;

  const MachineOperand *Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata);
  if (spillVGPRtoAGPR(ST, *MBB, MI, Index, 0, Reg->getReg(), false).getInstr())
    return true;

  MachineInstrBuilder NewMI =
      BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
          .add(*Reg)
          .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::srsrc))
          .add(*TII->getNamedOperand(*MI, AMDGPU::OpName::soffset))
          .addImm(Offset)
          .addImm(0) // cpol
          .addImm(0) // swz
          .cloneMemRefs(*MI);

  const MachineOperand *VDataIn =
      TII->getNamedOperand(*MI, AMDGPU::OpName::vdata_in);
  if (VDataIn)
    NewMI.add(*VDataIn);
  return true;
}

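// e.g. (a sketch) for a 16-byte element this selects
// SCRATCH_STORE_DWORDX4_SADDR / SCRATCH_LOAD_DWORDX4_SADDR, then rewrites the
// opcode to its SV form if a VGPR address is present, or to its ST (no
// address operand) form if neither vaddr nor saddr is used.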
static unsigned getFlatScratchSpillOpcode(const SIInstrInfo *TII,
                                          unsigned LoadStoreOp,
                                          unsigned EltSize) {
  bool IsStore = TII->get(LoadStoreOp).mayStore();
  bool HasVAddr = AMDGPU::hasNamedOperand(LoadStoreOp, AMDGPU::OpName::vaddr);
  bool UseST =
      !HasVAddr && !AMDGPU::hasNamedOperand(LoadStoreOp, AMDGPU::OpName::saddr);

  switch (EltSize) {
  case 4:
    LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
                          : AMDGPU::SCRATCH_LOAD_DWORD_SADDR;
    break;
  case 8:
    LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX2_SADDR
                          : AMDGPU::SCRATCH_LOAD_DWORDX2_SADDR;
    break;
  case 12:
    LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX3_SADDR
                          : AMDGPU::SCRATCH_LOAD_DWORDX3_SADDR;
    break;
  case 16:
    LoadStoreOp = IsStore ? AMDGPU::SCRATCH_STORE_DWORDX4_SADDR
                          : AMDGPU::SCRATCH_LOAD_DWORDX4_SADDR;
    break;
  default:
    llvm_unreachable("Unexpected spill load/store size!");
  }

  if (HasVAddr)
    LoadStoreOp = AMDGPU::getFlatScratchInstSVfromSS(LoadStoreOp);
  else if (UseST)
    LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp);

  return LoadStoreOp;
}

void SIRegisterInfo::buildSpillLoadStore(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL,
    unsigned LoadStoreOp, int Index, Register ValueReg, bool IsKill,
    MCRegister ScratchOffsetReg, int64_t InstOffset, MachineMemOperand *MMO,
    RegScavenger *RS, LiveRegUnits *LiveUnits) const {
  assert((!RS || !LiveUnits) && "Only RS or LiveUnits can be set but not both");

  MachineFunction *MF = MBB.getParent();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const MachineFrameInfo &MFI = MF->getFrameInfo();
  const SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();

  const MCInstrDesc *Desc = &TII->get(LoadStoreOp);
  bool IsStore = Desc->mayStore();
  bool IsFlat = TII->isFLATScratch(LoadStoreOp);

  bool CanClobberSCC = false;
  bool Scavenged = false;
  MCRegister SOffset = ScratchOffsetReg;

  const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg);
  // On gfx90a+ an AGPR can be used like a regular VGPR for loads and stores.
  const bool IsAGPR = !ST.hasGFX90AInsts() && isAGPRClass(RC);
  const unsigned RegWidth = AMDGPU::getRegBitWidth(*RC) / 8;

  // Always use 4 byte operations for AGPRs because we need to scavenge
  // a temporary VGPR.
  unsigned EltSize = (IsFlat && !IsAGPR) ? std::min(RegWidth, 16u) : 4u;
  unsigned NumSubRegs = RegWidth / EltSize;
  unsigned Size = NumSubRegs * EltSize;
  unsigned RemSize = RegWidth - Size;
  unsigned NumRemSubRegs = RemSize ? 1 : 0;
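  // e.g. (a sketch) a 160-bit (20-byte) flat-scratch spill: EltSize = 16,
  // NumSubRegs = 1, RemSize = 4 -- one DWORDX4 access plus one DWORD access
  // for the remainder.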
1352 int64_t Offset = InstOffset + MFI.getObjectOffset(Index);
1353 int64_t MaterializedOffset = Offset;
1354
1355 int64_t MaxOffset = Offset + Size + RemSize - EltSize;
1356 int64_t ScratchOffsetRegDelta = 0;
1357
1358 if (IsFlat && EltSize > 4) {
1359 LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
1360 Desc = &TII->get(LoadStoreOp);
1361 }
1362
1363 Align Alignment = MFI.getObjectAlign(Index);
1364 const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo();
1365
1366 assert((IsFlat || ((Offset % EltSize) == 0)) &&
1367 "unexpected VGPR spill offset");
1368
1369 // Track a VGPR to use for a constant offset we need to materialize.
1370 Register TmpOffsetVGPR;
1371
1372 // Track a VGPR to use as an intermediate value.
1373 Register TmpIntermediateVGPR;
1374 bool UseVGPROffset = false;
1375
1376 // Materialize a VGPR offset required for the given SGPR/VGPR/Immediate
1377 // combination.
1378 auto MaterializeVOffset = [&](Register SGPRBase, Register TmpVGPR,
1379 int64_t VOffset) {
1380 // We are using a VGPR offset
1381 if (IsFlat && SGPRBase) {
1382 // We only have 1 VGPR offset, or 1 SGPR offset. We don't have a free
1383 // SGPR, so perform the add as vector.
1384 // We don't need a base SGPR in the kernel.
1385
1386 if (ST.getConstantBusLimit(AMDGPU::V_ADD_U32_e64) >= 2) {
1387 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e64), TmpVGPR)
1388 .addReg(SGPRBase)
1389 .addImm(VOffset)
1390 .addImm(0); // clamp
1391 } else {
1392 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
1393 .addReg(SGPRBase);
1394 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e32), TmpVGPR)
1395 .addImm(VOffset)
1396 .addReg(TmpOffsetVGPR);
1397 }
1398 } else {
1399 assert(TmpOffsetVGPR);
1400 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
1401 .addImm(VOffset);
1402 }
1403 };
1404
1405 bool IsOffsetLegal =
1406 IsFlat ? TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS,
1407 SIInstrFlags::FlatScratch)
1408 : TII->isLegalMUBUFImmOffset(MaxOffset);
1409 if (!IsOffsetLegal || (IsFlat && !SOffset && !ST.hasFlatScratchSTMode())) {
1410 SOffset = MCRegister();
1411
1412 // We don't have access to the register scavenger if this function is called
1413 // during PEI::scavengeFrameVirtualRegs() so use LiveUnits in this case.
1414 // TODO: Clobbering SCC is not necessary for scratch instructions in the
1415 // entry.
1416 if (RS) {
1417 SOffset = RS->scavengeRegisterBackwards(AMDGPU::SGPR_32RegClass, MI, false, 0, false);
1418
1419 // Piggy back on the liveness scan we just did see if SCC is dead.
1420 CanClobberSCC = !RS->isRegUsed(AMDGPU::SCC);
1421 } else if (LiveUnits) {
1422 CanClobberSCC = LiveUnits->available(AMDGPU::SCC);
1423 for (MCRegister Reg : AMDGPU::SGPR_32RegClass) {
1424 if (LiveUnits->available(Reg) && !MF->getRegInfo().isReserved(Reg)) {
1425 SOffset = Reg;
1426 break;
1427 }
1428 }
1429 }
1430
1431 if (ScratchOffsetReg != AMDGPU::NoRegister && !CanClobberSCC)
1432 SOffset = Register();
1433
1434 if (!SOffset) {
1435 UseVGPROffset = true;
1436
1437 if (RS) {
1438 TmpOffsetVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false, 0);
1439 } else {
1440 assert(LiveUnits);
1441 for (MCRegister Reg : AMDGPU::VGPR_32RegClass) {
1442 if (LiveUnits->available(Reg) && !MF->getRegInfo().isReserved(Reg)) {
1443 TmpOffsetVGPR = Reg;
1444 break;
1445 }
1446 }
1447 }
1448
1449 assert(TmpOffsetVGPR);
1450 } else if (!SOffset && CanClobberSCC) {
1451       // There are no free SGPRs, and we are in the process of spilling VGPRs
1452       // too. Since we need a VGPR in order to spill SGPRs (this is true on
1453       // SI/CI, and on VI it is true until we implement spilling using scalar
1454       // stores), we have no way to free up an SGPR. Our solution here is to
1455       // add the offset directly to the ScratchOffset or StackPtrOffset
1456       // register, and then subtract the offset after the spill to return the
1457       // register to its original value.
1458
1459 // TODO: If we don't have to do an emergency stack slot spill, converting
1460       // to use the VGPR offset takes fewer instructions.
1461 if (!ScratchOffsetReg)
1462 ScratchOffsetReg = FuncInfo->getStackPtrOffsetReg();
1463 SOffset = ScratchOffsetReg;
1464 ScratchOffsetRegDelta = Offset;
1465 } else {
1466 Scavenged = true;
1467 }
1468
1469 // We currently only support spilling VGPRs to EltSize boundaries, meaning
1470 // we can simplify the adjustment of Offset here to just scale with
1471 // WavefrontSize.
1472 if (!IsFlat && !UseVGPROffset)
1473 Offset *= ST.getWavefrontSize();
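    // e.g. a 16 byte per-lane offset becomes a 16 * 64 = 1024 byte MUBUF
    // offset on a wave64 target.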
1474
1475 if (!UseVGPROffset && !SOffset)
1476 report_fatal_error("could not scavenge SGPR to spill in entry function");
1477
1478 if (UseVGPROffset) {
1479 // We are using a VGPR offset
1480 MaterializeVOffset(ScratchOffsetReg, TmpOffsetVGPR, Offset);
1481 } else if (ScratchOffsetReg == AMDGPU::NoRegister) {
1482 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), SOffset).addImm(Offset);
1483 } else {
1484 assert(Offset != 0);
1485 auto Add = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset)
1486 .addReg(ScratchOffsetReg)
1487 .addImm(Offset);
1488 Add->getOperand(3).setIsDead(); // Mark SCC as dead.
1489 }
1490
1491 Offset = 0;
1492 }
1493
1494 if (IsFlat && SOffset == AMDGPU::NoRegister) {
1495 assert(AMDGPU::getNamedOperandIdx(LoadStoreOp, AMDGPU::OpName::vaddr) < 0
1496 && "Unexpected vaddr for flat scratch with a FI operand");
1497
1498 if (UseVGPROffset) {
1499 LoadStoreOp = AMDGPU::getFlatScratchInstSVfromSS(LoadStoreOp);
1500 } else {
1501 assert(ST.hasFlatScratchSTMode());
1502 LoadStoreOp = AMDGPU::getFlatScratchInstSTfromSS(LoadStoreOp);
1503 }
1504
1505 Desc = &TII->get(LoadStoreOp);
1506 }
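  // At this point the flat scratch opcode matches the available addressing
  // mode: the SS (saddr) form when an SGPR offset exists, the SV form when
  // the offset was materialized into a VGPR, or the ST form (immediate
  // offset only) on targets that support it.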
1507
1508 for (unsigned i = 0, e = NumSubRegs + NumRemSubRegs, RegOffset = 0; i != e;
1509 ++i, RegOffset += EltSize) {
1510 if (i == NumSubRegs) {
1511 EltSize = RemSize;
1512 LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize);
1513 }
1514 Desc = &TII->get(LoadStoreOp);
1515
1516 if (!IsFlat && UseVGPROffset) {
1517 int NewLoadStoreOp = IsStore ? getOffenMUBUFStore(LoadStoreOp)
1518 : getOffenMUBUFLoad(LoadStoreOp);
1519 Desc = &TII->get(NewLoadStoreOp);
1520 }
1521
1522 if (UseVGPROffset && TmpOffsetVGPR == TmpIntermediateVGPR) {
1523 // If we are spilling an AGPR beyond the range of the memory instruction
1524 // offset and need to use a VGPR offset, we ideally have at least 2
1525 // scratch VGPRs. If we don't have a second free VGPR without spilling,
1526       // recycle the VGPR used for the offset, which requires resetting it
1527       // after each subregister.
1528
1529 MaterializeVOffset(ScratchOffsetReg, TmpOffsetVGPR, MaterializedOffset);
1530 }
1531
1532 unsigned NumRegs = EltSize / 4;
1533 Register SubReg = e == 1
1534 ? ValueReg
1535 : Register(getSubReg(ValueReg,
1536 getSubRegFromChannel(RegOffset / 4, NumRegs)));
1537
1538 unsigned SOffsetRegState = 0;
1539 unsigned SrcDstRegState = getDefRegState(!IsStore);
1540 const bool IsLastSubReg = i + 1 == e;
1541 const bool IsFirstSubReg = i == 0;
1542 if (IsLastSubReg) {
1543 SOffsetRegState |= getKillRegState(Scavenged);
1544 // The last implicit use carries the "Kill" flag.
1545 SrcDstRegState |= getKillRegState(IsKill);
1546 }
1547
1548 // Make sure the whole register is defined if there are undef components by
1549 // adding an implicit def of the super-reg on the first instruction.
1550 bool NeedSuperRegDef = e > 1 && IsStore && IsFirstSubReg;
1551 bool NeedSuperRegImpOperand = e > 1;
1552
1553     // Remaining element size to spill into memory after some parts of it
1554     // have been spilled into either AGPRs or VGPRs.
1555 unsigned RemEltSize = EltSize;
1556
1557     // AGPRs to spill VGPRs and vice versa are allocated in reverse order,
1558     // starting from the last lane. If a register cannot be completely
1559     // spilled into another register, this ordering ensures its alignment
1560     // does not change. For targets with a VGPR alignment requirement this
1561     // is important when using flat scratch, as we might otherwise get a
1562     // scratch_load or scratch_store of an unaligned register.
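    // e.g. if only one AGPR is free while spilling an aligned VGPR pair, the
    // high half (sub1) lands in the AGPR and the low half (sub0) that still
    // goes to memory keeps the original, aligned base address.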
1563 for (int LaneS = (RegOffset + EltSize) / 4 - 1, Lane = LaneS,
1564 LaneE = RegOffset / 4;
1565 Lane >= LaneE; --Lane) {
1566 bool IsSubReg = e > 1 || EltSize > 4;
1567 Register Sub = IsSubReg
1568 ? Register(getSubReg(ValueReg, getSubRegFromChannel(Lane)))
1569 : ValueReg;
1570 auto MIB = spillVGPRtoAGPR(ST, MBB, MI, Index, Lane, Sub, IsKill);
1571 if (!MIB.getInstr())
1572 break;
1573 if (NeedSuperRegDef || (IsSubReg && IsStore && Lane == LaneS && IsFirstSubReg)) {
1574 MIB.addReg(ValueReg, RegState::ImplicitDefine);
1575 NeedSuperRegDef = false;
1576 }
1577 if ((IsSubReg || NeedSuperRegImpOperand) && (IsFirstSubReg || IsLastSubReg)) {
1578 NeedSuperRegImpOperand = true;
1579 unsigned State = SrcDstRegState;
1580 if (!IsLastSubReg || (Lane != LaneE))
1581 State &= ~RegState::Kill;
1582 if (!IsFirstSubReg || (Lane != LaneS))
1583 State &= ~RegState::Define;
1584 MIB.addReg(ValueReg, RegState::Implicit | State);
1585 }
1586 RemEltSize -= 4;
1587 }
1588
1589 if (!RemEltSize) // Fully spilled into AGPRs.
1590 continue;
1591
1592 if (RemEltSize != EltSize) { // Partially spilled to AGPRs
1593 assert(IsFlat && EltSize > 4);
1594
1595 unsigned NumRegs = RemEltSize / 4;
1596 SubReg = Register(getSubReg(ValueReg,
1597 getSubRegFromChannel(RegOffset / 4, NumRegs)));
1598 unsigned Opc = getFlatScratchSpillOpcode(TII, LoadStoreOp, RemEltSize);
1599 Desc = &TII->get(Opc);
1600 }
1601
1602 unsigned FinalReg = SubReg;
1603
1604 if (IsAGPR) {
1605 assert(EltSize == 4);
1606
1607 if (!TmpIntermediateVGPR) {
1608 TmpIntermediateVGPR = FuncInfo->getVGPRForAGPRCopy();
1609 assert(MF->getRegInfo().isReserved(TmpIntermediateVGPR));
1610 }
1611 if (IsStore) {
1612 auto AccRead = BuildMI(MBB, MI, DL,
1613 TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64),
1614 TmpIntermediateVGPR)
1615 .addReg(SubReg, getKillRegState(IsKill));
1616 if (NeedSuperRegDef)
1617 AccRead.addReg(ValueReg, RegState::ImplicitDefine);
1618 AccRead->setAsmPrinterFlag(MachineInstr::ReloadReuse);
1619 }
1620 SubReg = TmpIntermediateVGPR;
1621 } else if (UseVGPROffset) {
1622 // FIXME: change to scavengeRegisterBackwards()
1623 if (!TmpOffsetVGPR) {
1624 TmpOffsetVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass,
1625 MI, false, 0);
1626 RS->setRegUsed(TmpOffsetVGPR);
1627 }
1628 }
1629
1630 MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(RegOffset);
1631 MachineMemOperand *NewMMO =
1632 MF->getMachineMemOperand(PInfo, MMO->getFlags(), RemEltSize,
1633 commonAlignment(Alignment, RegOffset));
1634
1635 auto MIB =
1636 BuildMI(MBB, MI, DL, *Desc)
1637 .addReg(SubReg, getDefRegState(!IsStore) | getKillRegState(IsKill));
1638
1639 if (UseVGPROffset) {
1640 // For an AGPR spill, we reuse the same temp VGPR for the offset and the
1641 // intermediate accvgpr_write.
1642 MIB.addReg(TmpOffsetVGPR, getKillRegState(IsLastSubReg && !IsAGPR));
1643 }
1644
1645 if (!IsFlat)
1646 MIB.addReg(FuncInfo->getScratchRSrcReg());
1647
1648 if (SOffset == AMDGPU::NoRegister) {
1649 if (!IsFlat) {
1650 if (UseVGPROffset && ScratchOffsetReg) {
1651 MIB.addReg(ScratchOffsetReg);
1652 } else {
1653 assert(FuncInfo->isBottomOfStack());
1654 MIB.addImm(0);
1655 }
1656 }
1657 } else {
1658 MIB.addReg(SOffset, SOffsetRegState);
1659 }
1660
1661 MIB.addImm(Offset + RegOffset);
1662
1663 bool LastUse = MMO->getFlags() & MOLastUse;
1664 MIB.addImm(LastUse ? AMDGPU::CPol::TH_LU : 0); // cpol
1665
1666 if (!IsFlat)
1667 MIB.addImm(0); // swz
1668 MIB.addMemOperand(NewMMO);
1669
1670 if (!IsAGPR && NeedSuperRegDef)
1671 MIB.addReg(ValueReg, RegState::ImplicitDefine);
1672
1673 if (!IsStore && IsAGPR && TmpIntermediateVGPR != AMDGPU::NoRegister) {
1674 MIB = BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64),
1675 FinalReg)
1676 .addReg(TmpIntermediateVGPR, RegState::Kill);
1677 MIB->setAsmPrinterFlag(MachineInstr::ReloadReuse);
1678 }
1679
1680 if (NeedSuperRegImpOperand && (IsFirstSubReg || IsLastSubReg))
1681 MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState);
1682
1683     // The epilog restore of a wwm-scratch register can cause undesired
1684     // optimization during machine-cp after PrologEpilogInserter if the same
1685     // register was assigned for return value ABI lowering with a COPY
1686     // instruction. As shown below, with the epilog reload, the earlier COPY
1687     // appears to be dead during machine-cp.
1688 // ...
1689 // v0 in WWM operation, needs the WWM spill at prolog/epilog.
1690 // $vgpr0 = V_WRITELANE_B32 $sgpr20, 0, $vgpr0
1691 // ...
1692 // Epilog block:
1693 // $vgpr0 = COPY $vgpr1 // outgoing value moved to v0
1694 // ...
1695 // WWM spill restore to preserve the inactive lanes of v0.
1696 // $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1
1697 // $vgpr0 = BUFFER_LOAD $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0
1698 // $exec = S_MOV_B64 killed $sgpr4_sgpr5
1699 // ...
1700 // SI_RETURN implicit $vgpr0
1701 // ...
1702 // To fix it, mark the same reg as a tied op for such restore instructions
1703 // so that it marks a usage for the preceding COPY.
1704 if (!IsStore && MI != MBB.end() && MI->isReturn() &&
1705 MI->readsRegister(SubReg, this)) {
1706 MIB.addReg(SubReg, RegState::Implicit);
1707 MIB->tieOperands(0, MIB->getNumOperands() - 1);
1708 }
1709 }
1710
1711 if (ScratchOffsetRegDelta != 0) {
1712 // Subtract the offset we added to the ScratchOffset register.
1713 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), SOffset)
1714 .addReg(SOffset)
1715 .addImm(-ScratchOffsetRegDelta);
1716 }
1717 }
1718
1719 void SIRegisterInfo::buildVGPRSpillLoadStore(SGPRSpillBuilder &SB, int Index,
1720 int Offset, bool IsLoad,
1721 bool IsKill) const {
1722 // Load/store VGPR
1723 MachineFrameInfo &FrameInfo = SB.MF.getFrameInfo();
1724 assert(FrameInfo.getStackID(Index) != TargetStackID::SGPRSpill);
1725
1726 Register FrameReg =
1727 FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(SB.MF)
1728 ? getBaseRegister()
1729 : getFrameRegister(SB.MF);
1730
1731 Align Alignment = FrameInfo.getObjectAlign(Index);
1732 MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(SB.MF, Index);
1733 MachineMemOperand *MMO = SB.MF.getMachineMemOperand(
1734 PtrInfo, IsLoad ? MachineMemOperand::MOLoad : MachineMemOperand::MOStore,
1735 SB.EltSize, Alignment);
1736
1737 if (IsLoad) {
1738 unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
1739 : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
1740 buildSpillLoadStore(*SB.MBB, SB.MI, SB.DL, Opc, Index, SB.TmpVGPR, false,
1741 FrameReg, Offset * SB.EltSize, MMO, SB.RS);
1742 } else {
1743 unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
1744 : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
1745 buildSpillLoadStore(*SB.MBB, SB.MI, SB.DL, Opc, Index, SB.TmpVGPR, IsKill,
1746 FrameReg, Offset * SB.EltSize, MMO, SB.RS);
1747 // This only ever adds one VGPR spill
1748 SB.MFI.addToSpilledVGPRs(1);
1749 }
1750 }
1751
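/// Spill the SGPR spanned by the frame index either into reserved VGPR
/// lanes (one SI_SPILL_S32_TO_VGPR per 32-bit sub-register) or, when no
/// lanes were allocated for the index, through a temporary VGPR that is
/// written out to scratch memory.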
1752 bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int Index,
1753 RegScavenger *RS, SlotIndexes *Indexes,
1754 LiveIntervals *LIS, bool OnlyToVGPR,
1755 bool SpillToPhysVGPRLane) const {
1756 SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS);
1757
1758 ArrayRef<SpilledReg> VGPRSpills =
1759 SpillToPhysVGPRLane ? SB.MFI.getSGPRSpillToPhysicalVGPRLanes(Index)
1760 : SB.MFI.getSGPRSpillToVirtualVGPRLanes(Index);
1761 bool SpillToVGPR = !VGPRSpills.empty();
1762 if (OnlyToVGPR && !SpillToVGPR)
1763 return false;
1764
1765 assert(SpillToVGPR || (SB.SuperReg != SB.MFI.getStackPtrOffsetReg() &&
1766 SB.SuperReg != SB.MFI.getFrameOffsetReg()));
1767
1768 if (SpillToVGPR) {
1769
1770 assert(SB.NumSubRegs == VGPRSpills.size() &&
1771 "Num of VGPR lanes should be equal to num of SGPRs spilled");
1772
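    // For example (hypothetical registers, assuming the spill kills the
    // pair), spilling $sgpr4_sgpr5 to lanes 0 and 1 of $vgpr63 emits roughly:
    //   $vgpr63 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, $vgpr63,
    //             implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5
    //   $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr5, 1, $vgpr63,
    //             implicit killed $sgpr4_sgpr5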
1773 for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) {
1774 Register SubReg =
1775 SB.NumSubRegs == 1
1776 ? SB.SuperReg
1777 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
1778 SpilledReg Spill = VGPRSpills[i];
1779
1780 bool IsFirstSubreg = i == 0;
1781 bool IsLastSubreg = i == SB.NumSubRegs - 1;
1782 bool UseKill = SB.IsKill && IsLastSubreg;
1783
1784
1785 // Mark the "old value of vgpr" input undef only if this is the first sgpr
1786 // spill to this specific vgpr in the first basic block.
1787 auto MIB = BuildMI(*SB.MBB, MI, SB.DL,
1788 SB.TII.get(AMDGPU::SI_SPILL_S32_TO_VGPR), Spill.VGPR)
1789 .addReg(SubReg, getKillRegState(UseKill))
1790 .addImm(Spill.Lane)
1791 .addReg(Spill.VGPR);
1792 if (Indexes) {
1793 if (IsFirstSubreg)
1794 Indexes->replaceMachineInstrInMaps(*MI, *MIB);
1795 else
1796 Indexes->insertMachineInstrInMaps(*MIB);
1797 }
1798
1799 if (IsFirstSubreg && SB.NumSubRegs > 1) {
1800 // We may be spilling a super-register which is only partially defined,
1801 // and need to ensure later spills think the value is defined.
1802 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
1803 }
1804
1805 if (SB.NumSubRegs > 1 && (IsFirstSubreg || IsLastSubreg))
1806 MIB.addReg(SB.SuperReg, getKillRegState(UseKill) | RegState::Implicit);
1807
1808 // FIXME: Since this spills to another register instead of an actual
1809 // frame index, we should delete the frame index when all references to
1810 // it are fixed.
1811 }
1812 } else {
1813 SB.prepare();
1814
1815 // SubReg carries the "Kill" flag when SubReg == SB.SuperReg.
1816 unsigned SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill);
1817
1818 // Per VGPR helper data
1819 auto PVD = SB.getPerVGPRData();
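    // PVD.PerVGPR is the number of 32-bit lanes usable per VGPR (the
    // wavefront size), so e.g. an 8 sub-register (256-bit) tuple fits in the
    // lanes of a single temporary VGPR on both wave32 and wave64.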
1820
1821 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
1822 unsigned TmpVGPRFlags = RegState::Undef;
1823
1824 // Write sub registers into the VGPR
1825 for (unsigned i = Offset * PVD.PerVGPR,
1826 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
1827 i < e; ++i) {
1828 Register SubReg =
1829 SB.NumSubRegs == 1
1830 ? SB.SuperReg
1831 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
1832
1833 MachineInstrBuilder WriteLane =
1834 BuildMI(*SB.MBB, MI, SB.DL,
1835 SB.TII.get(AMDGPU::SI_SPILL_S32_TO_VGPR), SB.TmpVGPR)
1836 .addReg(SubReg, SubKillState)
1837 .addImm(i % PVD.PerVGPR)
1838 .addReg(SB.TmpVGPR, TmpVGPRFlags);
1839 TmpVGPRFlags = 0;
1840
1841 if (Indexes) {
1842 if (i == 0)
1843 Indexes->replaceMachineInstrInMaps(*MI, *WriteLane);
1844 else
1845 Indexes->insertMachineInstrInMaps(*WriteLane);
1846 }
1847
1848 // There could be undef components of a spilled super register.
1849 // TODO: Can we detect this and skip the spill?
1850 if (SB.NumSubRegs > 1) {
1851 // The last implicit use of the SB.SuperReg carries the "Kill" flag.
1852 unsigned SuperKillState = 0;
1853 if (i + 1 == SB.NumSubRegs)
1854 SuperKillState |= getKillRegState(SB.IsKill);
1855 WriteLane.addReg(SB.SuperReg, RegState::Implicit | SuperKillState);
1856 }
1857 }
1858
1859 // Write out VGPR
1860 SB.readWriteTmpVGPR(Offset, /*IsLoad*/ false);
1861 }
1862
1863 SB.restore();
1864 }
1865
1866 MI->eraseFromParent();
1867 SB.MFI.addToSpilledSGPRs(SB.NumSubRegs);
1868
1869 if (LIS)
1870 LIS->removeAllRegUnitsForPhysReg(SB.SuperReg);
1871
1872 return true;
1873 }
1874
1875 bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, int Index,
1876 RegScavenger *RS, SlotIndexes *Indexes,
1877 LiveIntervals *LIS, bool OnlyToVGPR,
1878 bool SpillToPhysVGPRLane) const {
1879 SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS);
1880
1881 ArrayRef<SpilledReg> VGPRSpills =
1882 SpillToPhysVGPRLane ? SB.MFI.getSGPRSpillToPhysicalVGPRLanes(Index)
1883 : SB.MFI.getSGPRSpillToVirtualVGPRLanes(Index);
1884 bool SpillToVGPR = !VGPRSpills.empty();
1885 if (OnlyToVGPR && !SpillToVGPR)
1886 return false;
1887
1888 if (SpillToVGPR) {
1889 for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) {
1890 Register SubReg =
1891 SB.NumSubRegs == 1
1892 ? SB.SuperReg
1893 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
1894
1895 SpilledReg Spill = VGPRSpills[i];
1896 auto MIB = BuildMI(*SB.MBB, MI, SB.DL,
1897 SB.TII.get(AMDGPU::SI_RESTORE_S32_FROM_VGPR), SubReg)
1898 .addReg(Spill.VGPR)
1899 .addImm(Spill.Lane);
1900 if (SB.NumSubRegs > 1 && i == 0)
1901 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
1902 if (Indexes) {
1903 if (i == e - 1)
1904 Indexes->replaceMachineInstrInMaps(*MI, *MIB);
1905 else
1906 Indexes->insertMachineInstrInMaps(*MIB);
1907 }
1908 }
1909 } else {
1910 SB.prepare();
1911
1912 // Per VGPR helper data
1913 auto PVD = SB.getPerVGPRData();
1914
1915 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
1916 // Load in VGPR data
1917 SB.readWriteTmpVGPR(Offset, /*IsLoad*/ true);
1918
1919 // Unpack lanes
1920 for (unsigned i = Offset * PVD.PerVGPR,
1921 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
1922 i < e; ++i) {
1923 Register SubReg =
1924 SB.NumSubRegs == 1
1925 ? SB.SuperReg
1926 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
1927
1928 bool LastSubReg = (i + 1 == e);
1929 auto MIB = BuildMI(*SB.MBB, MI, SB.DL,
1930 SB.TII.get(AMDGPU::SI_RESTORE_S32_FROM_VGPR), SubReg)
1931 .addReg(SB.TmpVGPR, getKillRegState(LastSubReg))
1932 .addImm(i);
1933 if (SB.NumSubRegs > 1 && i == 0)
1934 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
1935 if (Indexes) {
1936 if (i == e - 1)
1937 Indexes->replaceMachineInstrInMaps(*MI, *MIB);
1938 else
1939 Indexes->insertMachineInstrInMaps(*MIB);
1940 }
1941 }
1942 }
1943
1944 SB.restore();
1945 }
1946
1947 MI->eraseFromParent();
1948
1949 if (LIS)
1950 LIS->removeAllRegUnitsForPhysReg(SB.SuperReg);
1951
1952 return true;
1953 }
1954
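/// Spill \p SGPR into lanes of a scavenged VGPR and rebuild it from those
/// lanes at the end of \p RestoreMBB. The SGPR payload itself never touches
/// memory: only V_WRITELANE_B32/V_READLANE_B32 pairs are emitted.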
1955 bool SIRegisterInfo::spillEmergencySGPR(MachineBasicBlock::iterator MI,
1956 MachineBasicBlock &RestoreMBB,
1957 Register SGPR, RegScavenger *RS) const {
1958 SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, SGPR, false, 0,
1959 RS);
1960 SB.prepare();
1961 // Generate the spill of SGPR to SB.TmpVGPR.
1962 unsigned SubKillState = getKillRegState((SB.NumSubRegs == 1) && SB.IsKill);
1963 auto PVD = SB.getPerVGPRData();
1964 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
1965 unsigned TmpVGPRFlags = RegState::Undef;
1966 // Write sub registers into the VGPR
1967 for (unsigned i = Offset * PVD.PerVGPR,
1968 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
1969 i < e; ++i) {
1970 Register SubReg =
1971 SB.NumSubRegs == 1
1972 ? SB.SuperReg
1973 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
1974
1975 MachineInstrBuilder WriteLane =
1976 BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_WRITELANE_B32),
1977 SB.TmpVGPR)
1978 .addReg(SubReg, SubKillState)
1979 .addImm(i % PVD.PerVGPR)
1980 .addReg(SB.TmpVGPR, TmpVGPRFlags);
1981 TmpVGPRFlags = 0;
1982 // There could be undef components of a spilled super register.
1983 // TODO: Can we detect this and skip the spill?
1984 if (SB.NumSubRegs > 1) {
1985 // The last implicit use of the SB.SuperReg carries the "Kill" flag.
1986 unsigned SuperKillState = 0;
1987 if (i + 1 == SB.NumSubRegs)
1988 SuperKillState |= getKillRegState(SB.IsKill);
1989 WriteLane.addReg(SB.SuperReg, RegState::Implicit | SuperKillState);
1990 }
1991 }
1992 // Don't need to write VGPR out.
1993 }
1994
1995 // Restore clobbered registers in the specified restore block.
1996 MI = RestoreMBB.end();
1997 SB.setMI(&RestoreMBB, MI);
1998 // Generate the restore of SGPR from SB.TmpVGPR.
1999 for (unsigned Offset = 0; Offset < PVD.NumVGPRs; ++Offset) {
2000 // Don't need to load VGPR in.
2001 // Unpack lanes
2002 for (unsigned i = Offset * PVD.PerVGPR,
2003 e = std::min((Offset + 1) * PVD.PerVGPR, SB.NumSubRegs);
2004 i < e; ++i) {
2005 Register SubReg =
2006 SB.NumSubRegs == 1
2007 ? SB.SuperReg
2008 : Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
2009 bool LastSubReg = (i + 1 == e);
2010 auto MIB = BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_READLANE_B32),
2011 SubReg)
2012 .addReg(SB.TmpVGPR, getKillRegState(LastSubReg))
2013 .addImm(i);
2014 if (SB.NumSubRegs > 1 && i == 0)
2015 MIB.addReg(SB.SuperReg, RegState::ImplicitDefine);
2016 }
2017 }
2018 SB.restore();
2019
2020 SB.MFI.addToSpilledSGPRs(SB.NumSubRegs);
2021 return false;
2022 }
2023
2024 /// Special case of eliminateFrameIndex. Returns true if the SGPR was spilled to
2025 /// a VGPR and the stack slot can be safely eliminated when all other users are
2026 /// handled.
2027 bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex(
2028 MachineBasicBlock::iterator MI, int FI, RegScavenger *RS,
2029 SlotIndexes *Indexes, LiveIntervals *LIS, bool SpillToPhysVGPRLane) const {
2030 switch (MI->getOpcode()) {
2031 case AMDGPU::SI_SPILL_S1024_SAVE:
2032 case AMDGPU::SI_SPILL_S512_SAVE:
2033 case AMDGPU::SI_SPILL_S384_SAVE:
2034 case AMDGPU::SI_SPILL_S352_SAVE:
2035 case AMDGPU::SI_SPILL_S320_SAVE:
2036 case AMDGPU::SI_SPILL_S288_SAVE:
2037 case AMDGPU::SI_SPILL_S256_SAVE:
2038 case AMDGPU::SI_SPILL_S224_SAVE:
2039 case AMDGPU::SI_SPILL_S192_SAVE:
2040 case AMDGPU::SI_SPILL_S160_SAVE:
2041 case AMDGPU::SI_SPILL_S128_SAVE:
2042 case AMDGPU::SI_SPILL_S96_SAVE:
2043 case AMDGPU::SI_SPILL_S64_SAVE:
2044 case AMDGPU::SI_SPILL_S32_SAVE:
2045 return spillSGPR(MI, FI, RS, Indexes, LIS, true, SpillToPhysVGPRLane);
2046 case AMDGPU::SI_SPILL_S1024_RESTORE:
2047 case AMDGPU::SI_SPILL_S512_RESTORE:
2048 case AMDGPU::SI_SPILL_S384_RESTORE:
2049 case AMDGPU::SI_SPILL_S352_RESTORE:
2050 case AMDGPU::SI_SPILL_S320_RESTORE:
2051 case AMDGPU::SI_SPILL_S288_RESTORE:
2052 case AMDGPU::SI_SPILL_S256_RESTORE:
2053 case AMDGPU::SI_SPILL_S224_RESTORE:
2054 case AMDGPU::SI_SPILL_S192_RESTORE:
2055 case AMDGPU::SI_SPILL_S160_RESTORE:
2056 case AMDGPU::SI_SPILL_S128_RESTORE:
2057 case AMDGPU::SI_SPILL_S96_RESTORE:
2058 case AMDGPU::SI_SPILL_S64_RESTORE:
2059 case AMDGPU::SI_SPILL_S32_RESTORE:
2060 return restoreSGPR(MI, FI, RS, Indexes, LIS, true, SpillToPhysVGPRLane);
2061 default:
2062 llvm_unreachable("not an SGPR spill instruction");
2063 }
2064 }
2065
2066 bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
2067 int SPAdj, unsigned FIOperandNum,
2068 RegScavenger *RS) const {
2069 MachineFunction *MF = MI->getParent()->getParent();
2070 MachineBasicBlock *MBB = MI->getParent();
2071 SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
2072 MachineFrameInfo &FrameInfo = MF->getFrameInfo();
2073 const SIInstrInfo *TII = ST.getInstrInfo();
2074 DebugLoc DL = MI->getDebugLoc();
2075
2076 assert(SPAdj == 0 && "unhandled SP adjustment in call sequence?");
2077
2078 MachineOperand &FIOp = MI->getOperand(FIOperandNum);
2079 int Index = MI->getOperand(FIOperandNum).getIndex();
2080
2081 Register FrameReg = FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(*MF)
2082 ? getBaseRegister()
2083 : getFrameRegister(*MF);
2084
2085 switch (MI->getOpcode()) {
2086 // SGPR register spill
2087 case AMDGPU::SI_SPILL_S1024_SAVE:
2088 case AMDGPU::SI_SPILL_S512_SAVE:
2089 case AMDGPU::SI_SPILL_S384_SAVE:
2090 case AMDGPU::SI_SPILL_S352_SAVE:
2091 case AMDGPU::SI_SPILL_S320_SAVE:
2092 case AMDGPU::SI_SPILL_S288_SAVE:
2093 case AMDGPU::SI_SPILL_S256_SAVE:
2094 case AMDGPU::SI_SPILL_S224_SAVE:
2095 case AMDGPU::SI_SPILL_S192_SAVE:
2096 case AMDGPU::SI_SPILL_S160_SAVE:
2097 case AMDGPU::SI_SPILL_S128_SAVE:
2098 case AMDGPU::SI_SPILL_S96_SAVE:
2099 case AMDGPU::SI_SPILL_S64_SAVE:
2100 case AMDGPU::SI_SPILL_S32_SAVE: {
2101 return spillSGPR(MI, Index, RS);
2102 }
2103
2104 // SGPR register restore
2105 case AMDGPU::SI_SPILL_S1024_RESTORE:
2106 case AMDGPU::SI_SPILL_S512_RESTORE:
2107 case AMDGPU::SI_SPILL_S384_RESTORE:
2108 case AMDGPU::SI_SPILL_S352_RESTORE:
2109 case AMDGPU::SI_SPILL_S320_RESTORE:
2110 case AMDGPU::SI_SPILL_S288_RESTORE:
2111 case AMDGPU::SI_SPILL_S256_RESTORE:
2112 case AMDGPU::SI_SPILL_S224_RESTORE:
2113 case AMDGPU::SI_SPILL_S192_RESTORE:
2114 case AMDGPU::SI_SPILL_S160_RESTORE:
2115 case AMDGPU::SI_SPILL_S128_RESTORE:
2116 case AMDGPU::SI_SPILL_S96_RESTORE:
2117 case AMDGPU::SI_SPILL_S64_RESTORE:
2118 case AMDGPU::SI_SPILL_S32_RESTORE: {
2119 return restoreSGPR(MI, Index, RS);
2120 }
2121
2122 // VGPR register spill
2123 case AMDGPU::SI_SPILL_V1024_SAVE:
2124 case AMDGPU::SI_SPILL_V512_SAVE:
2125 case AMDGPU::SI_SPILL_V384_SAVE:
2126 case AMDGPU::SI_SPILL_V352_SAVE:
2127 case AMDGPU::SI_SPILL_V320_SAVE:
2128 case AMDGPU::SI_SPILL_V288_SAVE:
2129 case AMDGPU::SI_SPILL_V256_SAVE:
2130 case AMDGPU::SI_SPILL_V224_SAVE:
2131 case AMDGPU::SI_SPILL_V192_SAVE:
2132 case AMDGPU::SI_SPILL_V160_SAVE:
2133 case AMDGPU::SI_SPILL_V128_SAVE:
2134 case AMDGPU::SI_SPILL_V96_SAVE:
2135 case AMDGPU::SI_SPILL_V64_SAVE:
2136 case AMDGPU::SI_SPILL_V32_SAVE:
2137 case AMDGPU::SI_SPILL_A1024_SAVE:
2138 case AMDGPU::SI_SPILL_A512_SAVE:
2139 case AMDGPU::SI_SPILL_A384_SAVE:
2140 case AMDGPU::SI_SPILL_A352_SAVE:
2141 case AMDGPU::SI_SPILL_A320_SAVE:
2142 case AMDGPU::SI_SPILL_A288_SAVE:
2143 case AMDGPU::SI_SPILL_A256_SAVE:
2144 case AMDGPU::SI_SPILL_A224_SAVE:
2145 case AMDGPU::SI_SPILL_A192_SAVE:
2146 case AMDGPU::SI_SPILL_A160_SAVE:
2147 case AMDGPU::SI_SPILL_A128_SAVE:
2148 case AMDGPU::SI_SPILL_A96_SAVE:
2149 case AMDGPU::SI_SPILL_A64_SAVE:
2150 case AMDGPU::SI_SPILL_A32_SAVE:
2151 case AMDGPU::SI_SPILL_AV1024_SAVE:
2152 case AMDGPU::SI_SPILL_AV512_SAVE:
2153 case AMDGPU::SI_SPILL_AV384_SAVE:
2154 case AMDGPU::SI_SPILL_AV352_SAVE:
2155 case AMDGPU::SI_SPILL_AV320_SAVE:
2156 case AMDGPU::SI_SPILL_AV288_SAVE:
2157 case AMDGPU::SI_SPILL_AV256_SAVE:
2158 case AMDGPU::SI_SPILL_AV224_SAVE:
2159 case AMDGPU::SI_SPILL_AV192_SAVE:
2160 case AMDGPU::SI_SPILL_AV160_SAVE:
2161 case AMDGPU::SI_SPILL_AV128_SAVE:
2162 case AMDGPU::SI_SPILL_AV96_SAVE:
2163 case AMDGPU::SI_SPILL_AV64_SAVE:
2164 case AMDGPU::SI_SPILL_AV32_SAVE:
2165 case AMDGPU::SI_SPILL_WWM_V32_SAVE:
2166 case AMDGPU::SI_SPILL_WWM_AV32_SAVE: {
2167 const MachineOperand *VData = TII->getNamedOperand(*MI,
2168 AMDGPU::OpName::vdata);
2169 assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
2170 MFI->getStackPtrOffsetReg());
2171
2172 unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
2173 : AMDGPU::BUFFER_STORE_DWORD_OFFSET;
2174 auto *MBB = MI->getParent();
2175 bool IsWWMRegSpill = TII->isWWMRegSpillOpcode(MI->getOpcode());
2176 if (IsWWMRegSpill) {
2177 TII->insertScratchExecCopy(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy(),
2178 RS->isRegUsed(AMDGPU::SCC));
2179 }
2180 buildSpillLoadStore(
2181 *MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg,
2182 TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
2183 *MI->memoperands_begin(), RS);
2184 MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(MI->getOpcode()));
2185 if (IsWWMRegSpill)
2186 TII->restoreExec(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy());
2187
2188 MI->eraseFromParent();
2189 return true;
2190 }
2191 case AMDGPU::SI_SPILL_V32_RESTORE:
2192 case AMDGPU::SI_SPILL_V64_RESTORE:
2193 case AMDGPU::SI_SPILL_V96_RESTORE:
2194 case AMDGPU::SI_SPILL_V128_RESTORE:
2195 case AMDGPU::SI_SPILL_V160_RESTORE:
2196 case AMDGPU::SI_SPILL_V192_RESTORE:
2197 case AMDGPU::SI_SPILL_V224_RESTORE:
2198 case AMDGPU::SI_SPILL_V256_RESTORE:
2199 case AMDGPU::SI_SPILL_V288_RESTORE:
2200 case AMDGPU::SI_SPILL_V320_RESTORE:
2201 case AMDGPU::SI_SPILL_V352_RESTORE:
2202 case AMDGPU::SI_SPILL_V384_RESTORE:
2203 case AMDGPU::SI_SPILL_V512_RESTORE:
2204 case AMDGPU::SI_SPILL_V1024_RESTORE:
2205 case AMDGPU::SI_SPILL_A32_RESTORE:
2206 case AMDGPU::SI_SPILL_A64_RESTORE:
2207 case AMDGPU::SI_SPILL_A96_RESTORE:
2208 case AMDGPU::SI_SPILL_A128_RESTORE:
2209 case AMDGPU::SI_SPILL_A160_RESTORE:
2210 case AMDGPU::SI_SPILL_A192_RESTORE:
2211 case AMDGPU::SI_SPILL_A224_RESTORE:
2212 case AMDGPU::SI_SPILL_A256_RESTORE:
2213 case AMDGPU::SI_SPILL_A288_RESTORE:
2214 case AMDGPU::SI_SPILL_A320_RESTORE:
2215 case AMDGPU::SI_SPILL_A352_RESTORE:
2216 case AMDGPU::SI_SPILL_A384_RESTORE:
2217 case AMDGPU::SI_SPILL_A512_RESTORE:
2218 case AMDGPU::SI_SPILL_A1024_RESTORE:
2219 case AMDGPU::SI_SPILL_AV32_RESTORE:
2220 case AMDGPU::SI_SPILL_AV64_RESTORE:
2221 case AMDGPU::SI_SPILL_AV96_RESTORE:
2222 case AMDGPU::SI_SPILL_AV128_RESTORE:
2223 case AMDGPU::SI_SPILL_AV160_RESTORE:
2224 case AMDGPU::SI_SPILL_AV192_RESTORE:
2225 case AMDGPU::SI_SPILL_AV224_RESTORE:
2226 case AMDGPU::SI_SPILL_AV256_RESTORE:
2227 case AMDGPU::SI_SPILL_AV288_RESTORE:
2228 case AMDGPU::SI_SPILL_AV320_RESTORE:
2229 case AMDGPU::SI_SPILL_AV352_RESTORE:
2230 case AMDGPU::SI_SPILL_AV384_RESTORE:
2231 case AMDGPU::SI_SPILL_AV512_RESTORE:
2232 case AMDGPU::SI_SPILL_AV1024_RESTORE:
2233 case AMDGPU::SI_SPILL_WWM_V32_RESTORE:
2234 case AMDGPU::SI_SPILL_WWM_AV32_RESTORE: {
2235 const MachineOperand *VData = TII->getNamedOperand(*MI,
2236 AMDGPU::OpName::vdata);
2237 assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
2238 MFI->getStackPtrOffsetReg());
2239
2240 unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
2241 : AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
2242 auto *MBB = MI->getParent();
2243 bool IsWWMRegSpill = TII->isWWMRegSpillOpcode(MI->getOpcode());
2244 if (IsWWMRegSpill) {
2245 TII->insertScratchExecCopy(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy(),
2246 RS->isRegUsed(AMDGPU::SCC));
2247 }
2248
2249 buildSpillLoadStore(
2250 *MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg,
2251 TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
2252 *MI->memoperands_begin(), RS);
2253
2254 if (IsWWMRegSpill)
2255 TII->restoreExec(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy());
2256
2257 MI->eraseFromParent();
2258 return true;
2259 }
2260
2261 default: {
2262 // Other access to frame index
2263 const DebugLoc &DL = MI->getDebugLoc();
2264
2265 int64_t Offset = FrameInfo.getObjectOffset(Index);
2266 if (ST.enableFlatScratch()) {
2267 if (TII->isFLATScratch(*MI)) {
2268 assert((int16_t)FIOperandNum ==
2269 AMDGPU::getNamedOperandIdx(MI->getOpcode(),
2270 AMDGPU::OpName::saddr));
2271
2272         // The offset is always swizzled; just replace it.
2273 if (FrameReg)
2274 FIOp.ChangeToRegister(FrameReg, false);
2275
2276 if (!Offset)
2277 return false;
2278
2279 MachineOperand *OffsetOp =
2280 TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
2281 int64_t NewOffset = Offset + OffsetOp->getImm();
2282 if (TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
2283 SIInstrFlags::FlatScratch)) {
2284 OffsetOp->setImm(NewOffset);
2285 if (FrameReg)
2286 return false;
2287 Offset = 0;
2288 }
2289
2290 if (!Offset) {
2291 unsigned Opc = MI->getOpcode();
2292 int NewOpc = -1;
2293 if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr)) {
2294 NewOpc = AMDGPU::getFlatScratchInstSVfromSVS(Opc);
2295 } else if (ST.hasFlatScratchSTMode()) {
2296 // On GFX10 we have ST mode to use no registers for an address.
2297 // Otherwise we need to materialize 0 into an SGPR.
2298 NewOpc = AMDGPU::getFlatScratchInstSTfromSS(Opc);
2299 }
2300
2301 if (NewOpc != -1) {
2302           // removeOperand doesn't fix up tied operand indexes as it goes, so
2303           // it asserts. Untie vdst_in for now and retie it afterwards.
2304 int VDstIn = AMDGPU::getNamedOperandIdx(Opc,
2305 AMDGPU::OpName::vdst_in);
2306 bool TiedVDst = VDstIn != -1 &&
2307 MI->getOperand(VDstIn).isReg() &&
2308 MI->getOperand(VDstIn).isTied();
2309 if (TiedVDst)
2310 MI->untieRegOperand(VDstIn);
2311
2312 MI->removeOperand(
2313 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::saddr));
2314
2315 if (TiedVDst) {
2316 int NewVDst =
2317 AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst);
2318 int NewVDstIn =
2319 AMDGPU::getNamedOperandIdx(NewOpc, AMDGPU::OpName::vdst_in);
2320             assert(NewVDst != -1 && NewVDstIn != -1 && "Must be tied!");
2321 MI->tieOperands(NewVDst, NewVDstIn);
2322 }
2323 MI->setDesc(TII->get(NewOpc));
2324 return false;
2325 }
2326 }
2327 }
2328
2329 if (!FrameReg) {
2330 FIOp.ChangeToImmediate(Offset);
2331 if (TII->isImmOperandLegal(*MI, FIOperandNum, FIOp))
2332 return false;
2333 }
2334
2335     // We need to use a register here. Check whether we can use an SGPR
2336     // or need a VGPR.
2337 FIOp.ChangeToRegister(AMDGPU::M0, false);
2338 bool UseSGPR = TII->isOperandLegal(*MI, FIOperandNum, &FIOp);
2339
2340 if (!Offset && FrameReg && UseSGPR) {
2341 FIOp.setReg(FrameReg);
2342 return false;
2343 }
2344
2345 const TargetRegisterClass *RC = UseSGPR ? &AMDGPU::SReg_32_XM0RegClass
2346 : &AMDGPU::VGPR_32RegClass;
2347
2348 Register TmpReg =
2349 RS->scavengeRegisterBackwards(*RC, MI, false, 0, !UseSGPR);
2350 FIOp.setReg(TmpReg);
2351 FIOp.setIsKill();
2352
2353 if ((!FrameReg || !Offset) && TmpReg) {
2354 unsigned Opc = UseSGPR ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
2355 auto MIB = BuildMI(*MBB, MI, DL, TII->get(Opc), TmpReg);
2356 if (FrameReg)
2357 MIB.addReg(FrameReg);
2358 else
2359 MIB.addImm(Offset);
2360
2361 return false;
2362 }
2363
2364 bool NeedSaveSCC =
2365 RS->isRegUsed(AMDGPU::SCC) && !MI->definesRegister(AMDGPU::SCC);
2366
2367 Register TmpSReg =
2368 UseSGPR ? TmpReg
2369 : RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
2370 MI, false, 0, !UseSGPR);
2371
2372 // TODO: for flat scratch another attempt can be made with a VGPR index
2373 // if no SGPRs can be scavenged.
2374 if ((!TmpSReg && !FrameReg) || (!TmpReg && !UseSGPR))
2375 report_fatal_error("Cannot scavenge register in FI elimination!");
2376
2377 if (!TmpSReg) {
2378 // Use frame register and restore it after.
2379 TmpSReg = FrameReg;
2380 FIOp.setReg(FrameReg);
2381 FIOp.setIsKill(false);
2382 }
2383
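    // When SCC is live here, stash it in bit 0 of the address instead of an
    // SGPR: Offset is even (see the assert below), so S_ADDC_U32 folds the
    // incoming carry into the low bit, S_BITCMP1_B32 copies that bit back
    // into SCC, and S_BITSET0_B32 clears it to leave the real address.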
2384 if (NeedSaveSCC) {
2385 assert(!(Offset & 0x1) && "Flat scratch offset must be aligned!");
2386 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADDC_U32), TmpSReg)
2387 .addReg(FrameReg)
2388 .addImm(Offset);
2389 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BITCMP1_B32))
2390 .addReg(TmpSReg)
2391 .addImm(0);
2392 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_BITSET0_B32), TmpSReg)
2393 .addImm(0)
2394 .addReg(TmpSReg);
2395 } else {
2396 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), TmpSReg)
2397 .addReg(FrameReg)
2398 .addImm(Offset);
2399 }
2400
2401 if (!UseSGPR)
2402 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
2403 .addReg(TmpSReg, RegState::Kill);
2404
2405 if (TmpSReg == FrameReg) {
2406 // Undo frame register modification.
2407 if (NeedSaveSCC && !MI->registerDefIsDead(AMDGPU::SCC)) {
2408 MachineBasicBlock::iterator I =
2409 BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_ADDC_U32),
2410 TmpSReg)
2411 .addReg(FrameReg)
2412 .addImm(-Offset);
2413 I = BuildMI(*MBB, std::next(I), DL, TII->get(AMDGPU::S_BITCMP1_B32))
2414 .addReg(TmpSReg)
2415 .addImm(0);
2416 BuildMI(*MBB, std::next(I), DL, TII->get(AMDGPU::S_BITSET0_B32),
2417 TmpSReg)
2418 .addImm(0)
2419 .addReg(TmpSReg);
2420 } else {
2421 BuildMI(*MBB, std::next(MI), DL, TII->get(AMDGPU::S_ADD_I32),
2422 FrameReg)
2423 .addReg(FrameReg)
2424 .addImm(-Offset);
2425 }
2426 }
2427
2428 return false;
2429 }
2430
2431 bool IsMUBUF = TII->isMUBUF(*MI);
2432
2433 if (!IsMUBUF && !MFI->isBottomOfStack()) {
2434 // Convert to a swizzled stack address by scaling by the wave size.
2435 // In an entry function/kernel the offset is already swizzled.
2436 bool IsSALU = isSGPRClass(TII->getOpRegClass(*MI, FIOperandNum));
2437 bool LiveSCC =
2438 RS->isRegUsed(AMDGPU::SCC) && !MI->definesRegister(AMDGPU::SCC);
2439 const TargetRegisterClass *RC = IsSALU && !LiveSCC
2440 ? &AMDGPU::SReg_32RegClass
2441 : &AMDGPU::VGPR_32RegClass;
2442 bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32 ||
2443 MI->getOpcode() == AMDGPU::V_MOV_B32_e64;
2444 Register ResultReg =
2445 IsCopy ? MI->getOperand(0).getReg()
2446 : RS->scavengeRegisterBackwards(*RC, MI, false, 0);
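    // The frame register holds a swizzled, wave-size-scaled byte offset, so
    // the lane-relative address is FrameReg >> log2(wavefront size), plus
    // the unswizzled object offset when one exists.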
2447
2448 int64_t Offset = FrameInfo.getObjectOffset(Index);
2449 if (Offset == 0) {
2450 unsigned OpCode = IsSALU && !LiveSCC ? AMDGPU::S_LSHR_B32
2451 : AMDGPU::V_LSHRREV_B32_e64;
2452 auto Shift = BuildMI(*MBB, MI, DL, TII->get(OpCode), ResultReg);
2453 if (OpCode == AMDGPU::V_LSHRREV_B32_e64)
2454 // For V_LSHRREV, the operands are reversed (the shift count goes
2455 // first).
2456 Shift.addImm(ST.getWavefrontSizeLog2()).addReg(FrameReg);
2457 else
2458 Shift.addReg(FrameReg).addImm(ST.getWavefrontSizeLog2());
2459 if (IsSALU && !LiveSCC)
2460 Shift.getInstr()->getOperand(3).setIsDead(); // Mark SCC as dead.
2461 if (IsSALU && LiveSCC) {
2462 Register NewDest = RS->scavengeRegisterBackwards(
2463 AMDGPU::SReg_32RegClass, Shift, false, 0);
2464 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
2465 NewDest)
2466 .addReg(ResultReg);
2467 ResultReg = NewDest;
2468 }
2469 } else {
2470 MachineInstrBuilder MIB;
2471 if (!IsSALU) {
2472 if ((MIB = TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)) !=
2473 nullptr) {
2474 // Reuse ResultReg in intermediate step.
2475 Register ScaledReg = ResultReg;
2476
2477 BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
2478 ScaledReg)
2479 .addImm(ST.getWavefrontSizeLog2())
2480 .addReg(FrameReg);
2481
2482 const bool IsVOP2 = MIB->getOpcode() == AMDGPU::V_ADD_U32_e32;
2483
2484 // TODO: Fold if use instruction is another add of a constant.
2485 if (IsVOP2 || AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) {
2486 // FIXME: This can fail
2487 MIB.addImm(Offset);
2488 MIB.addReg(ScaledReg, RegState::Kill);
2489 if (!IsVOP2)
2490 MIB.addImm(0); // clamp bit
2491 } else {
2492 assert(MIB->getOpcode() == AMDGPU::V_ADD_CO_U32_e64 &&
2493 "Need to reuse carry out register");
2494
2495 // Use scavenged unused carry out as offset register.
2496 Register ConstOffsetReg;
2497 if (!isWave32)
2498 ConstOffsetReg = getSubReg(MIB.getReg(1), AMDGPU::sub0);
2499 else
2500 ConstOffsetReg = MIB.getReg(1);
2501
2502 BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::S_MOV_B32), ConstOffsetReg)
2503 .addImm(Offset);
2504 MIB.addReg(ConstOffsetReg, RegState::Kill);
2505 MIB.addReg(ScaledReg, RegState::Kill);
2506 MIB.addImm(0); // clamp bit
2507 }
2508 }
2509 }
2510 if (!MIB || IsSALU) {
2511 // We have to produce a carry out, and there isn't a free SGPR pair
2512 // for it. We can keep the whole computation on the SALU to avoid
2513 // clobbering an additional register at the cost of an extra mov.
2514
2515 // We may have 1 free scratch SGPR even though a carry out is
2516 // unavailable. Only one additional mov is needed.
2517 Register TmpScaledReg = RS->scavengeRegisterBackwards(
2518 AMDGPU::SReg_32_XM0RegClass, MI, false, 0, false);
2519 Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : FrameReg;
2520
2521 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), ScaledReg)
2522 .addReg(FrameReg)
2523 .addImm(ST.getWavefrontSizeLog2());
2524 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg)
2525 .addReg(ScaledReg, RegState::Kill)
2526 .addImm(Offset);
2527 if (!IsSALU)
2528 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), ResultReg)
2529 .addReg(ScaledReg, RegState::Kill);
2530 else
2531 ResultReg = ScaledReg;
2532
2533 // If there were truly no free SGPRs, we need to undo everything.
2534 if (!TmpScaledReg.isValid()) {
2535 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_I32), ScaledReg)
2536 .addReg(ScaledReg, RegState::Kill)
2537 .addImm(-Offset);
2538 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHL_B32), ScaledReg)
2539 .addReg(FrameReg)
2540 .addImm(ST.getWavefrontSizeLog2());
2541 }
2542 }
2543 }
2544
2545 // Don't introduce an extra copy if we're just materializing in a mov.
2546 if (IsCopy) {
2547 MI->eraseFromParent();
2548 return true;
2549 }
2550 FIOp.ChangeToRegister(ResultReg, false, false, true);
2551 return false;
2552 }
2553
2554 if (IsMUBUF) {
2555 // Disable offen so we don't need a 0 vgpr base.
2556 assert(static_cast<int>(FIOperandNum) ==
2557 AMDGPU::getNamedOperandIdx(MI->getOpcode(),
2558 AMDGPU::OpName::vaddr));
2559
2560 auto &SOffset = *TII->getNamedOperand(*MI, AMDGPU::OpName::soffset);
2561 assert((SOffset.isImm() && SOffset.getImm() == 0));
2562
2563 if (FrameReg != AMDGPU::NoRegister)
2564 SOffset.ChangeToRegister(FrameReg, false);
2565
2566 int64_t Offset = FrameInfo.getObjectOffset(Index);
2567 int64_t OldImm
2568 = TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm();
2569 int64_t NewOffset = OldImm + Offset;
2570
2571 if (TII->isLegalMUBUFImmOffset(NewOffset) &&
2572 buildMUBUFOffsetLoadStore(ST, FrameInfo, MI, Index, NewOffset)) {
2573 MI->eraseFromParent();
2574 return true;
2575 }
2576 }
2577
2578 // If the offset is simply too big, don't convert to a scratch wave offset
2579 // relative index.
2580
2581 FIOp.ChangeToImmediate(Offset);
2582 if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) {
2583 Register TmpReg = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass,
2584 MI, false, 0);
2585 BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
2586 .addImm(Offset);
2587 FIOp.ChangeToRegister(TmpReg, false, false, true);
2588 }
2589 }
2590 }
2591 return false;
2592 }
2593
2594 StringRef SIRegisterInfo::getRegAsmName(MCRegister Reg) const {
2595 return AMDGPUInstPrinter::getRegisterName(Reg);
2596 }
2597
2598 unsigned AMDGPU::getRegBitWidth(const TargetRegisterClass &RC) {
2599 return getRegBitWidth(RC.getID());
2600 }
2601
2602 static const TargetRegisterClass *
2603 getAnyVGPRClassForBitWidth(unsigned BitWidth) {
2604 if (BitWidth == 64)
2605 return &AMDGPU::VReg_64RegClass;
2606 if (BitWidth == 96)
2607 return &AMDGPU::VReg_96RegClass;
2608 if (BitWidth == 128)
2609 return &AMDGPU::VReg_128RegClass;
2610 if (BitWidth == 160)
2611 return &AMDGPU::VReg_160RegClass;
2612 if (BitWidth == 192)
2613 return &AMDGPU::VReg_192RegClass;
2614 if (BitWidth == 224)
2615 return &AMDGPU::VReg_224RegClass;
2616 if (BitWidth == 256)
2617 return &AMDGPU::VReg_256RegClass;
2618 if (BitWidth == 288)
2619 return &AMDGPU::VReg_288RegClass;
2620 if (BitWidth == 320)
2621 return &AMDGPU::VReg_320RegClass;
2622 if (BitWidth == 352)
2623 return &AMDGPU::VReg_352RegClass;
2624 if (BitWidth == 384)
2625 return &AMDGPU::VReg_384RegClass;
2626 if (BitWidth == 512)
2627 return &AMDGPU::VReg_512RegClass;
2628 if (BitWidth == 1024)
2629 return &AMDGPU::VReg_1024RegClass;
2630
2631 return nullptr;
2632 }
2633
2634 static const TargetRegisterClass *
2635 getAlignedVGPRClassForBitWidth(unsigned BitWidth) {
2636 if (BitWidth == 64)
2637 return &AMDGPU::VReg_64_Align2RegClass;
2638 if (BitWidth == 96)
2639 return &AMDGPU::VReg_96_Align2RegClass;
2640 if (BitWidth == 128)
2641 return &AMDGPU::VReg_128_Align2RegClass;
2642 if (BitWidth == 160)
2643 return &AMDGPU::VReg_160_Align2RegClass;
2644 if (BitWidth == 192)
2645 return &AMDGPU::VReg_192_Align2RegClass;
2646 if (BitWidth == 224)
2647 return &AMDGPU::VReg_224_Align2RegClass;
2648 if (BitWidth == 256)
2649 return &AMDGPU::VReg_256_Align2RegClass;
2650 if (BitWidth == 288)
2651 return &AMDGPU::VReg_288_Align2RegClass;
2652 if (BitWidth == 320)
2653 return &AMDGPU::VReg_320_Align2RegClass;
2654 if (BitWidth == 352)
2655 return &AMDGPU::VReg_352_Align2RegClass;
2656 if (BitWidth == 384)
2657 return &AMDGPU::VReg_384_Align2RegClass;
2658 if (BitWidth == 512)
2659 return &AMDGPU::VReg_512_Align2RegClass;
2660 if (BitWidth == 1024)
2661 return &AMDGPU::VReg_1024_Align2RegClass;
2662
2663 return nullptr;
2664 }
2665
2666 const TargetRegisterClass *
2667 SIRegisterInfo::getVGPRClassForBitWidth(unsigned BitWidth) const {
2668 if (BitWidth == 1)
2669 return &AMDGPU::VReg_1RegClass;
2670 if (BitWidth == 16)
2671 return &AMDGPU::VGPR_16RegClass;
2672 if (BitWidth == 32)
2673 return &AMDGPU::VGPR_32RegClass;
2674 return ST.needsAlignedVGPRs() ? getAlignedVGPRClassForBitWidth(BitWidth)
2675 : getAnyVGPRClassForBitWidth(BitWidth);
2676 }
2677
2678 static const TargetRegisterClass *
2679 getAnyAGPRClassForBitWidth(unsigned BitWidth) {
2680 if (BitWidth == 64)
2681 return &AMDGPU::AReg_64RegClass;
2682 if (BitWidth == 96)
2683 return &AMDGPU::AReg_96RegClass;
2684 if (BitWidth == 128)
2685 return &AMDGPU::AReg_128RegClass;
2686 if (BitWidth == 160)
2687 return &AMDGPU::AReg_160RegClass;
2688 if (BitWidth == 192)
2689 return &AMDGPU::AReg_192RegClass;
2690 if (BitWidth == 224)
2691 return &AMDGPU::AReg_224RegClass;
2692 if (BitWidth == 256)
2693 return &AMDGPU::AReg_256RegClass;
2694 if (BitWidth == 288)
2695 return &AMDGPU::AReg_288RegClass;
2696 if (BitWidth == 320)
2697 return &AMDGPU::AReg_320RegClass;
2698 if (BitWidth == 352)
2699 return &AMDGPU::AReg_352RegClass;
2700 if (BitWidth == 384)
2701 return &AMDGPU::AReg_384RegClass;
2702 if (BitWidth == 512)
2703 return &AMDGPU::AReg_512RegClass;
2704 if (BitWidth == 1024)
2705 return &AMDGPU::AReg_1024RegClass;
2706
2707 return nullptr;
2708 }
2709
2710 static const TargetRegisterClass *
2711 getAlignedAGPRClassForBitWidth(unsigned BitWidth) {
2712 if (BitWidth == 64)
2713 return &AMDGPU::AReg_64_Align2RegClass;
2714 if (BitWidth == 96)
2715 return &AMDGPU::AReg_96_Align2RegClass;
2716 if (BitWidth == 128)
2717 return &AMDGPU::AReg_128_Align2RegClass;
2718 if (BitWidth == 160)
2719 return &AMDGPU::AReg_160_Align2RegClass;
2720 if (BitWidth == 192)
2721 return &AMDGPU::AReg_192_Align2RegClass;
2722 if (BitWidth == 224)
2723 return &AMDGPU::AReg_224_Align2RegClass;
2724 if (BitWidth == 256)
2725 return &AMDGPU::AReg_256_Align2RegClass;
2726 if (BitWidth == 288)
2727 return &AMDGPU::AReg_288_Align2RegClass;
2728 if (BitWidth == 320)
2729 return &AMDGPU::AReg_320_Align2RegClass;
2730 if (BitWidth == 352)
2731 return &AMDGPU::AReg_352_Align2RegClass;
2732 if (BitWidth == 384)
2733 return &AMDGPU::AReg_384_Align2RegClass;
2734 if (BitWidth == 512)
2735 return &AMDGPU::AReg_512_Align2RegClass;
2736 if (BitWidth == 1024)
2737 return &AMDGPU::AReg_1024_Align2RegClass;
2738
2739 return nullptr;
2740 }
2741
2742 const TargetRegisterClass *
2743 SIRegisterInfo::getAGPRClassForBitWidth(unsigned BitWidth) const {
2744 if (BitWidth == 16)
2745 return &AMDGPU::AGPR_LO16RegClass;
2746 if (BitWidth == 32)
2747 return &AMDGPU::AGPR_32RegClass;
2748 return ST.needsAlignedVGPRs() ? getAlignedAGPRClassForBitWidth(BitWidth)
2749 : getAnyAGPRClassForBitWidth(BitWidth);
2750 }
2751
2752 static const TargetRegisterClass *
2753 getAnyVectorSuperClassForBitWidth(unsigned BitWidth) {
2754 if (BitWidth == 64)
2755 return &AMDGPU::AV_64RegClass;
2756 if (BitWidth == 96)
2757 return &AMDGPU::AV_96RegClass;
2758 if (BitWidth == 128)
2759 return &AMDGPU::AV_128RegClass;
2760 if (BitWidth == 160)
2761 return &AMDGPU::AV_160RegClass;
2762 if (BitWidth == 192)
2763 return &AMDGPU::AV_192RegClass;
2764 if (BitWidth == 224)
2765 return &AMDGPU::AV_224RegClass;
2766 if (BitWidth == 256)
2767 return &AMDGPU::AV_256RegClass;
2768 if (BitWidth == 288)
2769 return &AMDGPU::AV_288RegClass;
2770 if (BitWidth == 320)
2771 return &AMDGPU::AV_320RegClass;
2772 if (BitWidth == 352)
2773 return &AMDGPU::AV_352RegClass;
2774 if (BitWidth == 384)
2775 return &AMDGPU::AV_384RegClass;
2776 if (BitWidth == 512)
2777 return &AMDGPU::AV_512RegClass;
2778 if (BitWidth == 1024)
2779 return &AMDGPU::AV_1024RegClass;
2780
2781 return nullptr;
2782 }
2783
2784 static const TargetRegisterClass *
2785 getAlignedVectorSuperClassForBitWidth(unsigned BitWidth) {
2786 if (BitWidth == 64)
2787 return &AMDGPU::AV_64_Align2RegClass;
2788 if (BitWidth == 96)
2789 return &AMDGPU::AV_96_Align2RegClass;
2790 if (BitWidth == 128)
2791 return &AMDGPU::AV_128_Align2RegClass;
2792 if (BitWidth == 160)
2793 return &AMDGPU::AV_160_Align2RegClass;
2794 if (BitWidth == 192)
2795 return &AMDGPU::AV_192_Align2RegClass;
2796 if (BitWidth == 224)
2797 return &AMDGPU::AV_224_Align2RegClass;
2798 if (BitWidth == 256)
2799 return &AMDGPU::AV_256_Align2RegClass;
2800 if (BitWidth == 288)
2801 return &AMDGPU::AV_288_Align2RegClass;
2802 if (BitWidth == 320)
2803 return &AMDGPU::AV_320_Align2RegClass;
2804 if (BitWidth == 352)
2805 return &AMDGPU::AV_352_Align2RegClass;
2806 if (BitWidth == 384)
2807 return &AMDGPU::AV_384_Align2RegClass;
2808 if (BitWidth == 512)
2809 return &AMDGPU::AV_512_Align2RegClass;
2810 if (BitWidth == 1024)
2811 return &AMDGPU::AV_1024_Align2RegClass;
2812
2813 return nullptr;
2814 }
2815
2816 const TargetRegisterClass *
2817 SIRegisterInfo::getVectorSuperClassForBitWidth(unsigned BitWidth) const {
2818 if (BitWidth == 32)
2819 return &AMDGPU::AV_32RegClass;
2820 return ST.needsAlignedVGPRs()
2821 ? getAlignedVectorSuperClassForBitWidth(BitWidth)
2822 : getAnyVectorSuperClassForBitWidth(BitWidth);
2823 }
2824
2825 const TargetRegisterClass *
2826 SIRegisterInfo::getSGPRClassForBitWidth(unsigned BitWidth) {
2827 if (BitWidth == 16)
2828 return &AMDGPU::SGPR_LO16RegClass;
2829 if (BitWidth == 32)
2830 return &AMDGPU::SReg_32RegClass;
2831 if (BitWidth == 64)
2832 return &AMDGPU::SReg_64RegClass;
2833 if (BitWidth == 96)
2834 return &AMDGPU::SGPR_96RegClass;
2835 if (BitWidth == 128)
2836 return &AMDGPU::SGPR_128RegClass;
2837 if (BitWidth == 160)
2838 return &AMDGPU::SGPR_160RegClass;
2839 if (BitWidth == 192)
2840 return &AMDGPU::SGPR_192RegClass;
2841 if (BitWidth == 224)
2842 return &AMDGPU::SGPR_224RegClass;
2843 if (BitWidth == 256)
2844 return &AMDGPU::SGPR_256RegClass;
2845 if (BitWidth == 288)
2846 return &AMDGPU::SGPR_288RegClass;
2847 if (BitWidth == 320)
2848 return &AMDGPU::SGPR_320RegClass;
2849 if (BitWidth == 352)
2850 return &AMDGPU::SGPR_352RegClass;
2851 if (BitWidth == 384)
2852 return &AMDGPU::SGPR_384RegClass;
2853 if (BitWidth == 512)
2854 return &AMDGPU::SGPR_512RegClass;
2855 if (BitWidth == 1024)
2856 return &AMDGPU::SGPR_1024RegClass;
2857
2858 return nullptr;
2859 }
2860
2861 bool SIRegisterInfo::isSGPRReg(const MachineRegisterInfo &MRI,
2862 Register Reg) const {
2863 const TargetRegisterClass *RC;
2864 if (Reg.isVirtual())
2865 RC = MRI.getRegClass(Reg);
2866 else
2867 RC = getPhysRegBaseClass(Reg);
2868 return RC ? isSGPRClass(RC) : false;
2869 }
2870
2871 const TargetRegisterClass *
2872 SIRegisterInfo::getEquivalentVGPRClass(const TargetRegisterClass *SRC) const {
2873 unsigned Size = getRegSizeInBits(*SRC);
2874 const TargetRegisterClass *VRC = getVGPRClassForBitWidth(Size);
2875 assert(VRC && "Invalid register class size");
2876 return VRC;
2877 }
2878
2879 const TargetRegisterClass *
2880 SIRegisterInfo::getEquivalentAGPRClass(const TargetRegisterClass *SRC) const {
2881 unsigned Size = getRegSizeInBits(*SRC);
2882 const TargetRegisterClass *ARC = getAGPRClassForBitWidth(Size);
2883 assert(ARC && "Invalid register class size");
2884 return ARC;
2885 }
2886
2887 const TargetRegisterClass *
2888 SIRegisterInfo::getEquivalentSGPRClass(const TargetRegisterClass *VRC) const {
2889 unsigned Size = getRegSizeInBits(*VRC);
2890 if (Size == 32)
2891 return &AMDGPU::SGPR_32RegClass;
2892 const TargetRegisterClass *SRC = getSGPRClassForBitWidth(Size);
2893 assert(SRC && "Invalid register class size");
2894 return SRC;
2895 }
2896
2897 const TargetRegisterClass *
2898 SIRegisterInfo::getCompatibleSubRegClass(const TargetRegisterClass *SuperRC,
2899 const TargetRegisterClass *SubRC,
2900 unsigned SubIdx) const {
2901 // Ensure this subregister index is aligned in the super register.
2902 const TargetRegisterClass *MatchRC =
2903 getMatchingSuperRegClass(SuperRC, SubRC, SubIdx);
2904 return MatchRC && MatchRC->hasSubClassEq(SuperRC) ? MatchRC : nullptr;
2905 }
2906
2907 bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const {
2908 if (OpType >= AMDGPU::OPERAND_REG_INLINE_AC_FIRST &&
2909 OpType <= AMDGPU::OPERAND_REG_INLINE_AC_LAST)
2910 return !ST.hasMFMAInlineLiteralBug();
2911
2912 return OpType >= AMDGPU::OPERAND_SRC_FIRST &&
2913 OpType <= AMDGPU::OPERAND_SRC_LAST;
2914 }
2915
2916 bool SIRegisterInfo::shouldRewriteCopySrc(
2917 const TargetRegisterClass *DefRC,
2918 unsigned DefSubReg,
2919 const TargetRegisterClass *SrcRC,
2920 unsigned SrcSubReg) const {
2921 // We want to prefer the smallest register class possible, so we don't want to
2922 // stop and rewrite on anything that looks like a subregister
2923 // extract. Operations mostly don't care about the super register class, so we
2924 // only want to stop on the most basic of copies between the same register
2925 // class.
2926 //
2927 // e.g. if we have something like
2928 // %0 = ...
2929 // %1 = ...
2930   // %2 = REG_SEQUENCE %0, sub0, %1, sub1
2931 // %3 = COPY %2, sub0
2932 //
2933 // We want to look through the COPY to find:
2934 // => %3 = COPY %0
2935
2936 // Plain copy.
2937 return getCommonSubClass(DefRC, SrcRC) != nullptr;
2938 }
2939
2940 bool SIRegisterInfo::opCanUseLiteralConstant(unsigned OpType) const {
2941 // TODO: 64-bit operands have extending behavior from 32-bit literal.
2942 return OpType >= AMDGPU::OPERAND_REG_IMM_FIRST &&
2943 OpType <= AMDGPU::OPERAND_REG_IMM_LAST;
2944 }
2945
2946 /// Returns the lowest register that is not used at any point in the
2947 /// function. If all registers are used, then this function will return
2948 /// AMDGPU::NoRegister. If \p ReserveHighestRegister = true, then return the
2949 /// highest unused register.
2950 MCRegister SIRegisterInfo::findUnusedRegister(
2951 const MachineRegisterInfo &MRI, const TargetRegisterClass *RC,
2952 const MachineFunction &MF, bool ReserveHighestRegister) const {
2953 if (ReserveHighestRegister) {
2954 for (MCRegister Reg : reverse(*RC))
2955 if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))
2956 return Reg;
2957 } else {
2958 for (MCRegister Reg : *RC)
2959 if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))
2960 return Reg;
2961 }
2962 return MCRegister();
2963 }
2964
2965 bool SIRegisterInfo::isUniformReg(const MachineRegisterInfo &MRI,
2966 const RegisterBankInfo &RBI,
2967 Register Reg) const {
2968 auto *RB = RBI.getRegBank(Reg, MRI, *MRI.getTargetRegisterInfo());
2969 if (!RB)
2970 return false;
2971
2972 return !RBI.isDivergentRegBank(RB);
2973 }

ArrayRef<int16_t>
SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC,
                                 unsigned EltSize) const {
  const unsigned RegBitWidth = AMDGPU::getRegBitWidth(*RC);
  assert(RegBitWidth >= 32 && RegBitWidth <= 1024);

  const unsigned RegDWORDs = RegBitWidth / 32;
  const unsigned EltDWORDs = EltSize / 4;
  assert(RegSplitParts.size() + 1 >= EltDWORDs);

  const std::vector<int16_t> &Parts = RegSplitParts[EltDWORDs - 1];
  const unsigned NumParts = RegDWORDs / EltDWORDs;

  return ArrayRef(Parts.data(), NumParts);
}
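
// Worked example of the arithmetic above: splitting a 128-bit register class
// into 64-bit elements gives RegDWORDs = 4 and EltDWORDs = 2, so the result
// is the first 4 / 2 = 2 entries of RegSplitParts[1], i.e. the subregister
// indices for the two 64-bit halves (sub0_sub1 and sub2_sub3). Caller-side
// sketch (TRI assumed in scope):
//
// \code
//   ArrayRef<int16_t> Parts =
//       TRI->getRegSplitParts(&AMDGPU::VReg_128RegClass, /*EltSize=*/8);
//   for (int16_t SubIdx : Parts) {
//     // Emit one copy/spill per 64-bit part, e.g. addReg(Reg, 0, SubIdx).
//   }
// \endcode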

const TargetRegisterClass *
SIRegisterInfo::getRegClassForReg(const MachineRegisterInfo &MRI,
                                  Register Reg) const {
  return Reg.isVirtual() ? MRI.getRegClass(Reg) : getPhysRegBaseClass(Reg);
}

const TargetRegisterClass *
SIRegisterInfo::getRegClassForOperandReg(const MachineRegisterInfo &MRI,
                                         const MachineOperand &MO) const {
  const TargetRegisterClass *SrcRC = getRegClassForReg(MRI, MO.getReg());
  return getSubRegisterClass(SrcRC, MO.getSubReg());
}

bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI,
                            Register Reg) const {
  const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg);
  // Registers without classes are unaddressable, SGPR-like registers.
  return RC && isVGPRClass(RC);
}

bool SIRegisterInfo::isAGPR(const MachineRegisterInfo &MRI,
                            Register Reg) const {
  const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg);

  // Registers without classes are unaddressable, SGPR-like registers.
  return RC && isAGPRClass(RC);
}

bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI,
                                    const TargetRegisterClass *SrcRC,
                                    unsigned SubReg,
                                    const TargetRegisterClass *DstRC,
                                    unsigned DstSubReg,
                                    const TargetRegisterClass *NewRC,
                                    LiveIntervals &LIS) const {
  unsigned SrcSize = getRegSizeInBits(*SrcRC);
  unsigned DstSize = getRegSizeInBits(*DstRC);
  unsigned NewSize = getRegSizeInBits(*NewRC);

  // Do not increase the size of registers beyond a dword: we would need to
  // allocate adjacent registers and constrain regalloc more than needed.

  // Always allow dword coalescing.
  if (SrcSize <= 32 || DstSize <= 32)
    return true;

  return NewSize <= DstSize || NewSize <= SrcSize;
}
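
// Worked example: coalescing two 64-bit tuples into a 128-bit tuple is
// rejected (SrcSize = DstSize = 64, and NewSize = 128 fails both
// comparisons), while folding a 32-bit copy into a 64-bit tuple is always
// accepted by the dword fast path above.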

unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
                                             MachineFunction &MF) const {
  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

  unsigned Occupancy = ST.getOccupancyWithLocalMemSize(MFI->getLDSSize(),
                                                       MF.getFunction());
  switch (RC->getID()) {
  default:
    return AMDGPUGenRegisterInfo::getRegPressureLimit(RC, MF);
  case AMDGPU::VGPR_32RegClassID:
    return std::min(ST.getMaxNumVGPRs(Occupancy), ST.getMaxNumVGPRs(MF));
  case AMDGPU::SGPR_32RegClassID:
  case AMDGPU::SGPR_LO16RegClassID:
    return std::min(ST.getMaxNumSGPRs(Occupancy, true), ST.getMaxNumSGPRs(MF));
  }
}
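
// Illustrative numbers only (the actual limits are subtarget-dependent): on a
// target with 256 VGPRs per SIMD whose LDS usage caps occupancy at 4 waves,
// the VGPR_32 pressure limit works out to roughly 256 / 4 = 64 registers,
// clamped further by any per-function limit from ST.getMaxNumVGPRs(MF).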

unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF,
                                                unsigned Idx) const {
  if (Idx == AMDGPU::RegisterPressureSets::VGPR_32 ||
      Idx == AMDGPU::RegisterPressureSets::AGPR_32)
    return getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
                               const_cast<MachineFunction &>(MF));

  if (Idx == AMDGPU::RegisterPressureSets::SReg_32)
    return getRegPressureLimit(&AMDGPU::SGPR_32RegClass,
                               const_cast<MachineFunction &>(MF));

  llvm_unreachable("Unexpected register pressure set!");
}

const int *SIRegisterInfo::getRegUnitPressureSets(unsigned RegUnit) const {
  static const int Empty[] = { -1 };

  if (RegPressureIgnoredUnits[RegUnit])
    return Empty;

  return AMDGPUGenRegisterInfo::getRegUnitPressureSets(RegUnit);
}

MCRegister SIRegisterInfo::getReturnAddressReg(const MachineFunction &MF) const {
  // Not a callee saved register.
  return AMDGPU::SGPR30_SGPR31;
}

const TargetRegisterClass *
SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size,
                                         const RegisterBank &RB) const {
  switch (RB.getID()) {
  case AMDGPU::VGPRRegBankID:
    return getVGPRClassForBitWidth(
        std::max(ST.useRealTrue16Insts() ? 16u : 32u, Size));
  case AMDGPU::VCCRegBankID:
    assert(Size == 1);
    return isWave32 ? &AMDGPU::SReg_32_XM0_XEXECRegClass
                    : &AMDGPU::SReg_64_XEXECRegClass;
  case AMDGPU::SGPRRegBankID:
    return getSGPRClassForBitWidth(std::max(32u, Size));
  case AMDGPU::AGPRRegBankID:
    return getAGPRClassForBitWidth(std::max(32u, Size));
  default:
    llvm_unreachable("unknown register bank");
  }
}
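
// Illustrative mapping (sketch only; TRI and a RegisterBankInfo RBI are
// assumed to be in scope): a 64-bit value on the SGPR bank resolves to a
// 64-bit scalar class, while a 1-bit value on the VCC bank resolves to the
// wave-size-dependent lane-mask class:
//
// \code
//   const RegisterBank &SGPRBank = RBI.getRegBank(AMDGPU::SGPRRegBankID);
//   const TargetRegisterClass *RC =
//       TRI->getRegClassForSizeOnBank(64, SGPRBank);
//   // RC is the 64-bit SGPR class (SReg_64, modulo subclass details).
// \endcode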

const TargetRegisterClass *
SIRegisterInfo::getConstrainedRegClassForOperand(
    const MachineOperand &MO, const MachineRegisterInfo &MRI) const {
  const RegClassOrRegBank &RCOrRB = MRI.getRegClassOrRegBank(MO.getReg());
  if (const RegisterBank *RB = RCOrRB.dyn_cast<const RegisterBank *>())
    return getRegClassForTypeOnBank(MRI.getType(MO.getReg()), *RB);

  if (const auto *RC = RCOrRB.dyn_cast<const TargetRegisterClass *>())
    return getAllocatableClass(RC);

  return nullptr;
}

MCRegister SIRegisterInfo::getVCC() const {
  return isWave32 ? AMDGPU::VCC_LO : AMDGPU::VCC;
}

MCRegister SIRegisterInfo::getExec() const {
  return isWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
}

const TargetRegisterClass *SIRegisterInfo::getVGPR64Class() const {
  // VGPR tuples have an alignment requirement on gfx90a variants.
  return ST.needsAlignedVGPRs() ? &AMDGPU::VReg_64_Align2RegClass
                                : &AMDGPU::VReg_64RegClass;
}

const TargetRegisterClass *
SIRegisterInfo::getRegClass(unsigned RCID) const {
  switch ((int)RCID) {
  case AMDGPU::SReg_1RegClassID:
    return getBoolRC();
  case AMDGPU::SReg_1_XEXECRegClassID:
    return isWave32 ? &AMDGPU::SReg_32_XM0_XEXECRegClass
                    : &AMDGPU::SReg_64_XEXECRegClass;
  case -1:
    return nullptr;
  default:
    return AMDGPUGenRegisterInfo::getRegClass(RCID);
  }
}
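
// SReg_1 models a 1-bit lane mask whose physical width depends on the wave
// size, so the class returned here differs per subtarget. Sketch (TRI assumed
// in scope):
//
// \code
//   // Wave32: a 32-bit scalar bool class; wave64: a 64-bit one.
//   const TargetRegisterClass *BoolRC =
//       TRI->getRegClass(AMDGPU::SReg_1RegClassID);
// \endcode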

// Find reaching register definition
MachineInstr *SIRegisterInfo::findReachingDef(Register Reg, unsigned SubReg,
                                              MachineInstr &Use,
                                              MachineRegisterInfo &MRI,
                                              LiveIntervals *LIS) const {
  auto &MDT = LIS->getAnalysis<MachineDominatorTree>();
  SlotIndex UseIdx = LIS->getInstructionIndex(Use);
  SlotIndex DefIdx;

  if (Reg.isVirtual()) {
    if (!LIS->hasInterval(Reg))
      return nullptr;
    LiveInterval &LI = LIS->getInterval(Reg);
    LaneBitmask SubLanes = SubReg ? getSubRegIndexLaneMask(SubReg)
                                  : MRI.getMaxLaneMaskForVReg(Reg);
    VNInfo *V = nullptr;
    if (LI.hasSubRanges()) {
      for (auto &S : LI.subranges()) {
        if ((S.LaneMask & SubLanes) == SubLanes) {
          V = S.getVNInfoAt(UseIdx);
          break;
        }
      }
    } else {
      V = LI.getVNInfoAt(UseIdx);
    }
    if (!V)
      return nullptr;
    DefIdx = V->def;
  } else {
    // Find last def.
    for (MCRegUnit Unit : regunits(Reg.asMCReg())) {
      LiveRange &LR = LIS->getRegUnit(Unit);
      if (VNInfo *V = LR.getVNInfoAt(UseIdx)) {
        if (!DefIdx.isValid() ||
            MDT.dominates(LIS->getInstructionFromIndex(DefIdx),
                          LIS->getInstructionFromIndex(V->def)))
          DefIdx = V->def;
      } else {
        return nullptr;
      }
    }
  }

  MachineInstr *Def = LIS->getInstructionFromIndex(DefIdx);

  if (!Def || !MDT.dominates(Def, &Use))
    return nullptr;

  assert(Def->modifiesRegister(Reg, this));

  return Def;
}
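
// Minimal usage sketch (hypothetical caller; Reg, SubReg, MI, MRI and LIS are
// assumed to be in scope): find the instruction whose def reaches a given
// use, e.g. to test whether that def can be folded or rematerialized:
//
// \code
//   if (MachineInstr *Def =
//           TRI->findReachingDef(Reg, SubReg, *MI, MRI, LIS)) {
//     // Def dominates *MI and defines the lanes read by (Reg, SubReg).
//   }
// \endcode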

MCPhysReg SIRegisterInfo::get32BitRegister(MCPhysReg Reg) const {
  assert(getRegSizeInBits(*getPhysRegBaseClass(Reg)) <= 32);

  for (const TargetRegisterClass &RC : { AMDGPU::VGPR_32RegClass,
                                         AMDGPU::SReg_32RegClass,
                                         AMDGPU::AGPR_32RegClass } ) {
    if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::lo16, &RC))
      return Super;
  }
  if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::hi16,
                                            &AMDGPU::VGPR_32RegClass)) {
    return Super;
  }

  return AMDGPU::NoRegister;
}
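
// For example, 16-bit physical registers map back to their containing 32-bit
// register: VGPR0_LO16 and VGPR0_HI16 both yield VGPR0, while SGPR5_LO16
// yields SGPR5 (there is no addressable SGPR hi16 half, hence the VGPR-only
// hi16 probe above).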

bool SIRegisterInfo::isProperlyAlignedRC(const TargetRegisterClass &RC) const {
  if (!ST.needsAlignedVGPRs())
    return true;

  if (isVGPRClass(&RC))
    return RC.hasSuperClassEq(getVGPRClassForBitWidth(getRegSizeInBits(RC)));
  if (isAGPRClass(&RC))
    return RC.hasSuperClassEq(getAGPRClassForBitWidth(getRegSizeInBits(RC)));
  if (isVectorSuperClass(&RC))
    return RC.hasSuperClassEq(
        getVectorSuperClassForBitWidth(getRegSizeInBits(RC)));

  return true;
}

const TargetRegisterClass *
SIRegisterInfo::getProperlyAlignedRC(const TargetRegisterClass *RC) const {
  if (!RC || !ST.needsAlignedVGPRs())
    return RC;

  unsigned Size = getRegSizeInBits(*RC);
  if (Size <= 32)
    return RC;

  if (isVGPRClass(RC))
    return getAlignedVGPRClassForBitWidth(Size);
  if (isAGPRClass(RC))
    return getAlignedAGPRClassForBitWidth(Size);
  if (isVectorSuperClass(RC))
    return getAlignedVectorSuperClassForBitWidth(Size);

  return RC;
}
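
// For instance, when ST.needsAlignedVGPRs() holds (gfx90a-like subtargets), a
// 64-bit VGPR class is tightened to its even-aligned variant. Sketch (TRI
// assumed in scope):
//
// \code
//   const TargetRegisterClass *Aligned =
//       TRI->getProperlyAlignedRC(&AMDGPU::VReg_64RegClass);
//   // Aligned is VReg_64_Align2, so tuples start at an even VGPR; on other
//   // subtargets the original class is returned unchanged.
// \endcode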

ArrayRef<MCPhysReg>
SIRegisterInfo::getAllSGPR128(const MachineFunction &MF) const {
  return ArrayRef(AMDGPU::SGPR_128RegClass.begin(), ST.getMaxNumSGPRs(MF) / 4);
}

ArrayRef<MCPhysReg>
SIRegisterInfo::getAllSGPR64(const MachineFunction &MF) const {
  return ArrayRef(AMDGPU::SGPR_64RegClass.begin(), ST.getMaxNumSGPRs(MF) / 2);
}

ArrayRef<MCPhysReg>
SIRegisterInfo::getAllSGPR32(const MachineFunction &MF) const {
  return ArrayRef(AMDGPU::SGPR_32RegClass.begin(), ST.getMaxNumSGPRs(MF));
}

unsigned
SIRegisterInfo::getSubRegAlignmentNumBits(const TargetRegisterClass *RC,
                                          unsigned SubReg) const {
  switch (RC->TSFlags & SIRCFlags::RegKindMask) {
  case SIRCFlags::HasSGPR:
    return std::min(128u, getSubRegIdxSize(SubReg));
  case SIRCFlags::HasAGPR:
  case SIRCFlags::HasVGPR:
  case SIRCFlags::HasVGPR | SIRCFlags::HasAGPR:
    return std::min(32u, getSubRegIdxSize(SubReg));
  default:
    break;
  }
  return 0;
}
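
// Worked example for getSubRegAlignmentNumBits: for an SGPR class, a 128-bit
// subregister index reports min(128, 128) = 128 bits of natural alignment,
// while for VGPR/AGPR classes any multi-dword index is capped at the 32-bit
// lane size, min(32, getSubRegIdxSize(SubReg)) = 32.
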