//===-- GCNNSAReassign.cpp - Reassign registers in NSA instructions -------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// \brief Try to reassign registers on GFX10+ from non-sequential to sequential
/// in NSA image instructions. Later SIShrinkInstructions pass will replace NSA
12 /// with sequential versions where possible.
13 ///
14 //===----------------------------------------------------------------------===//
15
16 #include "AMDGPU.h"
17 #include "GCNSubtarget.h"
18 #include "SIMachineFunctionInfo.h"
19 #include "llvm/ADT/Statistic.h"
20 #include "llvm/CodeGen/LiveIntervals.h"
21 #include "llvm/CodeGen/LiveRegMatrix.h"
22 #include "llvm/CodeGen/MachineFunctionPass.h"
23 #include "llvm/InitializePasses.h"
24
using namespace llvm;

// Debug type string used by LLVM_DEBUG and the -debug-only= filter.
#define DEBUG_TYPE "amdgpu-nsa-reassign"

// Pass statistics, reported with -stats.
STATISTIC(NumNSAInstructions,
          "Number of NSA instructions with non-sequential address found");
STATISTIC(NumNSAConverted,
          "Number of NSA instructions changed to sequential");
33
34 namespace {
35
36 class GCNNSAReassign : public MachineFunctionPass {
37 public:
38 static char ID;
39
GCNNSAReassign()40 GCNNSAReassign() : MachineFunctionPass(ID) {
41 initializeGCNNSAReassignPass(*PassRegistry::getPassRegistry());
42 }
43
44 bool runOnMachineFunction(MachineFunction &MF) override;
45
getPassName() const46 StringRef getPassName() const override { return "GCN NSA Reassign"; }
47
getAnalysisUsage(AnalysisUsage & AU) const48 void getAnalysisUsage(AnalysisUsage &AU) const override {
49 AU.addRequired<LiveIntervals>();
50 AU.addRequired<VirtRegMap>();
51 AU.addRequired<LiveRegMatrix>();
52 AU.setPreservesAll();
53 MachineFunctionPass::getAnalysisUsage(AU);
54 }
55
56 private:
57 typedef enum {
58 NOT_NSA, // Not an NSA instruction
59 FIXED, // NSA which we cannot modify
60 NON_CONTIGUOUS, // NSA with non-sequential address which we can try
61 // to optimize.
62 CONTIGUOUS // NSA with all sequential address registers
63 } NSA_Status;
64
65 const GCNSubtarget *ST;
66
67 const MachineRegisterInfo *MRI;
68
69 const SIRegisterInfo *TRI;
70
71 VirtRegMap *VRM;
72
73 LiveRegMatrix *LRM;
74
75 LiveIntervals *LIS;
76
77 unsigned MaxNumVGPRs;
78
79 const MCPhysReg *CSRegs;
80
81 NSA_Status CheckNSA(const MachineInstr &MI, bool Fast = false) const;
82
83 bool tryAssignRegisters(SmallVectorImpl<LiveInterval *> &Intervals,
84 unsigned StartReg) const;
85
86 bool canAssign(unsigned StartReg, unsigned NumRegs) const;
87
88 bool scavengeRegs(SmallVectorImpl<LiveInterval *> &Intervals) const;
89 };
90
91 } // End anonymous namespace.
92
// Register the pass and its analysis dependencies with the PassRegistry.
INITIALIZE_PASS_BEGIN(GCNNSAReassign, DEBUG_TYPE, "GCN NSA Reassign",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix)
INITIALIZE_PASS_END(GCNNSAReassign, DEBUG_TYPE, "GCN NSA Reassign",
                    false, false)


// The address of ID (not its value) uniquely identifies the pass.
char GCNNSAReassign::ID = 0;

// External handle used by the AMDGPU target to create/reference this pass.
char &llvm::GCNNSAReassignID = GCNNSAReassign::ID;
105
106 bool
tryAssignRegisters(SmallVectorImpl<LiveInterval * > & Intervals,unsigned StartReg) const107 GCNNSAReassign::tryAssignRegisters(SmallVectorImpl<LiveInterval *> &Intervals,
108 unsigned StartReg) const {
109 unsigned NumRegs = Intervals.size();
110
111 for (unsigned N = 0; N < NumRegs; ++N)
112 if (VRM->hasPhys(Intervals[N]->reg()))
113 LRM->unassign(*Intervals[N]);
114
115 for (unsigned N = 0; N < NumRegs; ++N)
116 if (LRM->checkInterference(*Intervals[N], MCRegister::from(StartReg + N)))
117 return false;
118
119 for (unsigned N = 0; N < NumRegs; ++N)
120 LRM->assign(*Intervals[N], MCRegister::from(StartReg + N));
121
122 return true;
123 }
124
canAssign(unsigned StartReg,unsigned NumRegs) const125 bool GCNNSAReassign::canAssign(unsigned StartReg, unsigned NumRegs) const {
126 for (unsigned N = 0; N < NumRegs; ++N) {
127 unsigned Reg = StartReg + N;
128 if (!MRI->isAllocatable(Reg))
129 return false;
130
131 for (unsigned I = 0; CSRegs[I]; ++I)
132 if (TRI->isSubRegisterEq(Reg, CSRegs[I]) &&
133 !LRM->isPhysRegUsed(CSRegs[I]))
134 return false;
135 }
136
137 return true;
138 }
139
140 bool
scavengeRegs(SmallVectorImpl<LiveInterval * > & Intervals) const141 GCNNSAReassign::scavengeRegs(SmallVectorImpl<LiveInterval *> &Intervals) const {
142 unsigned NumRegs = Intervals.size();
143
144 if (NumRegs > MaxNumVGPRs)
145 return false;
146 unsigned MaxReg = MaxNumVGPRs - NumRegs + AMDGPU::VGPR0;
147
148 for (unsigned Reg = AMDGPU::VGPR0; Reg <= MaxReg; ++Reg) {
149 if (!canAssign(Reg, NumRegs))
150 continue;
151
152 if (tryAssignRegisters(Intervals, Reg))
153 return true;
154 }
155
156 return false;
157 }
158
159 GCNNSAReassign::NSA_Status
CheckNSA(const MachineInstr & MI,bool Fast) const160 GCNNSAReassign::CheckNSA(const MachineInstr &MI, bool Fast) const {
161 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
162 if (!Info || Info->MIMGEncoding != AMDGPU::MIMGEncGfx10NSA)
163 return NSA_Status::NOT_NSA;
164
165 int VAddr0Idx =
166 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);
167
168 unsigned VgprBase = 0;
169 bool NSA = false;
170 for (unsigned I = 0; I < Info->VAddrDwords; ++I) {
171 const MachineOperand &Op = MI.getOperand(VAddr0Idx + I);
172 Register Reg = Op.getReg();
173 if (Reg.isPhysical() || !VRM->isAssignedReg(Reg))
174 return NSA_Status::FIXED;
175
176 Register PhysReg = VRM->getPhys(Reg);
177
178 if (!Fast) {
179 if (!PhysReg)
180 return NSA_Status::FIXED;
181
182 // Bail if address is not a VGPR32. That should be possible to extend the
183 // optimization to work with subregs of a wider register tuples, but the
184 // logic to find free registers will be much more complicated with much
185 // less chances for success. That seems reasonable to assume that in most
186 // cases a tuple is used because a vector variable contains different
187 // parts of an address and it is either already consequitive or cannot
188 // be reassigned if not. If needed it is better to rely on register
189 // coalescer to process such address tuples.
190 if (MRI->getRegClass(Reg) != &AMDGPU::VGPR_32RegClass || Op.getSubReg())
191 return NSA_Status::FIXED;
192
193 // InlineSpiller does not call LRM::assign() after an LI split leaving
194 // it in an inconsistent state, so we cannot call LRM::unassign().
195 // See llvm bug #48911.
196 // Skip reassign if a register has originated from such split.
197 // FIXME: Remove the workaround when bug #48911 is fixed.
198 if (VRM->getPreSplitReg(Reg))
199 return NSA_Status::FIXED;
200
201 const MachineInstr *Def = MRI->getUniqueVRegDef(Reg);
202
203 if (Def && Def->isCopy() && Def->getOperand(1).getReg() == PhysReg)
204 return NSA_Status::FIXED;
205
206 for (auto U : MRI->use_nodbg_operands(Reg)) {
207 if (U.isImplicit())
208 return NSA_Status::FIXED;
209 const MachineInstr *UseInst = U.getParent();
210 if (UseInst->isCopy() && UseInst->getOperand(0).getReg() == PhysReg)
211 return NSA_Status::FIXED;
212 }
213
214 if (!LIS->hasInterval(Reg))
215 return NSA_Status::FIXED;
216 }
217
218 if (I == 0)
219 VgprBase = PhysReg;
220 else if (VgprBase + I != PhysReg)
221 NSA = true;
222 }
223
224 return NSA ? NSA_Status::NON_CONTIGUOUS : NSA_Status::CONTIGUOUS;
225 }
226
runOnMachineFunction(MachineFunction & MF)227 bool GCNNSAReassign::runOnMachineFunction(MachineFunction &MF) {
228 ST = &MF.getSubtarget<GCNSubtarget>();
229 if (ST->getGeneration() < GCNSubtarget::GFX10)
230 return false;
231
232 MRI = &MF.getRegInfo();
233 TRI = ST->getRegisterInfo();
234 VRM = &getAnalysis<VirtRegMap>();
235 LRM = &getAnalysis<LiveRegMatrix>();
236 LIS = &getAnalysis<LiveIntervals>();
237
238 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
239 MaxNumVGPRs = ST->getMaxNumVGPRs(MF);
240 MaxNumVGPRs = std::min(ST->getMaxNumVGPRs(MFI->getOccupancy()), MaxNumVGPRs);
241 CSRegs = MRI->getCalleeSavedRegs();
242
243 using Candidate = std::pair<const MachineInstr*, bool>;
244 SmallVector<Candidate, 32> Candidates;
245 for (const MachineBasicBlock &MBB : MF) {
246 for (const MachineInstr &MI : MBB) {
247 switch (CheckNSA(MI)) {
248 default:
249 continue;
250 case NSA_Status::CONTIGUOUS:
251 Candidates.push_back(std::make_pair(&MI, true));
252 break;
253 case NSA_Status::NON_CONTIGUOUS:
254 Candidates.push_back(std::make_pair(&MI, false));
255 ++NumNSAInstructions;
256 break;
257 }
258 }
259 }
260
261 bool Changed = false;
262 for (auto &C : Candidates) {
263 if (C.second)
264 continue;
265
266 const MachineInstr *MI = C.first;
267 if (CheckNSA(*MI, true) == NSA_Status::CONTIGUOUS) {
268 // Already happen to be fixed.
269 C.second = true;
270 ++NumNSAConverted;
271 continue;
272 }
273
274 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI->getOpcode());
275 int VAddr0Idx =
276 AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::vaddr0);
277
278 SmallVector<LiveInterval *, 16> Intervals;
279 SmallVector<MCRegister, 16> OrigRegs;
280 SlotIndex MinInd, MaxInd;
281 for (unsigned I = 0; I < Info->VAddrDwords; ++I) {
282 const MachineOperand &Op = MI->getOperand(VAddr0Idx + I);
283 Register Reg = Op.getReg();
284 LiveInterval *LI = &LIS->getInterval(Reg);
285 if (llvm::is_contained(Intervals, LI)) {
286 // Same register used, unable to make sequential
287 Intervals.clear();
288 break;
289 }
290 Intervals.push_back(LI);
291 OrigRegs.push_back(VRM->getPhys(Reg));
292 if (LI->empty()) {
293 // The address input is undef, so it doesn't contribute to the relevant
294 // range. Seed a reasonable index range if required.
295 if (I == 0)
296 MinInd = MaxInd = LIS->getInstructionIndex(*MI);
297 continue;
298 }
299 MinInd = I != 0 ? std::min(MinInd, LI->beginIndex()) : LI->beginIndex();
300 MaxInd = I != 0 ? std::max(MaxInd, LI->endIndex()) : LI->endIndex();
301 }
302
303 if (Intervals.empty())
304 continue;
305
306 LLVM_DEBUG(dbgs() << "Attempting to reassign NSA: " << *MI
307 << "\tOriginal allocation:\t";
308 for (auto *LI
309 : Intervals) dbgs()
310 << " " << llvm::printReg((VRM->getPhys(LI->reg())), TRI);
311 dbgs() << '\n');
312
313 bool Success = scavengeRegs(Intervals);
314 if (!Success) {
315 LLVM_DEBUG(dbgs() << "\tCannot reallocate.\n");
316 if (VRM->hasPhys(Intervals.back()->reg())) // Did not change allocation.
317 continue;
318 } else {
319 // Check we did not make it worse for other instructions.
320 auto I = std::lower_bound(Candidates.begin(), &C, MinInd,
321 [this](const Candidate &C, SlotIndex I) {
322 return LIS->getInstructionIndex(*C.first) < I;
323 });
324 for (auto E = Candidates.end(); Success && I != E &&
325 LIS->getInstructionIndex(*I->first) < MaxInd; ++I) {
326 if (I->second && CheckNSA(*I->first, true) < NSA_Status::CONTIGUOUS) {
327 Success = false;
328 LLVM_DEBUG(dbgs() << "\tNSA conversion conflict with " << *I->first);
329 }
330 }
331 }
332
333 if (!Success) {
334 for (unsigned I = 0; I < Info->VAddrDwords; ++I)
335 if (VRM->hasPhys(Intervals[I]->reg()))
336 LRM->unassign(*Intervals[I]);
337
338 for (unsigned I = 0; I < Info->VAddrDwords; ++I)
339 LRM->assign(*Intervals[I], OrigRegs[I]);
340
341 continue;
342 }
343
344 C.second = true;
345 ++NumNSAConverted;
346 LLVM_DEBUG(
347 dbgs() << "\tNew allocation:\t\t ["
348 << llvm::printReg((VRM->getPhys(Intervals.front()->reg())), TRI)
349 << " : "
350 << llvm::printReg((VRM->getPhys(Intervals.back()->reg())), TRI)
351 << "]\n");
352 Changed = true;
353 }
354
355 return Changed;
356 }
357