1 //===-- GCNNSAReassign.cpp - Reassign registers in NSA instructions -------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// \brief Try to reassign registers on GFX10+ from non-sequential to sequential
11 /// in NSA image instructions. Later SIShrinkInstructions pass will replace NSA
12 /// with sequential versions where possible.
13 ///
14 //===----------------------------------------------------------------------===//
15 
16 #include "AMDGPU.h"
17 #include "GCNSubtarget.h"
18 #include "SIMachineFunctionInfo.h"
19 #include "SIRegisterInfo.h"
20 #include "llvm/ADT/Statistic.h"
21 #include "llvm/CodeGen/LiveIntervals.h"
22 #include "llvm/CodeGen/LiveRegMatrix.h"
23 #include "llvm/CodeGen/MachineFunctionPass.h"
24 #include "llvm/CodeGen/VirtRegMap.h"
25 #include "llvm/InitializePasses.h"
26 
27 using namespace llvm;
28 
29 #define DEBUG_TYPE "amdgpu-nsa-reassign"
30 
// Pass statistics, reported with -stats / LLVM_ENABLE_STATS builds.
STATISTIC(NumNSAInstructions,
          "Number of NSA instructions with non-sequential address found");
STATISTIC(NumNSAConverted,
          "Number of NSA instructions changed to sequential");
35 
36 namespace {
37 
38 class GCNNSAReassign : public MachineFunctionPass {
39 public:
40   static char ID;
41 
42   GCNNSAReassign() : MachineFunctionPass(ID) {
43     initializeGCNNSAReassignPass(*PassRegistry::getPassRegistry());
44   }
45 
46   bool runOnMachineFunction(MachineFunction &MF) override;
47 
48   StringRef getPassName() const override { return "GCN NSA Reassign"; }
49 
50   void getAnalysisUsage(AnalysisUsage &AU) const override {
51     AU.addRequired<LiveIntervals>();
52     AU.addRequired<VirtRegMap>();
53     AU.addRequired<LiveRegMatrix>();
54     AU.setPreservesAll();
55     MachineFunctionPass::getAnalysisUsage(AU);
56   }
57 
58 private:
59   typedef enum {
60     NOT_NSA,        // Not an NSA instruction
61     FIXED,          // NSA which we cannot modify
62     NON_CONTIGUOUS, // NSA with non-sequential address which we can try
63                     // to optimize.
64     CONTIGUOUS      // NSA with all sequential address registers
65   } NSA_Status;
66 
67   const GCNSubtarget *ST;
68 
69   const MachineRegisterInfo *MRI;
70 
71   const SIRegisterInfo *TRI;
72 
73   VirtRegMap *VRM;
74 
75   LiveRegMatrix *LRM;
76 
77   LiveIntervals *LIS;
78 
79   unsigned MaxNumVGPRs;
80 
81   const MCPhysReg *CSRegs;
82 
83   NSA_Status CheckNSA(const MachineInstr &MI, bool Fast = false) const;
84 
85   bool tryAssignRegisters(SmallVectorImpl<LiveInterval *> &Intervals,
86                           unsigned StartReg) const;
87 
88   bool canAssign(unsigned StartReg, unsigned NumRegs) const;
89 
90   bool scavengeRegs(SmallVectorImpl<LiveInterval *> &Intervals) const;
91 };
92 
93 } // End anonymous namespace.
94 
// Register the pass with the legacy pass manager and declare its analysis
// dependencies.
INITIALIZE_PASS_BEGIN(GCNNSAReassign, DEBUG_TYPE, "GCN NSA Reassign",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix)
INITIALIZE_PASS_END(GCNNSAReassign, DEBUG_TYPE, "GCN NSA Reassign",
                    false, false)


char GCNNSAReassign::ID = 0;

// Handle exported to the AMDGPU target machine for addPass-by-ID lookup.
char &llvm::GCNNSAReassignID = GCNNSAReassign::ID;
107 
108 bool
109 GCNNSAReassign::tryAssignRegisters(SmallVectorImpl<LiveInterval *> &Intervals,
110                                    unsigned StartReg) const {
111   unsigned NumRegs = Intervals.size();
112 
113   for (unsigned N = 0; N < NumRegs; ++N)
114     if (VRM->hasPhys(Intervals[N]->reg()))
115       LRM->unassign(*Intervals[N]);
116 
117   for (unsigned N = 0; N < NumRegs; ++N)
118     if (LRM->checkInterference(*Intervals[N], MCRegister::from(StartReg + N)))
119       return false;
120 
121   for (unsigned N = 0; N < NumRegs; ++N)
122     LRM->assign(*Intervals[N], MCRegister::from(StartReg + N));
123 
124   return true;
125 }
126 
127 bool GCNNSAReassign::canAssign(unsigned StartReg, unsigned NumRegs) const {
128   for (unsigned N = 0; N < NumRegs; ++N) {
129     unsigned Reg = StartReg + N;
130     if (!MRI->isAllocatable(Reg))
131       return false;
132 
133     for (unsigned I = 0; CSRegs[I]; ++I)
134       if (TRI->isSubRegisterEq(Reg, CSRegs[I]) &&
135           !LRM->isPhysRegUsed(CSRegs[I]))
136       return false;
137   }
138 
139   return true;
140 }
141 
142 bool
143 GCNNSAReassign::scavengeRegs(SmallVectorImpl<LiveInterval *> &Intervals) const {
144   unsigned NumRegs = Intervals.size();
145 
146   if (NumRegs > MaxNumVGPRs)
147     return false;
148   unsigned MaxReg = MaxNumVGPRs - NumRegs + AMDGPU::VGPR0;
149 
150   for (unsigned Reg = AMDGPU::VGPR0; Reg <= MaxReg; ++Reg) {
151     if (!canAssign(Reg, NumRegs))
152       continue;
153 
154     if (tryAssignRegisters(Intervals, Reg))
155       return true;
156   }
157 
158   return false;
159 }
160 
161 GCNNSAReassign::NSA_Status
162 GCNNSAReassign::CheckNSA(const MachineInstr &MI, bool Fast) const {
163   const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
164   if (!Info)
165     return NSA_Status::NOT_NSA;
166 
167   switch (Info->MIMGEncoding) {
168   case AMDGPU::MIMGEncGfx10NSA:
169   case AMDGPU::MIMGEncGfx11NSA:
170     break;
171   default:
172     return NSA_Status::NOT_NSA;
173   }
174 
175   int VAddr0Idx =
176     AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);
177 
178   unsigned VgprBase = 0;
179   bool NSA = false;
180   for (unsigned I = 0; I < Info->VAddrOperands; ++I) {
181     const MachineOperand &Op = MI.getOperand(VAddr0Idx + I);
182     Register Reg = Op.getReg();
183     if (Reg.isPhysical() || !VRM->isAssignedReg(Reg))
184       return NSA_Status::FIXED;
185 
186     Register PhysReg = VRM->getPhys(Reg);
187 
188     if (!Fast) {
189       if (!PhysReg)
190         return NSA_Status::FIXED;
191 
192       // TODO: address the below limitation to handle GFX11 BVH instructions
193       // Bail if address is not a VGPR32. That should be possible to extend the
194       // optimization to work with subregs of a wider register tuples, but the
195       // logic to find free registers will be much more complicated with much
196       // less chances for success. That seems reasonable to assume that in most
197       // cases a tuple is used because a vector variable contains different
198       // parts of an address and it is either already consecutive or cannot
199       // be reassigned if not. If needed it is better to rely on register
200       // coalescer to process such address tuples.
201       if (TRI->getRegSizeInBits(*MRI->getRegClass(Reg)) != 32 || Op.getSubReg())
202         return NSA_Status::FIXED;
203 
204       // InlineSpiller does not call LRM::assign() after an LI split leaving
205       // it in an inconsistent state, so we cannot call LRM::unassign().
206       // See llvm bug #48911.
207       // Skip reassign if a register has originated from such split.
208       // FIXME: Remove the workaround when bug #48911 is fixed.
209       if (VRM->getPreSplitReg(Reg))
210         return NSA_Status::FIXED;
211 
212       const MachineInstr *Def = MRI->getUniqueVRegDef(Reg);
213 
214       if (Def && Def->isCopy() && Def->getOperand(1).getReg() == PhysReg)
215         return NSA_Status::FIXED;
216 
217       for (auto U : MRI->use_nodbg_operands(Reg)) {
218         if (U.isImplicit())
219           return NSA_Status::FIXED;
220         const MachineInstr *UseInst = U.getParent();
221         if (UseInst->isCopy() && UseInst->getOperand(0).getReg() == PhysReg)
222           return NSA_Status::FIXED;
223       }
224 
225       if (!LIS->hasInterval(Reg))
226         return NSA_Status::FIXED;
227     }
228 
229     if (I == 0)
230       VgprBase = PhysReg;
231     else if (VgprBase + I != PhysReg)
232       NSA = true;
233   }
234 
235   return NSA ? NSA_Status::NON_CONTIGUOUS : NSA_Status::CONTIGUOUS;
236 }
237 
// Pass driver: collect NSA candidates, then try to reassign the address
// operands of each non-contiguous one to a sequential VGPR run, rolling back
// any attempt that fails or would break an already-contiguous candidate.
bool GCNNSAReassign::runOnMachineFunction(MachineFunction &MF) {
  ST = &MF.getSubtarget<GCNSubtarget>();
  // NSA encoding only exists on GFX10 and later.
  if (ST->getGeneration() < GCNSubtarget::GFX10)
    return false;

  MRI = &MF.getRegInfo();
  TRI = ST->getRegisterInfo();
  VRM = &getAnalysis<VirtRegMap>();
  LRM = &getAnalysis<LiveRegMatrix>();
  LIS = &getAnalysis<LiveIntervals>();

  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  // Cap the VGPR budget by both the function limit and the occupancy limit.
  MaxNumVGPRs = ST->getMaxNumVGPRs(MF);
  MaxNumVGPRs = std::min(ST->getMaxNumVGPRs(MFI->getOccupancy()), MaxNumVGPRs);
  CSRegs = MRI->getCalleeSavedRegs();

  // Collect candidates in instruction order. The bool marks instructions
  // whose addresses are already contiguous (true) vs. ones to fix (false).
  using Candidate = std::pair<const MachineInstr*, bool>;
  SmallVector<Candidate, 32> Candidates;
  for (const MachineBasicBlock &MBB : MF) {
    for (const MachineInstr &MI : MBB) {
      switch (CheckNSA(MI)) {
      default:
        continue;
      case NSA_Status::CONTIGUOUS:
        Candidates.push_back(std::pair(&MI, true));
        break;
      case NSA_Status::NON_CONTIGUOUS:
        Candidates.push_back(std::pair(&MI, false));
        ++NumNSAInstructions;
        break;
      }
    }
  }

  bool Changed = false;
  for (auto &C : Candidates) {
    if (C.second)
      continue;

    const MachineInstr *MI = C.first;
    // An earlier reassignment may already have made this one contiguous.
    if (CheckNSA(*MI, true) == NSA_Status::CONTIGUOUS) {
      // Already happen to be fixed.
      C.second = true;
      ++NumNSAConverted;
      continue;
    }

    const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI->getOpcode());
    int VAddr0Idx =
      AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::vaddr0);

    // Gather the live intervals of all address operands and remember the
    // original physical registers so a failed attempt can be rolled back.
    SmallVector<LiveInterval *, 16> Intervals;
    SmallVector<MCRegister, 16> OrigRegs;
    SlotIndex MinInd, MaxInd;
    for (unsigned I = 0; I < Info->VAddrOperands; ++I) {
      const MachineOperand &Op = MI->getOperand(VAddr0Idx + I);
      Register Reg = Op.getReg();
      LiveInterval *LI = &LIS->getInterval(Reg);
      if (llvm::is_contained(Intervals, LI)) {
        // Same register used, unable to make sequential
        Intervals.clear();
        break;
      }
      Intervals.push_back(LI);
      OrigRegs.push_back(VRM->getPhys(Reg));
      if (LI->empty()) {
        // The address input is undef, so it doesn't contribute to the relevant
        // range. Seed a reasonable index range if required.
        if (I == 0)
          MinInd = MaxInd = LIS->getInstructionIndex(*MI);
        continue;
      }
      // Track the combined slot-index span of all address intervals; used
      // below to find other candidates this reassignment could disturb.
      MinInd = I != 0 ? std::min(MinInd, LI->beginIndex()) : LI->beginIndex();
      MaxInd = I != 0 ? std::max(MaxInd, LI->endIndex()) : LI->endIndex();
    }

    if (Intervals.empty())
      continue;

    LLVM_DEBUG(dbgs() << "Attempting to reassign NSA: " << *MI
                      << "\tOriginal allocation:\t";
               for (auto *LI
                    : Intervals) dbgs()
               << " " << llvm::printReg((VRM->getPhys(LI->reg())), TRI);
               dbgs() << '\n');

    bool Success = scavengeRegs(Intervals);
    if (!Success) {
      LLVM_DEBUG(dbgs() << "\tCannot reallocate.\n");
      if (VRM->hasPhys(Intervals.back()->reg())) // Did not change allocation.
        continue;
    } else {
      // Check we did not make it worse for other instructions.
      // Candidates are in instruction order, so binary-search for the first
      // one whose slot index falls inside [MinInd, MaxInd).
      auto I = std::lower_bound(Candidates.begin(), &C, MinInd,
                                [this](const Candidate &C, SlotIndex I) {
                                  return LIS->getInstructionIndex(*C.first) < I;
                                });
      for (auto E = Candidates.end(); Success && I != E &&
              LIS->getInstructionIndex(*I->first) < MaxInd; ++I) {
        if (I->second && CheckNSA(*I->first, true) < NSA_Status::CONTIGUOUS) {
          Success = false;
          LLVM_DEBUG(dbgs() << "\tNSA conversion conflict with " << *I->first);
        }
      }
    }

    if (!Success) {
      // Roll back: unassign whatever was placed, then restore the original
      // physical registers recorded in OrigRegs.
      for (unsigned I = 0; I < Info->VAddrOperands; ++I)
        if (VRM->hasPhys(Intervals[I]->reg()))
          LRM->unassign(*Intervals[I]);

      for (unsigned I = 0; I < Info->VAddrOperands; ++I)
        LRM->assign(*Intervals[I], OrigRegs[I]);

      continue;
    }

    C.second = true;
    ++NumNSAConverted;
    LLVM_DEBUG(
        dbgs() << "\tNew allocation:\t\t ["
               << llvm::printReg((VRM->getPhys(Intervals.front()->reg())), TRI)
               << " : "
               << llvm::printReg((VRM->getPhys(Intervals.back()->reg())), TRI)
               << "]\n");
    Changed = true;
  }

  return Changed;
}
368