1 //===- AMDGPUInsertDelayAlu.cpp - Insert s_delay_alu instructions ---------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// Insert s_delay_alu instructions to avoid stalls on GFX11+.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPU.h"
15 #include "GCNSubtarget.h"
16 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
17 #include "SIInstrInfo.h"
18 #include "llvm/ADT/SetVector.h"
19 
20 using namespace llvm;
21 
22 #define DEBUG_TYPE "amdgpu-insert-delay-alu"
23 
24 namespace {
25 
26 class AMDGPUInsertDelayAlu : public MachineFunctionPass {
27 public:
28   static char ID;
29 
30   const SIInstrInfo *SII;
31   const TargetRegisterInfo *TRI;
32 
33   TargetSchedModel SchedModel;
34 
35   AMDGPUInsertDelayAlu() : MachineFunctionPass(ID) {}
36 
37   void getAnalysisUsage(AnalysisUsage &AU) const override {
38     AU.setPreservesCFG();
39     MachineFunctionPass::getAnalysisUsage(AU);
40   }
41 
42   // Return true if MI waits for all outstanding VALU instructions to complete.
43   static bool instructionWaitsForVALU(const MachineInstr &MI) {
44     // These instruction types wait for VA_VDST==0 before issuing.
45     const uint64_t VA_VDST_0 = SIInstrFlags::DS | SIInstrFlags::EXP |
46                                SIInstrFlags::FLAT | SIInstrFlags::MIMG |
47                                SIInstrFlags::MTBUF | SIInstrFlags::MUBUF;
48     if (MI.getDesc().TSFlags & VA_VDST_0)
49       return true;
50     if (MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B32 ||
51         MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B64)
52       return true;
53     if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
54         AMDGPU::DepCtr::decodeFieldVaVdst(MI.getOperand(0).getImm()) == 0)
55       return true;
56     return false;
57   }
58 
59   // Types of delay that can be encoded in an s_delay_alu instruction.
60   enum DelayType { VALU, TRANS, SALU, OTHER };
61 
62   // Get the delay type for an instruction with the specified TSFlags.
63   static DelayType getDelayType(uint64_t TSFlags) {
64     if (TSFlags & SIInstrFlags::TRANS)
65       return TRANS;
66     if (TSFlags & SIInstrFlags::VALU)
67       return VALU;
68     if (TSFlags & SIInstrFlags::SALU)
69       return SALU;
70     return OTHER;
71   }
72 
73   // Information about the last instruction(s) that wrote to a particular
74   // regunit. In straight-line code there will only be one such instruction, but
75   // when control flow converges we merge the delay information from each path
76   // to represent the union of the worst-case delays of each type.
77   struct DelayInfo {
78     // One larger than the maximum number of (non-TRANS) VALU instructions we
79     // can encode in an s_delay_alu instruction.
80     static constexpr unsigned VALU_MAX = 5;
81 
82     // One larger than the maximum number of TRANS instructions we can encode in
83     // an s_delay_alu instruction.
84     static constexpr unsigned TRANS_MAX = 4;
85 
86     // One larger than the maximum number of SALU cycles we can encode in an
87     // s_delay_alu instruction.
88     static constexpr unsigned SALU_CYCLES_MAX = 4;
89 
90     // If it was written by a (non-TRANS) VALU, remember how many clock cycles
91     // are left until it completes, and how many other (non-TRANS) VALU we have
92     // seen since it was issued.
93     uint8_t VALUCycles = 0;
94     uint8_t VALUNum = VALU_MAX;
95 
96     // If it was written by a TRANS, remember how many clock cycles are left
97     // until it completes, and how many other TRANS we have seen since it was
98     // issued.
99     uint8_t TRANSCycles = 0;
100     uint8_t TRANSNum = TRANS_MAX;
101     // Also remember how many other (non-TRANS) VALU we have seen since it was
102     // issued. When an instruction depends on both a prior TRANS and a prior
103     // non-TRANS VALU, this is used to decide whether to encode a wait for just
104     // one or both of them.
105     uint8_t TRANSNumVALU = VALU_MAX;
106 
107     // If it was written by an SALU, remember how many clock cycles are left
108     // until it completes.
109     uint8_t SALUCycles = 0;
110 
111     DelayInfo() = default;
112 
113     DelayInfo(DelayType Type, unsigned Cycles) {
114       switch (Type) {
115       default:
116         llvm_unreachable("unexpected type");
117       case VALU:
118         VALUCycles = Cycles;
119         VALUNum = 0;
120         break;
121       case TRANS:
122         TRANSCycles = Cycles;
123         TRANSNum = 0;
124         TRANSNumVALU = 0;
125         break;
126       case SALU:
127         // Guard against pseudo-instructions like SI_CALL which are marked as
128         // SALU but with a very high latency.
129         SALUCycles = std::min(Cycles, SALU_CYCLES_MAX);
130         break;
131       }
132     }
133 
134     bool operator==(const DelayInfo &RHS) const {
135       return VALUCycles == RHS.VALUCycles && VALUNum == RHS.VALUNum &&
136              TRANSCycles == RHS.TRANSCycles && TRANSNum == RHS.TRANSNum &&
137              TRANSNumVALU == RHS.TRANSNumVALU && SALUCycles == RHS.SALUCycles;
138     }
139 
140     bool operator!=(const DelayInfo &RHS) const { return !(*this == RHS); }
141 
142     // Merge another DelayInfo into this one, to represent the union of the
143     // worst-case delays of each type.
144     void merge(const DelayInfo &RHS) {
145       VALUCycles = std::max(VALUCycles, RHS.VALUCycles);
146       VALUNum = std::min(VALUNum, RHS.VALUNum);
147       TRANSCycles = std::max(TRANSCycles, RHS.TRANSCycles);
148       TRANSNum = std::min(TRANSNum, RHS.TRANSNum);
149       TRANSNumVALU = std::min(TRANSNumVALU, RHS.TRANSNumVALU);
150       SALUCycles = std::max(SALUCycles, RHS.SALUCycles);
151     }
152 
153     // Update this DelayInfo after issuing an instruction. IsVALU should be 1
154     // when issuing a (non-TRANS) VALU, else 0. IsTRANS should be 1 when issuing
155     // a TRANS, else 0. Cycles is the number of cycles it takes to issue the
156     // instruction.  Return true if there is no longer any useful delay info.
157     bool advance(DelayType Type, unsigned Cycles) {
158       bool Erase = true;
159 
160       VALUNum += (Type == VALU);
161       if (VALUNum >= VALU_MAX || VALUCycles <= Cycles) {
162         // Forget about the VALU instruction. It was too far back or has
163         // definitely completed by now.
164         VALUNum = VALU_MAX;
165         VALUCycles = 0;
166       } else {
167         VALUCycles -= Cycles;
168         Erase = false;
169       }
170 
171       TRANSNum += (Type == TRANS);
172       TRANSNumVALU += (Type == VALU);
173       if (TRANSNum >= TRANS_MAX || TRANSCycles <= Cycles) {
174         // Forget about any TRANS instruction. It was too far back or has
175         // definitely completed by now.
176         TRANSNum = TRANS_MAX;
177         TRANSNumVALU = VALU_MAX;
178         TRANSCycles = 0;
179       } else {
180         TRANSCycles -= Cycles;
181         Erase = false;
182       }
183 
184       if (SALUCycles <= Cycles) {
185         // Forget about any SALU instruction. It has definitely completed by
186         // now.
187         SALUCycles = 0;
188       } else {
189         SALUCycles -= Cycles;
190         Erase = false;
191       }
192 
193       return Erase;
194     }
195 
196 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
197     void dump() const {
198       if (VALUCycles)
199         dbgs() << " VALUCycles=" << (int)VALUCycles;
200       if (VALUNum < VALU_MAX)
201         dbgs() << " VALUNum=" << (int)VALUNum;
202       if (TRANSCycles)
203         dbgs() << " TRANSCycles=" << (int)TRANSCycles;
204       if (TRANSNum < TRANS_MAX)
205         dbgs() << " TRANSNum=" << (int)TRANSNum;
206       if (TRANSNumVALU < VALU_MAX)
207         dbgs() << " TRANSNumVALU=" << (int)TRANSNumVALU;
208       if (SALUCycles)
209         dbgs() << " SALUCycles=" << (int)SALUCycles;
210     }
211 #endif
212   };
213 
214   // A map from regunits to the delay info for that regunit.
215   struct DelayState : DenseMap<unsigned, DelayInfo> {
216     // Merge another DelayState into this one by merging the delay info for each
217     // regunit.
218     void merge(const DelayState &RHS) {
219       for (const auto &KV : RHS) {
220         iterator It;
221         bool Inserted;
222         std::tie(It, Inserted) = insert(KV);
223         if (!Inserted)
224           It->second.merge(KV.second);
225       }
226     }
227 
228     // Advance the delay info for each regunit, erasing any that are no longer
229     // useful.
230     void advance(DelayType Type, unsigned Cycles) {
231       iterator Next;
232       for (auto I = begin(), E = end(); I != E; I = Next) {
233         Next = std::next(I);
234         if (I->second.advance(Type, Cycles))
235           erase(I);
236       }
237     }
238 
239 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
240     void dump(const TargetRegisterInfo *TRI) const {
241       if (empty()) {
242         dbgs() << "    empty\n";
243         return;
244       }
245 
246       // Dump DelayInfo for each RegUnit in numerical order.
247       SmallVector<const_iterator, 8> Order;
248       Order.reserve(size());
249       for (const_iterator I = begin(), E = end(); I != E; ++I)
250         Order.push_back(I);
251       llvm::sort(Order, [](const const_iterator &A, const const_iterator &B) {
252         return A->first < B->first;
253       });
254       for (const_iterator I : Order) {
255         dbgs() << "    " << printRegUnit(I->first, TRI);
256         I->second.dump();
257         dbgs() << "\n";
258       }
259     }
260 #endif
261   };
262 
263   // The saved delay state at the end of each basic block.
264   DenseMap<MachineBasicBlock *, DelayState> BlockState;
265 
266   // Emit an s_delay_alu instruction if necessary before MI.
267   MachineInstr *emitDelayAlu(MachineInstr &MI, DelayInfo Delay,
268                              MachineInstr *LastDelayAlu) {
269     unsigned Imm = 0;
270 
271     // Wait for a TRANS instruction.
272     if (Delay.TRANSNum < DelayInfo::TRANS_MAX)
273       Imm |= 4 + Delay.TRANSNum;
274 
275     // Wait for a VALU instruction (if it's more recent than any TRANS
276     // instruction that we're also waiting for).
277     if (Delay.VALUNum < DelayInfo::VALU_MAX &&
278         Delay.VALUNum <= Delay.TRANSNumVALU) {
279       if (Imm & 0xf)
280         Imm |= Delay.VALUNum << 7;
281       else
282         Imm |= Delay.VALUNum;
283     }
284 
285     // Wait for an SALU instruction.
286     if (Delay.SALUCycles) {
287       assert(Delay.SALUCycles < DelayInfo::SALU_CYCLES_MAX);
288       if (Imm & 0x780) {
289         // We have already encoded a VALU and a TRANS delay. There's no room in
290         // the encoding for an SALU delay as well, so just drop it.
291       } else if (Imm & 0xf) {
292         Imm |= (Delay.SALUCycles + 8) << 7;
293       } else {
294         Imm |= Delay.SALUCycles + 8;
295       }
296     }
297 
298     // Don't emit the s_delay_alu instruction if there's nothing to wait for.
299     if (!Imm)
300       return LastDelayAlu;
301 
302     // If we only need to wait for one instruction, try encoding it in the last
303     // s_delay_alu that we emitted.
304     if (!(Imm & 0x780) && LastDelayAlu) {
305       unsigned Skip = 0;
306       for (auto I = MachineBasicBlock::instr_iterator(LastDelayAlu),
307                 E = MachineBasicBlock::instr_iterator(MI);
308            ++I != E;) {
309         if (!I->isBundle() && !I->isMetaInstruction())
310           ++Skip;
311       }
312       if (Skip < 6) {
313         MachineOperand &Op = LastDelayAlu->getOperand(0);
314         unsigned LastImm = Op.getImm();
315         assert((LastImm & ~0xf) == 0 &&
316                "Remembered an s_delay_alu with no room for another delay!");
317         LastImm |= Imm << 7 | Skip << 4;
318         Op.setImm(LastImm);
319         return nullptr;
320       }
321     }
322 
323     auto &MBB = *MI.getParent();
324     MachineInstr *DelayAlu =
325         BuildMI(MBB, MI, DebugLoc(), SII->get(AMDGPU::S_DELAY_ALU)).addImm(Imm);
326     // Remember the s_delay_alu for next time if there is still room in it to
327     // encode another delay.
328     return (Imm & 0x780) ? nullptr : DelayAlu;
329   }
330 
331   bool runOnMachineBasicBlock(MachineBasicBlock &MBB, bool Emit) {
332     DelayState State;
333     for (auto *Pred : MBB.predecessors())
334       State.merge(BlockState[Pred]);
335 
336     LLVM_DEBUG(dbgs() << "  State at start of " << printMBBReference(MBB)
337                       << "\n";
338                State.dump(TRI););
339 
340     bool Changed = false;
341     MachineInstr *LastDelayAlu = nullptr;
342 
343     // Iterate over the contents of bundles, but don't emit any instructions
344     // inside a bundle.
345     for (auto &MI : MBB.instrs()) {
346       if (MI.isBundle() || MI.isMetaInstruction())
347         continue;
348 
349       // Ignore some more instructions that do not generate any code.
350       switch (MI.getOpcode()) {
351       case AMDGPU::SI_RETURN_TO_EPILOG:
352         continue;
353       }
354 
355       DelayType Type = getDelayType(MI.getDesc().TSFlags);
356 
357       if (instructionWaitsForVALU(MI)) {
358         // Forget about all outstanding VALU delays.
359         // TODO: This is overkill since it also forgets about SALU delays.
360         State = DelayState();
361       } else if (Type != OTHER) {
362         DelayInfo Delay;
363         // TODO: Scan implicit uses too?
364         for (const auto &Op : MI.explicit_uses()) {
365           if (Op.isReg()) {
366             // One of the operands of the writelane is also the output operand.
367             // This creates the insertion of redundant delays. Hence, we have to
368             // ignore this operand.
369             if (MI.getOpcode() == AMDGPU::V_WRITELANE_B32 && Op.isTied())
370               continue;
371             for (MCRegUnit Unit : TRI->regunits(Op.getReg())) {
372               auto It = State.find(Unit);
373               if (It != State.end()) {
374                 Delay.merge(It->second);
375                 State.erase(Unit);
376               }
377             }
378           }
379         }
380         if (Emit && !MI.isBundledWithPred()) {
381           // TODO: For VALU->SALU delays should we use s_delay_alu or s_nop or
382           // just ignore them?
383           LastDelayAlu = emitDelayAlu(MI, Delay, LastDelayAlu);
384         }
385       }
386 
387       if (Type != OTHER) {
388         // TODO: Scan implicit defs too?
389         for (const auto &Op : MI.defs()) {
390           unsigned Latency = SchedModel.computeOperandLatency(
391               &MI, Op.getOperandNo(), nullptr, 0);
392           for (MCRegUnit Unit : TRI->regunits(Op.getReg()))
393             State[Unit] = DelayInfo(Type, Latency);
394         }
395       }
396 
397       // Advance by the number of cycles it takes to issue this instruction.
398       // TODO: Use a more advanced model that accounts for instructions that
399       // take multiple cycles to issue on a particular pipeline.
400       unsigned Cycles = SIInstrInfo::getNumWaitStates(MI);
401       // TODO: In wave64 mode, double the number of cycles for VALU and VMEM
402       // instructions on the assumption that they will usually have to be issued
403       // twice?
404       State.advance(Type, Cycles);
405 
406       LLVM_DEBUG(dbgs() << "  State after " << MI; State.dump(TRI););
407     }
408 
409     if (Emit) {
410       assert(State == BlockState[&MBB] &&
411              "Basic block state should not have changed on final pass!");
412     } else if (State != BlockState[&MBB]) {
413       BlockState[&MBB] = std::move(State);
414       Changed = true;
415     }
416     return Changed;
417   }
418 
419   bool runOnMachineFunction(MachineFunction &MF) override {
420     if (skipFunction(MF.getFunction()))
421       return false;
422 
423     LLVM_DEBUG(dbgs() << "AMDGPUInsertDelayAlu running on " << MF.getName()
424                       << "\n");
425 
426     const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
427     if (!ST.hasDelayAlu())
428       return false;
429 
430     SII = ST.getInstrInfo();
431     TRI = ST.getRegisterInfo();
432 
433     SchedModel.init(&ST);
434 
435     // Calculate the delay state for each basic block, iterating until we reach
436     // a fixed point.
437     SetVector<MachineBasicBlock *> WorkList;
438     for (auto &MBB : reverse(MF))
439       WorkList.insert(&MBB);
440     while (!WorkList.empty()) {
441       auto &MBB = *WorkList.pop_back_val();
442       bool Changed = runOnMachineBasicBlock(MBB, false);
443       if (Changed)
444         WorkList.insert(MBB.succ_begin(), MBB.succ_end());
445     }
446 
447     LLVM_DEBUG(dbgs() << "Final pass over all BBs\n");
448 
449     // Make one last pass over all basic blocks to emit s_delay_alu
450     // instructions.
451     bool Changed = false;
452     for (auto &MBB : MF)
453       Changed |= runOnMachineBasicBlock(MBB, true);
454     return Changed;
455   }
456 };
457 
458 } // namespace
459 
460 char AMDGPUInsertDelayAlu::ID = 0;
461 
462 char &llvm::AMDGPUInsertDelayAluID = AMDGPUInsertDelayAlu::ID;
463 
464 INITIALIZE_PASS(AMDGPUInsertDelayAlu, DEBUG_TYPE, "AMDGPU Insert Delay ALU",
465                 false, false)
466