//===------------------ AMDGPUCustomBehaviour.cpp ---------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
///
/// This file implements methods from the AMDGPUCustomBehaviour class.
///
//===----------------------------------------------------------------------===//

#include "AMDGPUCustomBehaviour.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "TargetInfo/AMDGPUTargetInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/WithColor.h"

namespace llvm {
namespace mca {

void AMDGPUInstrPostProcess::postProcessInstruction(
    std::unique_ptr<Instruction> &Inst, const MCInst &MCI) {
  switch (MCI.getOpcode()) {
  case AMDGPU::S_WAITCNT:
  case AMDGPU::S_WAITCNT_soft:
  case AMDGPU::S_WAITCNT_EXPCNT:
  case AMDGPU::S_WAITCNT_LGKMCNT:
  case AMDGPU::S_WAITCNT_VMCNT:
  case AMDGPU::S_WAITCNT_VSCNT:
  case AMDGPU::S_WAITCNT_VSCNT_soft:
  case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
  case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VSCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx6_gfx7:
  case AMDGPU::S_WAITCNT_vi:
    return processWaitCnt(Inst, MCI);
  }
}

// s_waitcnt instructions encode important information as immediate operands
// which are lost during the MCInst -> mca::Instruction lowering. Re-attach
// those operands here so that checkCustomHazard() can decode them later.
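// For example, on a gfx8 target, `s_waitcnt vmcnt(0) lgkmcnt(0)` lowers to a
// single S_WAITCNT_vi whose only operand is one packed immediate; without
// this step, computeWaitCnt() below would have nothing to decode.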
void AMDGPUInstrPostProcess::processWaitCnt(std::unique_ptr<Instruction> &Inst,
                                            const MCInst &MCI) {
  for (int Idx = 0, N = MCI.size(); Idx < N; Idx++) {
    MCAOperand Op;
    const MCOperand &MCOp = MCI.getOperand(Idx);
    if (MCOp.isReg()) {
      Op = MCAOperand::createReg(MCOp.getReg());
    } else if (MCOp.isImm()) {
      Op = MCAOperand::createImm(MCOp.getImm());
    }
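    // Any other operand kind is recorded as a default (invalid) MCAOperand;
    // its index is still set so that lookups by operand index keep working.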
    Op.setIndex(Idx);
    Inst->addOperand(Op);
  }
}

AMDGPUCustomBehaviour::AMDGPUCustomBehaviour(const MCSubtargetInfo &STI,
                                             const mca::SourceMgr &SrcMgr,
                                             const MCInstrInfo &MCII)
    : CustomBehaviour(STI, SrcMgr, MCII) {
  generateWaitCntInfo();
}

unsigned AMDGPUCustomBehaviour::checkCustomHazard(ArrayRef<InstRef> IssuedInst,
                                                  const InstRef &IR) {
  const Instruction &Inst = *IR.getInstruction();
  unsigned Opcode = Inst.getOpcode();

  // llvm-mca is generally run on fully compiled assembly, so we would not
  // normally see pseudo instructions here. However, there are plans to make
  // it possible to use mca within backend passes, so the pseudo versions of
  // s_waitcnt are kept in this switch statement.
  switch (Opcode) {
  default:
    return 0;
  // Everything from S_WAITCNT through S_WAITCNT_VSCNT_soft is a pseudo
  // instruction; the *_gfx10, *_gfx6_gfx7, and *_vi opcodes are the real
  // encodings.
  case AMDGPU::S_WAITCNT:
  case AMDGPU::S_WAITCNT_soft:
  case AMDGPU::S_WAITCNT_EXPCNT:
  case AMDGPU::S_WAITCNT_LGKMCNT:
  case AMDGPU::S_WAITCNT_VMCNT:
  case AMDGPU::S_WAITCNT_VSCNT:
  case AMDGPU::S_WAITCNT_VSCNT_soft:
  case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
  case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VSCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx6_gfx7:
  case AMDGPU::S_WAITCNT_vi:
    // s_endpgm also behaves as if there were an implicit s_waitcnt 0, but it
    // is not clear that modeling this in llvm-mca would be appropriate, given
    // how the simulation iterates over the input region repeatedly.
    return handleWaitCnt(IssuedInst, IR);
  }

  return 0;
}

unsigned AMDGPUCustomBehaviour::handleWaitCnt(ArrayRef<InstRef> IssuedInst,
                                              const InstRef &IR) {
  // All s_waitcnt variants are handled here except s_waitcnt_depctr, whose
  // semantics are not modeled.
  // Start each counter at its maximum value so that, by default, this wait
  // imposes no constraint; computeWaitCnt() overwrites the counters that the
  // instruction actually specifies.
  unsigned Vmcnt = 63;
  unsigned Expcnt = 7;
  unsigned Lgkmcnt = 31;
  unsigned Vscnt = 63;
  unsigned CurrVmcnt = 0;
  unsigned CurrExpcnt = 0;
  unsigned CurrLgkmcnt = 0;
  unsigned CurrVscnt = 0;
  unsigned CyclesToWaitVm = ~0U;
  unsigned CyclesToWaitExp = ~0U;
  unsigned CyclesToWaitLgkm = ~0U;
  unsigned CyclesToWaitVs = ~0U;
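  // ~0U acts as an "unconstrained" sentinel; each CyclesToWait* is lowered to
  // the smallest CyclesLeft seen among the in-flight instructions that
  // decrement the corresponding counter.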

  computeWaitCnt(IR, Vmcnt, Expcnt, Lgkmcnt, Vscnt);

  // We will now look at each of the currently executing instructions
  // to find out if this wait instruction still needs to wait.
  for (const InstRef &PrevIR : IssuedInst) {
    const Instruction &PrevInst = *PrevIR.getInstruction();
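    // The source index grows monotonically as mca re-runs the input region,
    // so wrap it to index into InstrWaitCntInfo, which is sized to a single
    // pass over the source.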
    const unsigned PrevInstIndex = PrevIR.getSourceIndex() % SrcMgr.size();
    const WaitCntInfo &PrevInstWaitInfo = InstrWaitCntInfo[PrevInstIndex];
    const int CyclesLeft = PrevInst.getCyclesLeft();
    assert(CyclesLeft != UNKNOWN_CYCLES &&
           "We should know how many cycles are left for this instruction");
    if (PrevInstWaitInfo.VmCnt) {
      CurrVmcnt++;
      if ((unsigned)CyclesLeft < CyclesToWaitVm)
        CyclesToWaitVm = CyclesLeft;
    }
    if (PrevInstWaitInfo.ExpCnt) {
      CurrExpcnt++;
      if ((unsigned)CyclesLeft < CyclesToWaitExp)
        CyclesToWaitExp = CyclesLeft;
    }
    if (PrevInstWaitInfo.LgkmCnt) {
      CurrLgkmcnt++;
      if ((unsigned)CyclesLeft < CyclesToWaitLgkm)
        CyclesToWaitLgkm = CyclesLeft;
    }
    if (PrevInstWaitInfo.VsCnt) {
      CurrVscnt++;
      if ((unsigned)CyclesLeft < CyclesToWaitVs)
        CyclesToWaitVs = CyclesLeft;
    }
  }

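  // If any counter is still above the threshold this s_waitcnt demands, stall
  // until the soonest-finishing instruction that decrements it completes,
  // then re-evaluate.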
  unsigned CyclesToWait = ~0U;
  if (CurrVmcnt > Vmcnt && CyclesToWaitVm < CyclesToWait)
    CyclesToWait = CyclesToWaitVm;
  if (CurrExpcnt > Expcnt && CyclesToWaitExp < CyclesToWait)
    CyclesToWait = CyclesToWaitExp;
  if (CurrLgkmcnt > Lgkmcnt && CyclesToWaitLgkm < CyclesToWait)
    CyclesToWait = CyclesToWaitLgkm;
  if (CurrVscnt > Vscnt && CyclesToWaitVs < CyclesToWait)
    CyclesToWait = CyclesToWaitVs;

  // We may underestimate how many cycles we need to wait, but this
  // isn't a big deal. Our return value is just how many cycles until
  // this function gets run again. So as long as we don't overestimate
  // the wait time, we'll still end up stalling at this instruction
  // for the correct number of cycles.

  if (CyclesToWait == ~0U)
    return 0;
  return CyclesToWait;
}

void AMDGPUCustomBehaviour::computeWaitCnt(const InstRef &IR, unsigned &Vmcnt,
                                           unsigned &Expcnt, unsigned &Lgkmcnt,
                                           unsigned &Vscnt) {
  AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(STI.getCPU());
  const Instruction &Inst = *IR.getInstruction();
  unsigned Opcode = Inst.getOpcode();

  switch (Opcode) {
  case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
  case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VSCNT_gfx10: {
    // getOperand() may return nullptr if an operand was not preserved during
    // lowering; processWaitCnt() records both operands for these opcodes, so
    // assert rather than trying to recover here.
    const MCAOperand *OpReg = Inst.getOperand(0);
    const MCAOperand *OpImm = Inst.getOperand(1);
    assert(OpReg && OpReg->isReg() && "First operand should be a register.");
    assert(OpImm && OpImm->isImm() && "Second operand should be an immediate.");
    if (OpReg->getReg() != AMDGPU::SGPR_NULL) {
      // Instruction is using a real register.
      // Since we can't know what value this register will have,
      // we can't compute what the value of this wait should be.
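      // (The fully static form uses the null register, e.g.
      // `s_waitcnt_vscnt null, 0`, and is modeled precisely.)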
      WithColor::warning() << "The register component of "
                           << MCII.getName(Opcode) << " will be completely "
                           << "ignored, so the wait may not be accurate.\n";
    }
    switch (Opcode) {
    // Nested switch so that the operand handling above does not have to be
    // repeated for each case.
    case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
      Expcnt = OpImm->getImm();
      break;
    case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
      Lgkmcnt = OpImm->getImm();
      break;
    case AMDGPU::S_WAITCNT_VMCNT_gfx10:
      Vmcnt = OpImm->getImm();
      break;
    case AMDGPU::S_WAITCNT_VSCNT_gfx10:
      Vscnt = OpImm->getImm();
      break;
    }
    return;
  }
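  // For the single-immediate forms below, the operand packs every counter
  // into one bit field whose layout varies by ISA version (on gfx9, for
  // instance, vmcnt occupies bits [3:0] and [15:14], expcnt bits [6:4], and
  // lgkmcnt bits [11:8]); AMDGPU::decodeWaitcnt() unpacks it accordingly.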
  case AMDGPU::S_WAITCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx6_gfx7:
  case AMDGPU::S_WAITCNT_vi: {
    unsigned WaitCnt = Inst.getOperand(0)->getImm();
    AMDGPU::decodeWaitcnt(IV, WaitCnt, Vmcnt, Expcnt, Lgkmcnt);
    return;
  }
  }
}

void AMDGPUCustomBehaviour::generateWaitCntInfo() {
  // The core logic of this function is taken from
  // SIInsertWaitcnts::updateEventWaitcntAfter(). In that pass, the
  // instructions being inspected are in MachineInstr form, whereas here we
  // only have access to the MCInst form. As a consequence, we can't use the
  // mayAccessVMEMThroughFlat(Inst) or mayAccessLDSThroughFlat(Inst) helpers
  // and instead conservatively assume that they would return true. This may
  // cause a few instructions to be tagged with an extra CNT; however, these
  // are instructions that interact with at least one CNT, so the extra tag
  // shouldn't cause issues in most scenarios.
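  // For example, `ds_read_b32` (DS + LGKM_CNT) is tagged with LgkmCnt,
  // `s_load_dword` (SMRD) with LgkmCnt, and a VMEM load such as
  // `buffer_load_dword` with VmCnt (stores get VsCnt on targets with
  // FeatureVscnt, VmCnt otherwise).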
  AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(STI.getCPU());
  InstrWaitCntInfo.resize(SrcMgr.size());

  for (const auto &EN : llvm::enumerate(SrcMgr.getInstructions())) {
    const std::unique_ptr<Instruction> &Inst = EN.value();
    unsigned Index = EN.index();
    unsigned Opcode = Inst->getOpcode();
    const MCInstrDesc &MCID = MCII.get(Opcode);
    if ((MCID.TSFlags & SIInstrFlags::DS) &&
        (MCID.TSFlags & SIInstrFlags::LGKM_CNT)) {
      InstrWaitCntInfo[Index].LgkmCnt = true;
      if (isAlwaysGDS(Opcode) || hasModifiersSet(Inst, AMDGPU::OpName::gds))
        InstrWaitCntInfo[Index].ExpCnt = true;
    } else if (MCID.TSFlags & SIInstrFlags::FLAT) {
      // We conservatively assume that mayAccessVMEMThroughFlat(Inst)
      // and mayAccessLDSThroughFlat(Inst) would both return true for this
      // instruction. We have to do this because those functions use
      // information about the memory operands that we don't have access to.
      InstrWaitCntInfo[Index].LgkmCnt = true;
      if (!STI.hasFeature(AMDGPU::FeatureVscnt))
        InstrWaitCntInfo[Index].VmCnt = true;
      else if (MCID.mayLoad() && !(MCID.TSFlags & SIInstrFlags::IsAtomicNoRet))
        InstrWaitCntInfo[Index].VmCnt = true;
      else
        InstrWaitCntInfo[Index].VsCnt = true;
    } else if (isVMEM(MCID) && !AMDGPU::getMUBUFIsBufferInv(Opcode)) {
      if (!STI.hasFeature(AMDGPU::FeatureVscnt))
        InstrWaitCntInfo[Index].VmCnt = true;
      else if ((MCID.mayLoad() &&
                !(MCID.TSFlags & SIInstrFlags::IsAtomicNoRet)) ||
               ((MCID.TSFlags & SIInstrFlags::MIMG) && !MCID.mayLoad() &&
                !MCID.mayStore()))
        InstrWaitCntInfo[Index].VmCnt = true;
      else if (MCID.mayStore())
        InstrWaitCntInfo[Index].VsCnt = true;

      // (IV.Major < 7) is meant to represent
      // GCNTarget.vmemWriteNeedsExpWaitcnt()
      // which is defined as
      // { return getGeneration() < SEA_ISLANDS; }
      if (IV.Major < 7 &&
          (MCID.mayStore() || (MCID.TSFlags & SIInstrFlags::IsAtomicRet)))
        InstrWaitCntInfo[Index].ExpCnt = true;
    } else if (MCID.TSFlags & SIInstrFlags::SMRD) {
      InstrWaitCntInfo[Index].LgkmCnt = true;
    } else if (MCID.TSFlags & SIInstrFlags::EXP) {
      InstrWaitCntInfo[Index].ExpCnt = true;
    } else {
      switch (Opcode) {
      case AMDGPU::S_SENDMSG:
      case AMDGPU::S_SENDMSGHALT:
      case AMDGPU::S_MEMTIME:
      case AMDGPU::S_MEMREALTIME:
        InstrWaitCntInfo[Index].LgkmCnt = true;
        break;
      }
    }
  }
}

// taken from SIInstrInfo::isVMEM()
bool AMDGPUCustomBehaviour::isVMEM(const MCInstrDesc &MCID) {
  return MCID.TSFlags & SIInstrFlags::MUBUF ||
         MCID.TSFlags & SIInstrFlags::MTBUF ||
         MCID.TSFlags & SIInstrFlags::MIMG;
}

// taken from SIInstrInfo::hasModifiersSet()
bool AMDGPUCustomBehaviour::hasModifiersSet(
    const std::unique_ptr<Instruction> &Inst, unsigned OpName) const {
  int Idx = AMDGPU::getNamedOperandIdx(Inst->getOpcode(), OpName);
  if (Idx == -1)
    return false;

  const MCAOperand *Op = Inst->getOperand(Idx);
  if (Op == nullptr || !Op->isImm() || !Op->getImm())
    return false;

  return true;
}

// taken from SIInstrInfo::isGWS()
bool AMDGPUCustomBehaviour::isGWS(uint16_t Opcode) const {
  const MCInstrDesc &MCID = MCII.get(Opcode);
  return MCID.TSFlags & SIInstrFlags::GWS;
}

// taken from SIInstrInfo::isAlwaysGDS()
bool AMDGPUCustomBehaviour::isAlwaysGDS(uint16_t Opcode) const {
  return Opcode == AMDGPU::DS_ORDERED_COUNT || isGWS(Opcode);
}

} // namespace mca
} // namespace llvm

using namespace llvm;
using namespace mca;

static CustomBehaviour *
createAMDGPUCustomBehaviour(const MCSubtargetInfo &STI,
                            const mca::SourceMgr &SrcMgr,
                            const MCInstrInfo &MCII) {
  return new AMDGPUCustomBehaviour(STI, SrcMgr, MCII);
}

static InstrPostProcess *
createAMDGPUInstrPostProcess(const MCSubtargetInfo &STI,
                             const MCInstrInfo &MCII) {
  return new AMDGPUInstrPostProcess(STI, MCII);
}

/// Extern "C" entry point that registers the AMDGPU custom behaviour and
/// instruction post-processing hooks with llvm-mca's target registry.
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTargetMCA() {
  TargetRegistry::RegisterCustomBehaviour(getTheR600Target(),
                                          createAMDGPUCustomBehaviour);
  TargetRegistry::RegisterInstrPostProcess(getTheR600Target(),
                                           createAMDGPUInstrPostProcess);

  TargetRegistry::RegisterCustomBehaviour(getTheGCNTarget(),
                                          createAMDGPUCustomBehaviour);
  TargetRegistry::RegisterInstrPostProcess(getTheGCNTarget(),
                                           createAMDGPUInstrPostProcess);
}
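
// With these hooks registered, a typical invocation that exercises them might
// look like the following (a sketch; the exact flags and input file are
// illustrative):
//   llvm-mca -mtriple=amdgcn -mcpu=gfx1010 kernel.s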