//===----------------- AMDGPUCustomBehaviour.cpp ---------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
///
/// This file implements methods from the AMDGPUCustomBehaviour class.
///
//===----------------------------------------------------------------------===//

#include "AMDGPUCustomBehaviour.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "TargetInfo/AMDGPUTargetInfo.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/WithColor.h"

namespace llvm {
namespace mca {

void AMDGPUInstrPostProcess::postProcessInstruction(
    std::unique_ptr<Instruction> &Inst, const MCInst &MCI) {
  switch (MCI.getOpcode()) {
  case AMDGPU::S_WAITCNT:
  case AMDGPU::S_WAITCNT_EXPCNT:
  case AMDGPU::S_WAITCNT_LGKMCNT:
  case AMDGPU::S_WAITCNT_VMCNT:
  case AMDGPU::S_WAITCNT_VSCNT:
  case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
  case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VSCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx6_gfx7:
  case AMDGPU::S_WAITCNT_vi:
    return processWaitCnt(Inst, MCI);
  }
}

// s_waitcnt instructions encode important information in their immediate
// operands, which would otherwise be lost during the MCInst ->
// mca::Instruction lowering, so we preserve the operands here.
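// For example, the immediate of "s_waitcnt vmcnt(0) lgkmcnt(0)" packs every
// counter field; keeping the operands on the mca::Instruction lets
// computeWaitCnt() decode those fields when the hazard is evaluated.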
void AMDGPUInstrPostProcess::processWaitCnt(std::unique_ptr<Instruction> &Inst,
                                            const MCInst &MCI) {
  for (int Idx = 0, N = MCI.size(); Idx < N; Idx++) {
    MCAOperand Op;
    const MCOperand &MCOp = MCI.getOperand(Idx);
    if (MCOp.isReg()) {
      Op = MCAOperand::createReg(MCOp.getReg());
    } else if (MCOp.isImm()) {
      Op = MCAOperand::createImm(MCOp.getImm());
    }
    Op.setIndex(Idx);
    Inst->addOperand(Op);
  }
}

AMDGPUCustomBehaviour::AMDGPUCustomBehaviour(const MCSubtargetInfo &STI,
                                             const mca::SourceMgr &SrcMgr,
                                             const MCInstrInfo &MCII)
    : CustomBehaviour(STI, SrcMgr, MCII) {
  generateWaitCntInfo();
}

unsigned AMDGPUCustomBehaviour::checkCustomHazard(ArrayRef<InstRef> IssuedInst,
                                                  const InstRef &IR) {
  const Instruction &Inst = *IR.getInstruction();
  unsigned Opcode = Inst.getOpcode();

  // llvm-mca is generally run on fully compiled assembly, so we would not
  // expect to see any pseudo instructions here. However, there are plans to
  // make it possible to use mca within backend passes, so the pseudo versions
  // of s_waitcnt are kept in this switch statement.
  switch (Opcode) {
  default:
    return 0;
  case AMDGPU::S_WAITCNT: // This opcode and the following four
  case AMDGPU::S_WAITCNT_EXPCNT:
  case AMDGPU::S_WAITCNT_LGKMCNT:
  case AMDGPU::S_WAITCNT_VMCNT:
  case AMDGPU::S_WAITCNT_VSCNT: // are the pseudo variants.
  case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
  case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VSCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx6_gfx7:
  case AMDGPU::S_WAITCNT_vi:
    // s_endpgm also behaves as if there is an implicit s_waitcnt 0, but it is
    // unclear whether modelling that would be appropriate in llvm-mca, given
    // how the simulated region is replayed across iterations.
    return handleWaitCnt(IssuedInst, IR);
  }

  return 0;
}

unsigned AMDGPUCustomBehaviour::handleWaitCnt(ArrayRef<InstRef> IssuedInst,
                                              const InstRef &IR) {
  // All s_waitcnt variants are currently handled except s_waitcnt_depctr,
  // which is not modelled here.
  // Start each counter at its maximum value; counters that the s_waitcnt
  // does not mention will then not force a stall below.
  unsigned Vmcnt = 63;
  unsigned Expcnt = 7;
  unsigned Lgkmcnt = 31;
  unsigned Vscnt = 63;
  unsigned CurrVmcnt = 0;
  unsigned CurrExpcnt = 0;
  unsigned CurrLgkmcnt = 0;
  unsigned CurrVscnt = 0;
  unsigned CyclesToWaitVm = ~0U;
  unsigned CyclesToWaitExp = ~0U;
  unsigned CyclesToWaitLgkm = ~0U;
  unsigned CyclesToWaitVs = ~0U;

  computeWaitCnt(IR, Vmcnt, Expcnt, Lgkmcnt, Vscnt);

  // We will now look at each of the currently executing instructions
  // to find out if this wait instruction still needs to wait.
  for (const InstRef &PrevIR : IssuedInst) {
    const Instruction &PrevInst = *PrevIR.getInstruction();
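    // llvm-mca replays the input region for several iterations, so the source
    // index keeps growing; the modulo maps it back to the entry recorded for
    // this instruction in generateWaitCntInfo().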
    const unsigned PrevInstIndex = PrevIR.getSourceIndex() % SrcMgr.size();
    const WaitCntInfo &PrevInstWaitInfo = InstrWaitCntInfo[PrevInstIndex];
    const int CyclesLeft = PrevInst.getCyclesLeft();
    assert(CyclesLeft != UNKNOWN_CYCLES &&
           "We should know how many cycles are left for this instruction");
    if (PrevInstWaitInfo.VmCnt) {
      CurrVmcnt++;
      if ((unsigned)CyclesLeft < CyclesToWaitVm)
        CyclesToWaitVm = CyclesLeft;
    }
    if (PrevInstWaitInfo.ExpCnt) {
      CurrExpcnt++;
      if ((unsigned)CyclesLeft < CyclesToWaitExp)
        CyclesToWaitExp = CyclesLeft;
    }
    if (PrevInstWaitInfo.LgkmCnt) {
      CurrLgkmcnt++;
      if ((unsigned)CyclesLeft < CyclesToWaitLgkm)
        CyclesToWaitLgkm = CyclesLeft;
    }
    if (PrevInstWaitInfo.VsCnt) {
      CurrVscnt++;
      if ((unsigned)CyclesLeft < CyclesToWaitVs)
        CyclesToWaitVs = CyclesLeft;
    }
  }

  unsigned CyclesToWait = ~0U;
  if (CurrVmcnt > Vmcnt && CyclesToWaitVm < CyclesToWait)
    CyclesToWait = CyclesToWaitVm;
  if (CurrExpcnt > Expcnt && CyclesToWaitExp < CyclesToWait)
    CyclesToWait = CyclesToWaitExp;
  if (CurrLgkmcnt > Lgkmcnt && CyclesToWaitLgkm < CyclesToWait)
    CyclesToWait = CyclesToWaitLgkm;
  if (CurrVscnt > Vscnt && CyclesToWaitVs < CyclesToWait)
    CyclesToWait = CyclesToWaitVs;

  // We may underestimate how many cycles we need to wait, but this
  // isn't a big deal. Our return value is just how many cycles until
  // this function gets run again. So as long as we don't overestimate
  // the wait time, we'll still end up stalling at this instruction
  // for the correct number of cycles.
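  //
  // For example, if vmcnt(0) is required and two VMEM loads are in flight with
  // 3 and 7 cycles left, we return 3; when this hook runs again after those
  // cycles, it stalls again for the cycles remaining on the second load.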

  if (CyclesToWait == ~0U)
    return 0;
  return CyclesToWait;
}

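// Decode the counters requested by the s_waitcnt instruction IR into
// Vmcnt/Expcnt/Lgkmcnt/Vscnt. Counters that the instruction does not specify
// are left at the caller-provided (maximum) values.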
void AMDGPUCustomBehaviour::computeWaitCnt(const InstRef &IR, unsigned &Vmcnt,
                                           unsigned &Expcnt, unsigned &Lgkmcnt,
                                           unsigned &Vscnt) {
  AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(STI.getCPU());
  const Instruction &Inst = *IR.getInstruction();
  unsigned Opcode = Inst.getOpcode();

  switch (Opcode) {
  case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
  case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VMCNT_gfx10:
  case AMDGPU::S_WAITCNT_VSCNT_gfx10: {
    // getOperand() may return nullptr; we assert on the expected operand kinds
    // below since it is unclear how a missing operand should be handled.
    const MCAOperand *OpReg = Inst.getOperand(0);
    const MCAOperand *OpImm = Inst.getOperand(1);
    assert(OpReg && OpReg->isReg() && "First operand should be a register.");
    assert(OpImm && OpImm->isImm() && "Second operand should be an immediate.");
    if (OpReg->getReg() != AMDGPU::SGPR_NULL) {
      // Instruction is using a real register.
      // Since we can't know what value this register will have,
      // we can't compute what the value of this wait should be.
      WithColor::warning() << "The register component of "
                           << MCII.getName(Opcode) << " will be completely "
                           << "ignored, so the wait may not be accurate.\n";
    }
    switch (Opcode) {
    // This nested switch avoids repeating the operand handling above for each
    // case, at the cost of dispatching on the opcode twice.
    case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
      Expcnt = OpImm->getImm();
      break;
    case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
      Lgkmcnt = OpImm->getImm();
      break;
    case AMDGPU::S_WAITCNT_VMCNT_gfx10:
      Vmcnt = OpImm->getImm();
      break;
    case AMDGPU::S_WAITCNT_VSCNT_gfx10:
      Vscnt = OpImm->getImm();
      break;
    }
    return;
  }
  case AMDGPU::S_WAITCNT_gfx10:
  case AMDGPU::S_WAITCNT_gfx6_gfx7:
  case AMDGPU::S_WAITCNT_vi:
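    // decodeWaitcnt() unpacks the packed immediate (e.g. the single operand of
    // "s_waitcnt vmcnt(0) lgkmcnt(0)") into its individual counter fields
    // according to this subtarget's ISA version.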
    unsigned WaitCnt = Inst.getOperand(0)->getImm();
    AMDGPU::decodeWaitcnt(IV, WaitCnt, Vmcnt, Expcnt, Lgkmcnt);
    return;
  }
}

void AMDGPUCustomBehaviour::generateWaitCntInfo() {
  // The core logic of this function is taken from
  // SIInsertWaitcnts::updateEventWaitcntAfter(). In that pass the instructions
  // being inspected are in MachineInstr format, whereas here we only have
  // access to the MCInst format. As a consequence, we cannot use the
  // mayAccessVMEMThroughFlat(Inst) or mayAccessLDSThroughFlat(Inst) functions
  // and instead conservatively assume that they would return true. This may
  // cause a few instructions to be tagged with an extra CNT; however, those
  // instructions already interact with at least one CNT, so the extra CNT
  // should not cause issues in most scenarios.
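  //
  // For example, under this assumption a FLAT load is tagged with both LgkmCnt
  // and VmCnt below, even if it could be proven to access only VMEM.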
  AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(STI.getCPU());
  InstrWaitCntInfo.resize(SrcMgr.size());

  int Index = 0;
  for (auto I = SrcMgr.begin(), E = SrcMgr.end(); I != E; ++I, ++Index) {
    const std::unique_ptr<Instruction> &Inst = *I;
    unsigned Opcode = Inst->getOpcode();
    const MCInstrDesc &MCID = MCII.get(Opcode);
    if ((MCID.TSFlags & SIInstrFlags::DS) &&
        (MCID.TSFlags & SIInstrFlags::LGKM_CNT)) {
      InstrWaitCntInfo[Index].LgkmCnt = true;
      if (isAlwaysGDS(Opcode) || hasModifiersSet(Inst, AMDGPU::OpName::gds))
        InstrWaitCntInfo[Index].ExpCnt = true;
    } else if (MCID.TSFlags & SIInstrFlags::FLAT) {
      // We conservatively assume that mayAccessVMEMThroughFlat(Inst)
      // and mayAccessLDSThroughFlat(Inst) would both return true for this
      // instruction. We have to do this because those functions use
      // information about the memory operands that we don't have access to.
      InstrWaitCntInfo[Index].LgkmCnt = true;
      if (!STI.hasFeature(AMDGPU::FeatureVscnt))
        InstrWaitCntInfo[Index].VmCnt = true;
      else if (MCID.mayLoad() && !(MCID.TSFlags & SIInstrFlags::IsAtomicNoRet))
        InstrWaitCntInfo[Index].VmCnt = true;
      else
        InstrWaitCntInfo[Index].VsCnt = true;
    } else if (isVMEM(MCID) && !AMDGPU::getMUBUFIsBufferInv(Opcode)) {
      if (!STI.hasFeature(AMDGPU::FeatureVscnt))
        InstrWaitCntInfo[Index].VmCnt = true;
      else if ((MCID.mayLoad() &&
                !(MCID.TSFlags & SIInstrFlags::IsAtomicNoRet)) ||
               ((MCID.TSFlags & SIInstrFlags::MIMG) && !MCID.mayLoad() &&
                !MCID.mayStore()))
        InstrWaitCntInfo[Index].VmCnt = true;
      else if (MCID.mayStore())
        InstrWaitCntInfo[Index].VsCnt = true;

      // (IV.Major < 7) is meant to represent
      // GCNTarget.vmemWriteNeedsExpWaitcnt()
      // which is defined as
      // { return getGeneration() < SEA_ISLANDS; }
      if (IV.Major < 7 &&
          (MCID.mayStore() || (MCID.TSFlags & SIInstrFlags::IsAtomicRet)))
        InstrWaitCntInfo[Index].ExpCnt = true;
    } else if (MCID.TSFlags & SIInstrFlags::SMRD) {
      InstrWaitCntInfo[Index].LgkmCnt = true;
    } else if (MCID.TSFlags & SIInstrFlags::EXP) {
      InstrWaitCntInfo[Index].ExpCnt = true;
    } else {
      switch (Opcode) {
      case AMDGPU::S_SENDMSG:
      case AMDGPU::S_SENDMSGHALT:
      case AMDGPU::S_MEMTIME:
      case AMDGPU::S_MEMREALTIME:
        InstrWaitCntInfo[Index].LgkmCnt = true;
        break;
      }
    }
  }
}

// taken from SIInstrInfo::isVMEM()
bool AMDGPUCustomBehaviour::isVMEM(const MCInstrDesc &MCID) {
  return MCID.TSFlags & SIInstrFlags::MUBUF ||
         MCID.TSFlags & SIInstrFlags::MTBUF ||
         MCID.TSFlags & SIInstrFlags::MIMG;
}

// taken from SIInstrInfo::hasModifiersSet()
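// Returns true when the named operand exists on the instruction and carries a
// non-zero immediate; used above to detect DS instructions with the gds
// modifier set.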
bool AMDGPUCustomBehaviour::hasModifiersSet(
    const std::unique_ptr<Instruction> &Inst, unsigned OpName) const {
  int Idx = AMDGPU::getNamedOperandIdx(Inst->getOpcode(), OpName);
  if (Idx == -1)
    return false;

  const MCAOperand *Op = Inst->getOperand(Idx);
  if (Op == nullptr || !Op->isImm() || !Op->getImm())
    return false;

  return true;
}

// taken from SIInstrInfo::isAlwaysGDS()
bool AMDGPUCustomBehaviour::isAlwaysGDS(uint16_t Opcode) const {
  return Opcode == AMDGPU::DS_ORDERED_COUNT || Opcode == AMDGPU::DS_GWS_INIT ||
         Opcode == AMDGPU::DS_GWS_SEMA_V || Opcode == AMDGPU::DS_GWS_SEMA_BR ||
         Opcode == AMDGPU::DS_GWS_SEMA_P ||
         Opcode == AMDGPU::DS_GWS_SEMA_RELEASE_ALL ||
         Opcode == AMDGPU::DS_GWS_BARRIER;
}

} // namespace mca
} // namespace llvm

using namespace llvm;
using namespace mca;

static CustomBehaviour *
createAMDGPUCustomBehaviour(const MCSubtargetInfo &STI,
                            const mca::SourceMgr &SrcMgr,
                            const MCInstrInfo &MCII) {
  return new AMDGPUCustomBehaviour(STI, SrcMgr, MCII);
}

static InstrPostProcess *
createAMDGPUInstrPostProcess(const MCSubtargetInfo &STI,
                             const MCInstrInfo &MCII) {
  return new AMDGPUInstrPostProcess(STI, MCII);
}

/// Extern function to register the AMDGPU target-specific MCA customizations
/// (CustomBehaviour and InstrPostProcess) with the target registry.
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTargetMCA() {
  TargetRegistry::RegisterCustomBehaviour(getTheAMDGPUTarget(),
                                          createAMDGPUCustomBehaviour);
  TargetRegistry::RegisterInstrPostProcess(getTheAMDGPUTarget(),
                                           createAMDGPUInstrPostProcess);

  TargetRegistry::RegisterCustomBehaviour(getTheGCNTarget(),
                                          createAMDGPUCustomBehaviour);
  TargetRegistry::RegisterInstrPostProcess(getTheGCNTarget(),
                                           createAMDGPUInstrPostProcess);
}