1 //===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements hazard recognizers for scheduling on GCN processors.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "GCNHazardRecognizer.h"
14 #include "GCNSubtarget.h"
15 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
16 #include "SIMachineFunctionInfo.h"
17 #include "llvm/CodeGen/MachineFunction.h"
18 #include "llvm/CodeGen/ScheduleDAG.h"
19 #include "llvm/TargetParser/TargetParser.h"
20 
21 using namespace llvm;
22 
23 namespace {
24 
25 struct MFMAPaddingRatioParser : public cl::parser<unsigned> {
26   MFMAPaddingRatioParser(cl::Option &O) : cl::parser<unsigned>(O) {}
27 
28   bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, unsigned &Value) {
29     if (Arg.getAsInteger(0, Value))
30       return O.error("'" + Arg + "' value invalid for uint argument!");
31 
32     if (Value > 100)
33       return O.error("'" + Arg + "' value must be in the range [0, 100]!");
34 
35     return false;
36   }
37 };
38 
39 } // end anonymous namespace
40 
41 static cl::opt<unsigned, false, MFMAPaddingRatioParser>
42     MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden,
43                      cl::desc("Fill a percentage of the latency between "
44                               "neighboring MFMA with s_nops."));
45 
46 //===----------------------------------------------------------------------===//
47 // Hazard Recognizer Implementation
48 //===----------------------------------------------------------------------===//
49 
50 static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
51                                                  const GCNSubtarget &ST);
52 
53 GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) :
54   IsHazardRecognizerMode(false),
55   CurrCycleInstr(nullptr),
56   MF(MF),
57   ST(MF.getSubtarget<GCNSubtarget>()),
58   TII(*ST.getInstrInfo()),
59   TRI(TII.getRegisterInfo()),
60   ClauseUses(TRI.getNumRegUnits()),
61   ClauseDefs(TRI.getNumRegUnits()) {
62   MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5;
63   TSchedModel.init(&ST);
64   RunLdsBranchVmemWARHazardFixup = shouldRunLdsBranchVmemWARHazardFixup(MF, ST);
65 }
66 
67 void GCNHazardRecognizer::Reset() {
68   EmittedInstrs.clear();
69 }
70 
71 void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
72   EmitInstruction(SU->getInstr());
73 }
74 
75 void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
76   CurrCycleInstr = MI;
77 }
78 
79 static bool isDivFMas(unsigned Opcode) {
80   return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 || Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
81 }
82 
83 static bool isSGetReg(unsigned Opcode) {
84   return Opcode == AMDGPU::S_GETREG_B32;
85 }
86 
87 static bool isSSetReg(unsigned Opcode) {
88   switch (Opcode) {
89   case AMDGPU::S_SETREG_B32:
90   case AMDGPU::S_SETREG_B32_mode:
91   case AMDGPU::S_SETREG_IMM32_B32:
92   case AMDGPU::S_SETREG_IMM32_B32_mode:
93     return true;
94   }
95   return false;
96 }
97 
98 static bool isRWLane(unsigned Opcode) {
99   return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
100 }
101 
102 static bool isRFE(unsigned Opcode) {
103   return Opcode == AMDGPU::S_RFE_B64;
104 }
105 
106 static bool isSMovRel(unsigned Opcode) {
107   switch (Opcode) {
108   case AMDGPU::S_MOVRELS_B32:
109   case AMDGPU::S_MOVRELS_B64:
110   case AMDGPU::S_MOVRELD_B32:
111   case AMDGPU::S_MOVRELD_B64:
112     return true;
113   default:
114     return false;
115   }
116 }
117 
118 static bool isDGEMM(unsigned Opcode) {
119   return AMDGPU::getMAIIsDGEMM(Opcode);
120 }
121 
122 static bool isXDL(const GCNSubtarget &ST, const MachineInstr &MI) {
123   unsigned Opcode = MI.getOpcode();
124 
125   if (!SIInstrInfo::isMAI(MI) ||
126       isDGEMM(Opcode) ||
127       Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
128       Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
129     return false;
130 
131   if (!ST.hasGFX940Insts())
132     return true;
133 
134   return AMDGPU::getMAIIsGFX940XDL(Opcode);
135 }
136 
137 static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
138                                     const MachineInstr &MI) {
139   if (TII.isAlwaysGDS(MI.getOpcode()))
140     return true;
141 
142   switch (MI.getOpcode()) {
143   case AMDGPU::S_SENDMSG:
144   case AMDGPU::S_SENDMSGHALT:
145   case AMDGPU::S_TTRACEDATA:
146     return true;
147   // These DS opcodes don't support GDS.
148   case AMDGPU::DS_NOP:
149   case AMDGPU::DS_PERMUTE_B32:
150   case AMDGPU::DS_BPERMUTE_B32:
151     return false;
152   default:
153     if (TII.isDS(MI.getOpcode())) {
154       int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
155                                            AMDGPU::OpName::gds);
156       if (MI.getOperand(GDS).getImm())
157         return true;
158     }
159     return false;
160   }
161 }
162 
163 static bool isPermlane(const MachineInstr &MI) {
164   unsigned Opcode = MI.getOpcode();
165   return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
166          Opcode == AMDGPU::V_PERMLANEX16_B32_e64;
167 }
168 
169 static bool isLdsDma(const MachineInstr &MI) {
170   return SIInstrInfo::isVALU(MI) &&
171          (SIInstrInfo::isMUBUF(MI) || SIInstrInfo::isFLAT(MI));
172 }
173 
174 static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
175   const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
176                                                      AMDGPU::OpName::simm16);
177   return RegOp->getImm() & AMDGPU::Hwreg::ID_MASK_;
178 }
179 
180 ScheduleHazardRecognizer::HazardType
181 GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
182   MachineInstr *MI = SU->getInstr();
183   // If we are not in "HazardRecognizerMode" and therefore not being run from
184   // the scheduler, track possible stalls from hazards but don't insert noops.
185   auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard;
186 
187   if (MI->isBundle())
188    return NoHazard;
189 
190   if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
191     return HazardType;
192 
193   if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
194     return HazardType;
195 
196   if (checkFPAtomicToDenormModeHazard(MI) > 0)
197     return HazardType;
198 
199   if (ST.hasNoDataDepHazard())
200     return NoHazard;
201 
202   // FIXME: Should flat be considered vmem?
203   if ((SIInstrInfo::isVMEM(*MI) ||
204        SIInstrInfo::isFLAT(*MI))
205       && checkVMEMHazards(MI) > 0)
206     return HazardType;
207 
208   if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
209     return HazardType;
210 
211   if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
212     return HazardType;
213 
214   if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
215     return HazardType;
216 
217   if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
218     return HazardType;
219 
220   if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
221        SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) ||
222        SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0)
223     return HazardType;
224 
225   if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
226     return HazardType;
227 
228   if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
229     return HazardType;
230 
231   if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
232     return HazardType;
233 
234   if (((ST.hasReadM0MovRelInterpHazard() &&
235         (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
236          MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
237          MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
238        (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
239        (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
240        (ST.hasReadM0LdsDirectHazard() &&
241         MI->readsRegister(AMDGPU::LDS_DIRECT))) &&
242       checkReadM0Hazards(MI) > 0)
243     return HazardType;
244 
245   if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
246     return HazardType;
247 
248   if ((SIInstrInfo::isVMEM(*MI) ||
249        SIInstrInfo::isFLAT(*MI) ||
250        SIInstrInfo::isDS(*MI)) && checkMAILdStHazards(MI) > 0)
251     return HazardType;
252 
253   if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
254     return HazardType;
255 
256   return NoHazard;
257 }
258 
259 static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII,
260                                 unsigned Quantity) {
261   while (Quantity > 0) {
262     unsigned Arg = std::min(Quantity, 8u);
263     Quantity -= Arg;
264     BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
265         .addImm(Arg - 1);
266   }
267 }
268 
269 unsigned
270 GCNHazardRecognizer::getMFMAPipelineWaitStates(const MachineInstr &MI) const {
271   const MCSchedClassDesc *SC = TSchedModel.resolveSchedClass(&MI);
272   assert(TSchedModel.getWriteProcResBegin(SC) !=
273          TSchedModel.getWriteProcResEnd(SC));
274   return TSchedModel.getWriteProcResBegin(SC)->Cycles;
275 }
276 
277 void GCNHazardRecognizer::processBundle() {
278   MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator());
279   MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
280   // Check bundled MachineInstr's for hazards.
281   for (; MI != E && MI->isInsideBundle(); ++MI) {
282     CurrCycleInstr = &*MI;
283     unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);
284 
285     if (IsHazardRecognizerMode) {
286       fixHazards(CurrCycleInstr);
287 
288       insertNoopsInBundle(CurrCycleInstr, TII, WaitStates);
289     }
290 
291     // It’s unnecessary to track more than MaxLookAhead instructions. Since we
292     // include the bundled MI directly after, only add a maximum of
293     // (MaxLookAhead - 1) noops to EmittedInstrs.
294     for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
295       EmittedInstrs.push_front(nullptr);
296 
297     EmittedInstrs.push_front(CurrCycleInstr);
298     EmittedInstrs.resize(MaxLookAhead);
299   }
300   CurrCycleInstr = nullptr;
301 }
302 
303 void GCNHazardRecognizer::runOnInstruction(MachineInstr *MI) {
304   assert(IsHazardRecognizerMode);
305 
306   unsigned NumPreNoops = PreEmitNoops(MI);
307   EmitNoops(NumPreNoops);
308   if (MI->isInsideBundle())
309     insertNoopsInBundle(MI, TII, NumPreNoops);
310   else
311     TII.insertNoops(*MI->getParent(), MachineBasicBlock::iterator(MI),
312                     NumPreNoops);
313   EmitInstruction(MI);
314   AdvanceCycle();
315 }
316 
317 unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
318   IsHazardRecognizerMode = true;
319   CurrCycleInstr = MI;
320   unsigned W = PreEmitNoopsCommon(MI);
321   fixHazards(MI);
322   CurrCycleInstr = nullptr;
323   return W;
324 }
325 
326 unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
327   if (MI->isBundle())
328     return 0;
329 
330   int WaitStates = 0;
331 
332   if (SIInstrInfo::isSMRD(*MI))
333     return std::max(WaitStates, checkSMRDHazards(MI));
334 
335   if (ST.hasNSAtoVMEMBug())
336     WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));
337 
338   WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));
339 
340   if (ST.hasNoDataDepHazard())
341     return WaitStates;
342 
343   if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI))
344     WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
345 
346   if (SIInstrInfo::isVALU(*MI))
347     WaitStates = std::max(WaitStates, checkVALUHazards(MI));
348 
349   if (SIInstrInfo::isDPP(*MI))
350     WaitStates = std::max(WaitStates, checkDPPHazards(MI));
351 
352   if (isDivFMas(MI->getOpcode()))
353     WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));
354 
355   if (isRWLane(MI->getOpcode()))
356     WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));
357 
358   if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
359        SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) ||
360        SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0)
361     WaitStates = std::max(WaitStates, checkMAIVALUHazards(MI));
362 
363   if (MI->isInlineAsm())
364     return std::max(WaitStates, checkInlineAsmHazards(MI));
365 
366   if (isSGetReg(MI->getOpcode()))
367     return std::max(WaitStates, checkGetRegHazards(MI));
368 
369   if (isSSetReg(MI->getOpcode()))
370     return std::max(WaitStates, checkSetRegHazards(MI));
371 
372   if (isRFE(MI->getOpcode()))
373     return std::max(WaitStates, checkRFEHazards(MI));
374 
375   if ((ST.hasReadM0MovRelInterpHazard() &&
376        (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
377         MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
378         MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
379       (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
380       (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
381       (ST.hasReadM0LdsDirectHazard() && MI->readsRegister(AMDGPU::LDS_DIRECT)))
382     return std::max(WaitStates, checkReadM0Hazards(MI));
383 
384   if (SIInstrInfo::isMAI(*MI))
385     return std::max(WaitStates, checkMAIHazards(MI));
386 
387   if (SIInstrInfo::isVMEM(*MI) ||
388       SIInstrInfo::isFLAT(*MI) ||
389       SIInstrInfo::isDS(*MI))
390     return std::max(WaitStates, checkMAILdStHazards(MI));
391 
392   return WaitStates;
393 }
394 
395 void GCNHazardRecognizer::EmitNoop() {
396   EmittedInstrs.push_front(nullptr);
397 }
398 
399 void GCNHazardRecognizer::AdvanceCycle() {
400   // When the scheduler detects a stall, it will call AdvanceCycle() without
401   // emitting any instructions.
402   if (!CurrCycleInstr) {
403     EmittedInstrs.push_front(nullptr);
404     return;
405   }
406 
407   if (CurrCycleInstr->isBundle()) {
408     processBundle();
409     return;
410   }
411 
412   unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
413   if (!NumWaitStates) {
414     CurrCycleInstr = nullptr;
415     return;
416   }
417 
418   // Keep track of emitted instructions
419   EmittedInstrs.push_front(CurrCycleInstr);
420 
421   // Add a nullptr for each additional wait state after the first.  Make sure
422   // not to add more than getMaxLookAhead() items to the list, since we
423   // truncate the list to that size right after this loop.
424   for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
425        i < e; ++i) {
426     EmittedInstrs.push_front(nullptr);
427   }
428 
429   // getMaxLookahead() is the largest number of wait states we will ever need
430   // to insert, so there is no point in keeping track of more than that many
431   // wait states.
432   EmittedInstrs.resize(getMaxLookAhead());
433 
434   CurrCycleInstr = nullptr;
435 }
436 
437 void GCNHazardRecognizer::RecedeCycle() {
438   llvm_unreachable("hazard recognizer does not support bottom-up scheduling.");
439 }
440 
441 //===----------------------------------------------------------------------===//
442 // Helper Functions
443 //===----------------------------------------------------------------------===//
444 
445 typedef enum { HazardFound, HazardExpired, NoHazardFound } HazardFnResult;
446 
447 typedef function_ref<bool(const MachineInstr &, int WaitStates)> IsExpiredFn;
448 typedef function_ref<unsigned int(const MachineInstr &)> GetNumWaitStatesFn;
449 
450 // Search for a hazard in a block and its predecessors.
451 template <typename StateT>
452 static bool
453 hasHazard(StateT State,
454           function_ref<HazardFnResult(StateT &, const MachineInstr &)> IsHazard,
455           function_ref<void(StateT &, const MachineInstr &)> UpdateState,
456           const MachineBasicBlock *MBB,
457           MachineBasicBlock::const_reverse_instr_iterator I,
458           DenseSet<const MachineBasicBlock *> &Visited) {
459   for (auto E = MBB->instr_rend(); I != E; ++I) {
460     // No need to look at parent BUNDLE instructions.
461     if (I->isBundle())
462       continue;
463 
464     switch (IsHazard(State, *I)) {
465     case HazardFound:
466       return true;
467     case HazardExpired:
468       return false;
469     default:
470       // Continue search
471       break;
472     }
473 
474     if (I->isInlineAsm() || I->isMetaInstruction())
475       continue;
476 
477     UpdateState(State, *I);
478   }
479 
480   for (MachineBasicBlock *Pred : MBB->predecessors()) {
481     if (!Visited.insert(Pred).second)
482       continue;
483 
484     if (hasHazard(State, IsHazard, UpdateState, Pred, Pred->instr_rbegin(),
485                   Visited))
486       return true;
487   }
488 
489   return false;
490 }
491 
492 // Returns a minimum wait states since \p I walking all predecessors.
493 // Only scans until \p IsExpired does not return true.
494 // Can only be run in a hazard recognizer mode.
495 static int getWaitStatesSince(
496     GCNHazardRecognizer::IsHazardFn IsHazard, const MachineBasicBlock *MBB,
497     MachineBasicBlock::const_reverse_instr_iterator I, int WaitStates,
498     IsExpiredFn IsExpired, DenseSet<const MachineBasicBlock *> &Visited,
499     GetNumWaitStatesFn GetNumWaitStates = SIInstrInfo::getNumWaitStates) {
500   for (auto E = MBB->instr_rend(); I != E; ++I) {
501     // Don't add WaitStates for parent BUNDLE instructions.
502     if (I->isBundle())
503       continue;
504 
505     if (IsHazard(*I))
506       return WaitStates;
507 
508     if (I->isInlineAsm())
509       continue;
510 
511     WaitStates += GetNumWaitStates(*I);
512 
513     if (IsExpired(*I, WaitStates))
514       return std::numeric_limits<int>::max();
515   }
516 
517   int MinWaitStates = std::numeric_limits<int>::max();
518   for (MachineBasicBlock *Pred : MBB->predecessors()) {
519     if (!Visited.insert(Pred).second)
520       continue;
521 
522     int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(), WaitStates,
523                                IsExpired, Visited, GetNumWaitStates);
524 
525     MinWaitStates = std::min(MinWaitStates, W);
526   }
527 
528   return MinWaitStates;
529 }
530 
531 static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
532                               const MachineInstr *MI, IsExpiredFn IsExpired) {
533   DenseSet<const MachineBasicBlock *> Visited;
534   return getWaitStatesSince(IsHazard, MI->getParent(),
535                             std::next(MI->getReverseIterator()),
536                             0, IsExpired, Visited);
537 }
538 
539 int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
540   if (IsHazardRecognizerMode) {
541     auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) {
542       return WaitStates >= Limit;
543     };
544     return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);
545   }
546 
547   int WaitStates = 0;
548   for (MachineInstr *MI : EmittedInstrs) {
549     if (MI) {
550       if (IsHazard(*MI))
551         return WaitStates;
552 
553       if (MI->isInlineAsm())
554         continue;
555     }
556     ++WaitStates;
557 
558     if (WaitStates >= Limit)
559       break;
560   }
561   return std::numeric_limits<int>::max();
562 }
563 
564 int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
565                                                IsHazardFn IsHazardDef,
566                                                int Limit) {
567   const SIRegisterInfo *TRI = ST.getRegisterInfo();
568 
569   auto IsHazardFn = [IsHazardDef, TRI, Reg](const MachineInstr &MI) {
570     return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI);
571   };
572 
573   return getWaitStatesSince(IsHazardFn, Limit);
574 }
575 
576 int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
577                                                   int Limit) {
578   auto IsHazardFn = [IsHazard](const MachineInstr &MI) {
579     return isSSetReg(MI.getOpcode()) && IsHazard(MI);
580   };
581 
582   return getWaitStatesSince(IsHazardFn, Limit);
583 }
584 
585 //===----------------------------------------------------------------------===//
586 // No-op Hazard Detection
587 //===----------------------------------------------------------------------===//
588 
589 static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV,
590                         MCRegister Reg) {
591   for (MCRegUnit Unit : TRI.regunits(Reg))
592     BV.set(Unit);
593 }
594 
595 static void addRegsToSet(const SIRegisterInfo &TRI,
596                          iterator_range<MachineInstr::const_mop_iterator> Ops,
597                          BitVector &DefSet, BitVector &UseSet) {
598   for (const MachineOperand &Op : Ops) {
599     if (Op.isReg())
600       addRegUnits(TRI, Op.isDef() ? DefSet : UseSet, Op.getReg().asMCReg());
601   }
602 }
603 
604 void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
605   addRegsToSet(TRI, MI.operands(), ClauseDefs, ClauseUses);
606 }
607 
608 static bool breaksSMEMSoftClause(MachineInstr *MI) {
609   return !SIInstrInfo::isSMRD(*MI);
610 }
611 
612 static bool breaksVMEMSoftClause(MachineInstr *MI) {
613   return !SIInstrInfo::isVMEM(*MI) && !SIInstrInfo::isFLAT(*MI);
614 }
615 
616 int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
617   // SMEM soft clause are only present on VI+, and only matter if xnack is
618   // enabled.
619   if (!ST.isXNACKEnabled())
620     return 0;
621 
622   bool IsSMRD = TII.isSMRD(*MEM);
623 
624   resetClause();
625 
626   // A soft-clause is any group of consecutive SMEM instructions.  The
627   // instructions in this group may return out of order and/or may be
628   // replayed (i.e. the same instruction issued more than once).
629   //
630   // In order to handle these situations correctly we need to make sure that
631   // when a clause has more than one instruction, no instruction in the clause
632   // writes to a register that is read by another instruction in the clause
633   // (including itself). If we encounter this situation, we need to break the
634   // clause by inserting a non SMEM instruction.
635 
636   for (MachineInstr *MI : EmittedInstrs) {
637     // When we hit a non-SMEM instruction then we have passed the start of the
638     // clause and we can stop.
639     if (!MI)
640       break;
641 
642     if (IsSMRD ? breaksSMEMSoftClause(MI) : breaksVMEMSoftClause(MI))
643       break;
644 
645     addClauseInst(*MI);
646   }
647 
648   if (ClauseDefs.none())
649     return 0;
650 
651   // We need to make sure not to put loads and stores in the same clause if they
652   // use the same address. For now, just start a new clause whenever we see a
653   // store.
654   if (MEM->mayStore())
655     return 1;
656 
657   addClauseInst(*MEM);
658 
659   // If the set of defs and uses intersect then we cannot add this instruction
660   // to the clause, so we have a hazard.
661   return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
662 }
663 
664 int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
665   int WaitStatesNeeded = 0;
666 
667   WaitStatesNeeded = checkSoftClauseHazards(SMRD);
668 
669   // This SMRD hazard only affects SI.
670   if (!ST.hasSMRDReadVALUDefHazard())
671     return WaitStatesNeeded;
672 
673   // A read of an SGPR by SMRD instruction requires 4 wait states when the
674   // SGPR was written by a VALU instruction.
675   int SmrdSgprWaitStates = 4;
676   auto IsHazardDefFn = [this](const MachineInstr &MI) {
677     return TII.isVALU(MI);
678   };
679   auto IsBufferHazardDefFn = [this](const MachineInstr &MI) {
680     return TII.isSALU(MI);
681   };
682 
683   bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);
684 
685   for (const MachineOperand &Use : SMRD->uses()) {
686     if (!Use.isReg())
687       continue;
688     int WaitStatesNeededForUse =
689         SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
690                                                    SmrdSgprWaitStates);
691     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
692 
693     // This fixes what appears to be undocumented hardware behavior in SI where
694     // s_mov writing a descriptor and s_buffer_load_dword reading the descriptor
695     // needs some number of nops in between. We don't know how many we need, but
696     // let's use 4. This wasn't discovered before probably because the only
697     // case when this happens is when we expand a 64-bit pointer into a full
698     // descriptor and use s_buffer_load_dword instead of s_load_dword, which was
699     // probably never encountered in the closed-source land.
700     if (IsBufferSMRD) {
701       int WaitStatesNeededForUse =
702         SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
703                                                    IsBufferHazardDefFn,
704                                                    SmrdSgprWaitStates);
705       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
706     }
707   }
708 
709   return WaitStatesNeeded;
710 }
711 
712 int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
713   if (!ST.hasVMEMReadSGPRVALUDefHazard())
714     return 0;
715 
716   int WaitStatesNeeded = checkSoftClauseHazards(VMEM);
717 
718   // A read of an SGPR by a VMEM instruction requires 5 wait states when the
719   // SGPR was written by a VALU Instruction.
720   const int VmemSgprWaitStates = 5;
721   auto IsHazardDefFn = [this](const MachineInstr &MI) {
722     return TII.isVALU(MI);
723   };
724   for (const MachineOperand &Use : VMEM->uses()) {
725     if (!Use.isReg() || TRI.isVectorRegister(MF.getRegInfo(), Use.getReg()))
726       continue;
727 
728     int WaitStatesNeededForUse =
729         VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
730                                                    VmemSgprWaitStates);
731     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
732   }
733   return WaitStatesNeeded;
734 }
735 
736 int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
737   const SIRegisterInfo *TRI = ST.getRegisterInfo();
738   const SIInstrInfo *TII = ST.getInstrInfo();
739 
740   // Check for DPP VGPR read after VALU VGPR write and EXEC write.
741   int DppVgprWaitStates = 2;
742   int DppExecWaitStates = 5;
743   int WaitStatesNeeded = 0;
744   auto IsHazardDefFn = [TII](const MachineInstr &MI) {
745     return TII->isVALU(MI);
746   };
747 
748   for (const MachineOperand &Use : DPP->uses()) {
749     if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
750       continue;
751     int WaitStatesNeededForUse =
752         DppVgprWaitStates - getWaitStatesSinceDef(
753                                 Use.getReg(),
754                                 [](const MachineInstr &) { return true; },
755                                 DppVgprWaitStates);
756     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
757   }
758 
759   WaitStatesNeeded = std::max(
760       WaitStatesNeeded,
761       DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
762                                                 DppExecWaitStates));
763 
764   return WaitStatesNeeded;
765 }
766 
767 int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
768   const SIInstrInfo *TII = ST.getInstrInfo();
769 
770   // v_div_fmas requires 4 wait states after a write to vcc from a VALU
771   // instruction.
772   const int DivFMasWaitStates = 4;
773   auto IsHazardDefFn = [TII](const MachineInstr &MI) {
774     return TII->isVALU(MI);
775   };
776   int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
777                                                DivFMasWaitStates);
778 
779   return DivFMasWaitStates - WaitStatesNeeded;
780 }
781 
782 int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
783   const SIInstrInfo *TII = ST.getInstrInfo();
784   unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);
785 
786   const int GetRegWaitStates = 2;
787   auto IsHazardFn = [TII, GetRegHWReg](const MachineInstr &MI) {
788     return GetRegHWReg == getHWReg(TII, MI);
789   };
790   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);
791 
792   return GetRegWaitStates - WaitStatesNeeded;
793 }
794 
795 int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
796   const SIInstrInfo *TII = ST.getInstrInfo();
797   unsigned HWReg = getHWReg(TII, *SetRegInstr);
798 
799   const int SetRegWaitStates = ST.getSetRegWaitStates();
800   auto IsHazardFn = [TII, HWReg](const MachineInstr &MI) {
801     return HWReg == getHWReg(TII, MI);
802   };
803   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
804   return SetRegWaitStates - WaitStatesNeeded;
805 }
806 
807 int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
808   if (!MI.mayStore())
809     return -1;
810 
811   const SIInstrInfo *TII = ST.getInstrInfo();
812   unsigned Opcode = MI.getOpcode();
813   const MCInstrDesc &Desc = MI.getDesc();
814 
815   int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
816   int VDataRCID = -1;
817   if (VDataIdx != -1)
818     VDataRCID = Desc.operands()[VDataIdx].RegClass;
819 
820   if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
821     // There is no hazard if the instruction does not use vector regs
822     // (like wbinvl1)
823     if (VDataIdx == -1)
824       return -1;
825     // For MUBUF/MTBUF instructions this hazard only exists if the
826     // instruction is not using a register in the soffset field.
827     const MachineOperand *SOffset =
828         TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
829     // If we have no soffset operand, then assume this field has been
830     // hardcoded to zero.
831     if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
832         (!SOffset || !SOffset->isReg()))
833       return VDataIdx;
834   }
835 
836   // MIMG instructions create a hazard if they don't use a 256-bit T# and
837   // the store size is greater than 8 bytes and they have more than two bits
838   // of their dmask set.
839   // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
840   if (TII->isMIMG(MI)) {
841     int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
842     assert(SRsrcIdx != -1 &&
843            AMDGPU::getRegBitWidth(Desc.operands()[SRsrcIdx].RegClass) == 256);
844     (void)SRsrcIdx;
845   }
846 
847   if (TII->isFLAT(MI)) {
848     int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
849     if (AMDGPU::getRegBitWidth(Desc.operands()[DataIdx].RegClass) > 64)
850       return DataIdx;
851   }
852 
853   return -1;
854 }
855 
856 int
857 GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
858                                             const MachineRegisterInfo &MRI) {
859   // Helper to check for the hazard where VMEM instructions that store more than
860   // 8 bytes can have there store data over written by the next instruction.
861   const SIRegisterInfo *TRI = ST.getRegisterInfo();
862 
863   const int VALUWaitStates = ST.hasGFX940Insts() ? 2 : 1;
864   int WaitStatesNeeded = 0;
865 
866   if (!TRI->isVectorRegister(MRI, Def.getReg()))
867     return WaitStatesNeeded;
868   Register Reg = Def.getReg();
869   auto IsHazardFn = [this, Reg, TRI](const MachineInstr &MI) {
870     int DataIdx = createsVALUHazard(MI);
871     return DataIdx >= 0 &&
872            TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg);
873   };
874   int WaitStatesNeededForDef =
875     VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
876   WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
877 
878   return WaitStatesNeeded;
879 }
880 
881 int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
882   int WaitStatesNeeded = 0;
883 
884   if (ST.hasTransForwardingHazard() && !SIInstrInfo::isTRANS(*VALU)) {
885     const int TransDefWaitstates = 1;
886 
887     auto IsTransDefFn = [this, VALU](const MachineInstr &MI) {
888       if (!SIInstrInfo::isTRANS(MI))
889         return false;
890       const SIRegisterInfo *TRI = ST.getRegisterInfo();
891       const SIInstrInfo *TII = ST.getInstrInfo();
892       Register Def = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)->getReg();
893 
894       for (const MachineOperand &Use : VALU->explicit_uses()) {
895         if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
896           return true;
897       }
898 
899       return false;
900     };
901 
902     int WaitStatesNeededForDef =
903         TransDefWaitstates -
904         getWaitStatesSince(IsTransDefFn, TransDefWaitstates);
905     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
906   }
907 
908   if (ST.hasDstSelForwardingHazard()) {
909     const int Shift16DefWaitstates = 1;
910 
911     auto IsShift16BitDefFn = [this, VALU](const MachineInstr &MI) {
912       if (!SIInstrInfo::isVALU(MI))
913         return false;
914       const SIInstrInfo *TII = ST.getInstrInfo();
915       if (SIInstrInfo::isSDWA(MI)) {
916         if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
917           if (DstSel->getImm() == AMDGPU::SDWA::DWORD)
918             return false;
919       } else {
920         if (!AMDGPU::hasNamedOperand(MI.getOpcode(), AMDGPU::OpName::op_sel) ||
921             !(TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)
922                   ->getImm() &
923               SISrcMods::DST_OP_SEL))
924           return false;
925       }
926       const SIRegisterInfo *TRI = ST.getRegisterInfo();
927       if (auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
928         Register Def = Dst->getReg();
929 
930         for (const MachineOperand &Use : VALU->explicit_uses()) {
931           if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
932             return true;
933         }
934       }
935 
936       return false;
937     };
938 
939     int WaitStatesNeededForDef =
940         Shift16DefWaitstates -
941         getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
942     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
943   }
944 
945   if (ST.hasVDecCoExecHazard()) {
946     const int VALUWriteSGPRVALUReadWaitstates = 2;
947     const int VALUWriteEXECRWLane = 4;
948     const int VALUWriteVGPRReadlaneRead = 1;
949 
950     const SIRegisterInfo *TRI = ST.getRegisterInfo();
951     const MachineRegisterInfo &MRI = MF.getRegInfo();
952     Register UseReg;
953     auto IsVALUDefSGPRFn = [&UseReg, TRI](const MachineInstr &MI) {
954       if (!SIInstrInfo::isVALU(MI))
955         return false;
956       return MI.modifiesRegister(UseReg, TRI);
957     };
958 
959     for (const MachineOperand &Use : VALU->explicit_uses()) {
960       if (!Use.isReg())
961         continue;
962 
963       UseReg = Use.getReg();
964       if (TRI->isSGPRReg(MRI, UseReg)) {
965         int WaitStatesNeededForDef =
966             VALUWriteSGPRVALUReadWaitstates -
967             getWaitStatesSince(IsVALUDefSGPRFn,
968                                VALUWriteSGPRVALUReadWaitstates);
969         WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
970       }
971     }
972 
973     if (VALU->readsRegister(AMDGPU::VCC, TRI)) {
974       UseReg = AMDGPU::VCC;
975       int WaitStatesNeededForDef =
976           VALUWriteSGPRVALUReadWaitstates -
977           getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteSGPRVALUReadWaitstates);
978       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
979     }
980 
981     switch (VALU->getOpcode()) {
982     case AMDGPU::V_READLANE_B32:
983     case AMDGPU::V_READFIRSTLANE_B32: {
984       MachineOperand *Src = TII.getNamedOperand(*VALU, AMDGPU::OpName::src0);
985       UseReg = Src->getReg();
986       int WaitStatesNeededForDef =
987           VALUWriteVGPRReadlaneRead -
988           getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteVGPRReadlaneRead);
989       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
990     }
991       [[fallthrough]];
992     case AMDGPU::V_WRITELANE_B32: {
993       UseReg = AMDGPU::EXEC;
994       int WaitStatesNeededForDef =
995           VALUWriteEXECRWLane -
996           getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteEXECRWLane);
997       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
998       break;
999     }
1000     default:
1001       break;
1002     }
1003   }
1004 
1005   // This checks for the hazard where VMEM instructions that store more than
1006   // 8 bytes can have there store data over written by the next instruction.
1007   if (!ST.has12DWordStoreHazard())
1008     return WaitStatesNeeded;
1009 
1010   const MachineRegisterInfo &MRI = MF.getRegInfo();
1011 
1012   for (const MachineOperand &Def : VALU->defs()) {
1013     WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
1014   }
1015 
1016   return WaitStatesNeeded;
1017 }
1018 
1019 int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
1020   // This checks for hazards associated with inline asm statements.
1021   // Since inline asms can contain just about anything, we use this
1022   // to call/leverage other check*Hazard routines. Note that
1023   // this function doesn't attempt to address all possible inline asm
1024   // hazards (good luck), but is a collection of what has been
1025   // problematic thus far.
1026 
1027   // see checkVALUHazards()
1028   if (!ST.has12DWordStoreHazard())
1029     return 0;
1030 
1031   const MachineRegisterInfo &MRI = MF.getRegInfo();
1032   int WaitStatesNeeded = 0;
1033 
1034   for (const MachineOperand &Op :
1035        llvm::drop_begin(IA->operands(), InlineAsm::MIOp_FirstOperand)) {
1036     if (Op.isReg() && Op.isDef()) {
1037       WaitStatesNeeded =
1038           std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
1039     }
1040   }
1041 
1042   return WaitStatesNeeded;
1043 }
1044 
1045 int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
1046   const SIInstrInfo *TII = ST.getInstrInfo();
1047   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1048   const MachineRegisterInfo &MRI = MF.getRegInfo();
1049 
1050   const MachineOperand *LaneSelectOp =
1051       TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);
1052 
1053   if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
1054     return 0;
1055 
1056   Register LaneSelectReg = LaneSelectOp->getReg();
1057   auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVALU(MI); };
1058 
1059   const int RWLaneWaitStates = 4;
1060   int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
1061                                               RWLaneWaitStates);
1062   return RWLaneWaitStates - WaitStatesSince;
1063 }
1064 
1065 int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
1066   if (!ST.hasRFEHazards())
1067     return 0;
1068 
1069   const SIInstrInfo *TII = ST.getInstrInfo();
1070 
1071   const int RFEWaitStates = 1;
1072 
1073   auto IsHazardFn = [TII](const MachineInstr &MI) {
1074     return getHWReg(TII, MI) == AMDGPU::Hwreg::ID_TRAPSTS;
1075   };
1076   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
1077   return RFEWaitStates - WaitStatesNeeded;
1078 }
1079 
1080 int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
1081   const SIInstrInfo *TII = ST.getInstrInfo();
1082   const int ReadM0WaitStates = 1;
1083   auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); };
1084   return ReadM0WaitStates -
1085          getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, ReadM0WaitStates);
1086 }
1087 
1088 void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
1089   fixVMEMtoScalarWriteHazards(MI);
1090   fixVcmpxPermlaneHazards(MI);
1091   fixSMEMtoVectorWriteHazards(MI);
1092   fixVcmpxExecWARHazard(MI);
1093   fixLdsBranchVmemWARHazard(MI);
1094   if (ST.hasLdsDirect()) {
1095     fixLdsDirectVALUHazard(MI);
1096     fixLdsDirectVMEMHazard(MI);
1097   }
1098   fixVALUPartialForwardingHazard(MI);
1099   fixVALUTransUseHazard(MI);
1100   fixWMMAHazards(MI);
1101   fixShift64HighRegBug(MI);
1102   fixVALUMaskWriteHazard(MI);
1103 }
1104 
1105 bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
1106   if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
1107     return false;
1108 
1109   const SIInstrInfo *TII = ST.getInstrInfo();
1110   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1111   auto IsHazardFn = [TII, TRI](const MachineInstr &MI) {
1112     return (TII->isVOPC(MI) ||
1113             ((TII->isVOP3(MI) || TII->isSDWA(MI)) && MI.isCompare())) &&
1114            MI.modifiesRegister(AMDGPU::EXEC, TRI);
1115   };
1116 
1117   auto IsExpiredFn = [](const MachineInstr &MI, int) {
1118     unsigned Opc = MI.getOpcode();
1119     return SIInstrInfo::isVALU(MI) && Opc != AMDGPU::V_NOP_e32 &&
1120            Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa;
1121   };
1122 
1123   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1124       std::numeric_limits<int>::max())
1125     return false;
1126 
1127   // V_NOP will be discarded by SQ.
1128   // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
1129   // which is always a VGPR and available.
1130   auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
1131   Register Reg = Src0->getReg();
1132   bool IsUndef = Src0->isUndef();
1133   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1134           TII->get(AMDGPU::V_MOV_B32_e32))
1135     .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0))
1136     .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill);
1137 
1138   return true;
1139 }
1140 
1141 bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
1142   if (!ST.hasVMEMtoScalarWriteHazard())
1143     return false;
1144 
1145   if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI))
1146     return false;
1147 
1148   if (MI->getNumDefs() == 0)
1149     return false;
1150 
1151   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1152 
1153   auto IsHazardFn = [TRI, MI](const MachineInstr &I) {
1154     if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isDS(I) &&
1155         !SIInstrInfo::isFLAT(I))
1156       return false;
1157 
1158     for (const MachineOperand &Def : MI->defs()) {
1159       const MachineOperand *Op =
1160           I.findRegisterUseOperand(Def.getReg(), false, TRI);
1161       if (!Op)
1162         continue;
1163       return true;
1164     }
1165     return false;
1166   };
1167 
1168   auto IsExpiredFn = [](const MachineInstr &MI, int) {
1169     return SIInstrInfo::isVALU(MI) ||
1170            (MI.getOpcode() == AMDGPU::S_WAITCNT &&
1171             !MI.getOperand(0).getImm()) ||
1172            (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1173             AMDGPU::DepCtr::decodeFieldVmVsrc(MI.getOperand(0).getImm()) == 0);
1174   };
1175 
1176   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1177       std::numeric_limits<int>::max())
1178     return false;
1179 
1180   const SIInstrInfo *TII = ST.getInstrInfo();
1181   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1182           TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1183       .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
1184   return true;
1185 }
1186 
1187 bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
1188   if (!ST.hasSMEMtoVectorWriteHazard())
1189     return false;
1190 
1191   if (!SIInstrInfo::isVALU(*MI))
1192     return false;
1193 
1194   unsigned SDSTName;
1195   switch (MI->getOpcode()) {
1196   case AMDGPU::V_READLANE_B32:
1197   case AMDGPU::V_READFIRSTLANE_B32:
1198     SDSTName = AMDGPU::OpName::vdst;
1199     break;
1200   default:
1201     SDSTName = AMDGPU::OpName::sdst;
1202     break;
1203   }
1204 
1205   const SIInstrInfo *TII = ST.getInstrInfo();
1206   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1207   const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
1208   const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
1209   if (!SDST) {
1210     for (const auto &MO : MI->implicit_operands()) {
1211       if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg()))) {
1212         SDST = &MO;
1213         break;
1214       }
1215     }
1216   }
1217 
1218   if (!SDST)
1219     return false;
1220 
1221   const Register SDSTReg = SDST->getReg();
1222   auto IsHazardFn = [SDSTReg, TRI](const MachineInstr &I) {
1223     return SIInstrInfo::isSMRD(I) && I.readsRegister(SDSTReg, TRI);
1224   };
1225 
1226   auto IsExpiredFn = [TII, IV](const MachineInstr &MI, int) {
1227     if (TII->isSALU(MI)) {
1228       switch (MI.getOpcode()) {
1229       case AMDGPU::S_SETVSKIP:
1230       case AMDGPU::S_VERSION:
1231       case AMDGPU::S_WAITCNT_VSCNT:
1232       case AMDGPU::S_WAITCNT_VMCNT:
1233       case AMDGPU::S_WAITCNT_EXPCNT:
1234         // These instructions cannot not mitigate the hazard.
1235         return false;
1236       case AMDGPU::S_WAITCNT_LGKMCNT:
1237         // Reducing lgkmcnt count to 0 always mitigates the hazard.
1238         return (MI.getOperand(1).getImm() == 0) &&
1239                (MI.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1240       case AMDGPU::S_WAITCNT: {
1241         const int64_t Imm = MI.getOperand(0).getImm();
1242         AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
1243         return (Decoded.LgkmCnt == 0);
1244       }
1245       default:
1246         // SOPP instructions cannot mitigate the hazard.
1247         if (TII->isSOPP(MI))
1248           return false;
1249         // At this point the SALU can be assumed to mitigate the hazard
1250         // because either:
1251         // (a) it is independent of the at risk SMEM (breaking chain),
1252         // or
1253         // (b) it is dependent on the SMEM, in which case an appropriate
1254         //     s_waitcnt lgkmcnt _must_ exist between it and the at risk
1255         //     SMEM instruction.
1256         return true;
1257       }
1258     }
1259     return false;
1260   };
1261 
1262   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1263       std::numeric_limits<int>::max())
1264     return false;
1265 
1266   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1267           TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
1268       .addImm(0);
1269   return true;
1270 }
1271 
1272 bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
1273   if (!ST.hasVcmpxExecWARHazard() || !SIInstrInfo::isVALU(*MI))
1274     return false;
1275 
1276   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1277   if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
1278     return false;
1279 
1280   auto IsHazardFn = [TRI](const MachineInstr &I) {
1281     if (SIInstrInfo::isVALU(I))
1282       return false;
1283     return I.readsRegister(AMDGPU::EXEC, TRI);
1284   };
1285 
1286   const SIInstrInfo *TII = ST.getInstrInfo();
1287   auto IsExpiredFn = [TII, TRI](const MachineInstr &MI, int) {
1288     if (SIInstrInfo::isVALU(MI)) {
1289       if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst))
1290         return true;
1291       for (auto MO : MI.implicit_operands())
1292         if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg())))
1293           return true;
1294     }
1295     if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1296         AMDGPU::DepCtr::decodeFieldSaSdst(MI.getOperand(0).getImm()) == 0)
1297       return true;
1298     return false;
1299   };
1300 
1301   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1302       std::numeric_limits<int>::max())
1303     return false;
1304 
1305   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1306           TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1307       .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
1308   return true;
1309 }
1310 
1311 static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
1312                                                  const GCNSubtarget &ST) {
1313   if (!ST.hasLdsBranchVmemWARHazard())
1314     return false;
1315 
1316   // Check if the necessary condition for the hazard is met: both LDS and VMEM
1317   // instructions need to appear in the same function.
1318   bool HasLds = false;
1319   bool HasVmem = false;
1320   for (auto &MBB : MF) {
1321     for (auto &MI : MBB) {
1322       HasLds |= SIInstrInfo::isDS(MI);
1323       HasVmem |=
1324           SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI);
1325       if (HasLds && HasVmem)
1326         return true;
1327     }
1328   }
1329   return false;
1330 }
1331 
1332 static bool isStoreCountWaitZero(const MachineInstr &I) {
1333   return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1334          I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
1335          !I.getOperand(1).getImm();
1336 }
1337 
1338 bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
1339   if (!RunLdsBranchVmemWARHazardFixup)
1340     return false;
1341 
1342   assert(ST.hasLdsBranchVmemWARHazard());
1343 
1344   auto IsHazardInst = [](const MachineInstr &MI) {
1345     if (SIInstrInfo::isDS(MI))
1346       return 1;
1347     if (SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI))
1348       return 2;
1349     return 0;
1350   };
1351 
1352   auto InstType = IsHazardInst(*MI);
1353   if (!InstType)
1354     return false;
1355 
1356   auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) {
1357     return IsHazardInst(I) || isStoreCountWaitZero(I);
1358   };
1359 
1360   auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) {
1361     if (!I.isBranch())
1362       return false;
1363 
1364     auto IsHazardFn = [InstType, IsHazardInst](const MachineInstr &I) {
1365       auto InstType2 = IsHazardInst(I);
1366       return InstType2 && InstType != InstType2;
1367     };
1368 
1369     auto IsExpiredFn = [InstType, &IsHazardInst](const MachineInstr &I, int) {
1370       auto InstType2 = IsHazardInst(I);
1371       if (InstType == InstType2)
1372         return true;
1373 
1374       return isStoreCountWaitZero(I);
1375     };
1376 
1377     return ::getWaitStatesSince(IsHazardFn, &I, IsExpiredFn) !=
1378            std::numeric_limits<int>::max();
1379   };
1380 
1381   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1382       std::numeric_limits<int>::max())
1383     return false;
1384 
1385   const SIInstrInfo *TII = ST.getInstrInfo();
1386   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1387           TII->get(AMDGPU::S_WAITCNT_VSCNT))
1388     .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1389     .addImm(0);
1390 
1391   return true;
1392 }
1393 
1394 bool GCNHazardRecognizer::fixLdsDirectVALUHazard(MachineInstr *MI) {
1395   if (!SIInstrInfo::isLDSDIR(*MI))
1396     return false;
1397 
1398   const int NoHazardWaitStates = 15;
1399   const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
1400   const Register VDSTReg = VDST->getReg();
1401 
1402   bool VisitedTrans = false;
1403   auto IsHazardFn = [this, VDSTReg, &VisitedTrans](const MachineInstr &I) {
1404     if (!SIInstrInfo::isVALU(I))
1405       return false;
1406     VisitedTrans = VisitedTrans || SIInstrInfo::isTRANS(I);
1407     // Cover both WAR and WAW
1408     return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
1409   };
1410   auto IsExpiredFn = [&](const MachineInstr &I, int WaitStates) {
1411     if (WaitStates >= NoHazardWaitStates)
1412       return true;
1413     // Instructions which cause va_vdst==0 expire hazard
1414     return SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
1415            SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I);
1416   };
1417   auto GetWaitStatesFn = [](const MachineInstr &MI) {
1418     return SIInstrInfo::isVALU(MI) ? 1 : 0;
1419   };
1420 
1421   DenseSet<const MachineBasicBlock *> Visited;
1422   auto Count = ::getWaitStatesSince(IsHazardFn, MI->getParent(),
1423                                     std::next(MI->getReverseIterator()), 0,
1424                                     IsExpiredFn, Visited, GetWaitStatesFn);
1425 
1426   // Transcendentals can execute in parallel to other VALUs.
1427   // This makes va_vdst count unusable with a mixture of VALU and TRANS.
1428   if (VisitedTrans)
1429     Count = 0;
1430 
1431   MachineOperand *WaitVdstOp =
1432       TII.getNamedOperand(*MI, AMDGPU::OpName::waitvdst);
1433   WaitVdstOp->setImm(std::min(Count, NoHazardWaitStates));
1434 
1435   return true;
1436 }
1437 
1438 bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) {
1439   if (!SIInstrInfo::isLDSDIR(*MI))
1440     return false;
1441 
1442   const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
1443   const Register VDSTReg = VDST->getReg();
1444 
1445   auto IsHazardFn = [this, VDSTReg](const MachineInstr &I) {
1446     if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I) &&
1447         !SIInstrInfo::isDS(I))
1448       return false;
1449     return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
1450   };
1451   auto IsExpiredFn = [](const MachineInstr &I, int) {
1452     return SIInstrInfo::isVALU(I) || SIInstrInfo::isEXP(I) ||
1453            (I.getOpcode() == AMDGPU::S_WAITCNT && !I.getOperand(0).getImm()) ||
1454            (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1455             AMDGPU::DepCtr::decodeFieldVmVsrc(I.getOperand(0).getImm()) == 0);
1456   };
1457 
1458   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1459       std::numeric_limits<int>::max())
1460     return false;
1461 
1462   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1463           TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1464       .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
1465 
1466   return true;
1467 }
1468 
1469 bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) {
1470   if (!ST.isWave64())
1471     return false;
1472   if (!ST.hasVALUPartialForwardingHazard())
1473     return false;
1474   if (!SIInstrInfo::isVALU(*MI))
1475     return false;
1476 
1477   SmallSetVector<Register, 4> SrcVGPRs;
1478 
1479   for (const MachineOperand &Use : MI->explicit_uses()) {
1480     if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1481       SrcVGPRs.insert(Use.getReg());
1482   }
1483 
1484   // Only applies with >= 2 unique VGPR sources
1485   if (SrcVGPRs.size() <= 1)
1486     return false;
1487 
1488   // Look for the following pattern:
1489   //   Va <- VALU [PreExecPos]
1490   //   intv1
1491   //   Exec <- SALU [ExecPos]
1492   //   intv2
1493   //   Vb <- VALU [PostExecPos]
1494   //   intv3
1495   //   MI Va, Vb (WaitState = 0)
1496   //
1497   // Where:
1498   // intv1 + intv2 <= 2 VALUs
1499   // intv3 <= 4 VALUs
1500   //
1501   // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
1502 
1503   const int Intv1plus2MaxVALUs = 2;
1504   const int Intv3MaxVALUs = 4;
1505   const int IntvMaxVALUs = 6;
1506   const int NoHazardVALUWaitStates = IntvMaxVALUs + 2;
1507 
1508   struct StateType {
1509     SmallDenseMap<Register, int, 4> DefPos;
1510     int ExecPos = std::numeric_limits<int>::max();
1511     int VALUs = 0;
1512   };
1513 
1514   StateType State;
1515 
1516   // This overloads expiry testing with all the hazard detection
1517   auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
1518     // Too many VALU states have passed
1519     if (State.VALUs > NoHazardVALUWaitStates)
1520       return HazardExpired;
1521 
1522     // Instructions which cause va_vdst==0 expire hazard
1523     if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
1524         SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) ||
1525         (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1526          AMDGPU::DepCtr::decodeFieldVaVdst(I.getOperand(0).getImm()) == 0))
1527       return HazardExpired;
1528 
1529     // Track registers writes
1530     bool Changed = false;
1531     if (SIInstrInfo::isVALU(I)) {
1532       for (Register Src : SrcVGPRs) {
1533         if (!State.DefPos.count(Src) && I.modifiesRegister(Src, &TRI)) {
1534           State.DefPos[Src] = State.VALUs;
1535           Changed = true;
1536         }
1537       }
1538     } else if (SIInstrInfo::isSALU(I)) {
1539       if (State.ExecPos == std::numeric_limits<int>::max()) {
1540         if (!State.DefPos.empty() && I.modifiesRegister(AMDGPU::EXEC, &TRI)) {
1541           State.ExecPos = State.VALUs;
1542           Changed = true;
1543         }
1544       }
1545     }
1546 
1547     // Early expiration: too many VALUs in intv3
1548     if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty())
1549       return HazardExpired;
1550 
1551     // Only evaluate state if something changed
1552     if (!Changed)
1553       return NoHazardFound;
1554 
1555     // Determine positions of VALUs pre/post exec change
1556     if (State.ExecPos == std::numeric_limits<int>::max())
1557       return NoHazardFound;
1558 
1559     int PreExecPos = std::numeric_limits<int>::max();
1560     int PostExecPos = std::numeric_limits<int>::max();
1561 
1562     for (auto Entry : State.DefPos) {
1563       int DefVALUs = Entry.second;
1564       if (DefVALUs != std::numeric_limits<int>::max()) {
1565         if (DefVALUs >= State.ExecPos)
1566           PreExecPos = std::min(PreExecPos, DefVALUs);
1567         else if (DefVALUs < State.ExecPos)
1568           PostExecPos = std::min(PostExecPos, DefVALUs);
1569       }
1570     }
1571 
1572     // Need a VALUs post exec change
1573     if (PostExecPos == std::numeric_limits<int>::max())
1574       return NoHazardFound;
1575 
1576     // Too many VALUs in intv3?
1577     int Intv3VALUs = PostExecPos;
1578     if (Intv3VALUs > Intv3MaxVALUs)
1579       return HazardExpired;
1580 
1581     // Too many VALUs in intv2?
1582     int Intv2VALUs = (State.ExecPos - PostExecPos) - 1;
1583     if (Intv2VALUs > Intv1plus2MaxVALUs)
1584       return HazardExpired;
1585 
1586     // Need a VALUs pre exec change
1587     if (PreExecPos == std::numeric_limits<int>::max())
1588       return NoHazardFound;
1589 
1590     // Too many VALUs in intv1?
1591     int Intv1VALUs = PreExecPos - State.ExecPos;
1592     if (Intv1VALUs > Intv1plus2MaxVALUs)
1593       return HazardExpired;
1594 
1595     // Too many VALUs in intv1 + intv2
1596     if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs)
1597       return HazardExpired;
1598 
1599     return HazardFound;
1600   };
1601   auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
1602     if (SIInstrInfo::isVALU(MI))
1603       State.VALUs += 1;
1604   };
1605 
1606   DenseSet<const MachineBasicBlock *> Visited;
1607   if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
1608                             std::next(MI->getReverseIterator()), Visited))
1609     return false;
1610 
1611   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1612           TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1613       .addImm(0x0fff);
1614 
1615   return true;
1616 }
1617 
1618 bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
1619   if (!ST.hasVALUTransUseHazard())
1620     return false;
1621   if (!SIInstrInfo::isVALU(*MI))
1622     return false;
1623 
1624   SmallSet<Register, 4> SrcVGPRs;
1625 
1626   for (const MachineOperand &Use : MI->explicit_uses()) {
1627     if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1628       SrcVGPRs.insert(Use.getReg());
1629   }
1630 
1631   // Look for the following pattern:
1632   //   Va <- TRANS VALU
1633   //   intv
1634   //   MI Va (WaitState = 0)
1635   //
1636   // Where:
1637   // intv <= 5 VALUs / 1 TRANS
1638   //
1639   // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
1640 
1641   const int IntvMaxVALUs = 5;
1642   const int IntvMaxTRANS = 1;
1643 
1644   struct StateType {
1645     int VALUs = 0;
1646     int TRANS = 0;
1647   };
1648 
1649   StateType State;
1650 
1651   // This overloads expiry testing with all the hazard detection
1652   auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
1653     // Too many VALU states have passed
1654     if (State.VALUs > IntvMaxVALUs || State.TRANS > IntvMaxTRANS)
1655       return HazardExpired;
1656 
1657     // Instructions which cause va_vdst==0 expire hazard
1658     if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
1659         SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) ||
1660         (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1661          I.getOperand(0).getImm() == 0x0fff))
1662       return HazardExpired;
1663 
1664     // Track registers writes
1665     if (SIInstrInfo::isTRANS(I)) {
1666       for (Register Src : SrcVGPRs) {
1667         if (I.modifiesRegister(Src, &TRI)) {
1668           return HazardFound;
1669         }
1670       }
1671     }
1672 
1673     return NoHazardFound;
1674   };
1675   auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
1676     if (SIInstrInfo::isVALU(MI))
1677       State.VALUs += 1;
1678     if (SIInstrInfo::isTRANS(MI))
1679       State.TRANS += 1;
1680   };
1681 
1682   DenseSet<const MachineBasicBlock *> Visited;
1683   if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
1684                             std::next(MI->getReverseIterator()), Visited))
1685     return false;
1686 
1687   // Hazard is observed - insert a wait on va_dst counter to ensure hazard is
1688   // avoided.
1689   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1690           TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1691       .addImm(AMDGPU::DepCtr::encodeFieldVaVdst(0));
1692 
1693   return true;
1694 }
1695 
1696 bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
1697   if (!SIInstrInfo::isWMMA(*MI))
1698     return false;
1699 
1700   const SIInstrInfo *TII = ST.getInstrInfo();
1701   const SIRegisterInfo *TRI = ST.getRegisterInfo();
1702 
1703   auto IsHazardFn = [MI, TII, TRI](const MachineInstr &I) {
1704     if (!SIInstrInfo::isWMMA(I))
1705       return false;
1706 
1707     // Src0 or Src1 of the current wmma instruction overlaps with the dest of
1708     // the previous wmma.
1709     const Register CurSrc0Reg =
1710         TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg();
1711     const Register CurSrc1Reg =
1712         TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();
1713 
1714     const Register PrevDstReg =
1715         TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
1716 
1717     if (TRI->regsOverlap(PrevDstReg, CurSrc0Reg) ||
1718         TRI->regsOverlap(PrevDstReg, CurSrc1Reg)) {
1719       return true;
1720     }
1721 
1722     // Src2 of the current wmma instruction overlaps with the dest of the
1723     // previous wmma.
1724     const MachineOperand *Src2 =
1725         TII->getNamedOperand(*MI, AMDGPU::OpName::src2);
1726     const Register CurSrc2Reg = Src2->isReg() ? Src2->getReg() : Register();
1727 
1728     if (CurSrc2Reg != AMDGPU::NoRegister &&
1729         TRI->regsOverlap(PrevDstReg, CurSrc2Reg)) {
1730 
1731       const MachineOperand *Src2Mods =
1732           TII->getNamedOperand(*MI, AMDGPU::OpName::src2_modifiers);
1733       const bool NoSrc2Mods =
1734           (Src2Mods->getImm() & (SISrcMods::NEG | SISrcMods::NEG_HI)) == 0;
1735       // Exception: there is no hazard if the wmma instructions are of the same
1736       // type and there is no input modifier on src2 of the current instruction.
1737       return !(NoSrc2Mods && (TII->pseudoToMCOpcode(I.getOpcode()) ==
1738                               TII->pseudoToMCOpcode(MI->getOpcode())));
1739     }
1740 
1741     return false;
1742   };
1743 
1744   auto IsExpiredFn = [](const MachineInstr &I, int) {
1745     return SIInstrInfo::isVALU(I);
1746   };
1747 
1748   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1749       std::numeric_limits<int>::max())
1750     return false;
1751 
1752   BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
1753 
1754   return true;
1755 }
1756 
1757 bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
1758   if (!ST.hasShift64HighRegBug())
1759     return false;
1760 
1761   switch (MI->getOpcode()) {
1762   default:
1763     return false;
1764   case AMDGPU::V_LSHLREV_B64_e64:
1765   case AMDGPU::V_LSHRREV_B64_e64:
1766   case AMDGPU::V_ASHRREV_I64_e64:
1767     break;
1768   }
1769 
1770   MachineOperand *Amt = TII.getNamedOperand(*MI, AMDGPU::OpName::src0);
1771   if (!Amt->isReg())
1772     return false;
1773 
1774   Register AmtReg = Amt->getReg();
1775   const MachineRegisterInfo &MRI = MF.getRegInfo();
1776   // Check if this is a last VGPR in the allocation block.
1777   if (!TRI.isVGPR(MRI, AmtReg) || ((AmtReg - AMDGPU::VGPR0) & 7) != 7)
1778     return false;
1779 
1780   if (AmtReg != AMDGPU::VGPR255 && MRI.isPhysRegUsed(AmtReg + 1))
1781     return false;
1782 
1783   MachineOperand *Src1 = TII.getNamedOperand(*MI, AMDGPU::OpName::src1);
1784   bool OverlappedSrc = Src1->isReg() && TRI.regsOverlap(Src1->getReg(), AmtReg);
1785   bool OverlappedDst = MI->modifiesRegister(AmtReg, &TRI);
1786   bool Overlapped = OverlappedSrc || OverlappedDst;
1787 
1788   assert(!OverlappedDst || !OverlappedSrc ||
1789          Src1->getReg() == MI->getOperand(0).getReg());
1790   assert(ST.needsAlignedVGPRs());
1791   static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1);
1792 
1793   Register NewReg;
1794   for (MCRegister Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass
1795                                    : AMDGPU::VGPR_32RegClass) {
1796     if (!MI->modifiesRegister(Reg, &TRI) && !MI->readsRegister(Reg, &TRI)) {
1797       NewReg = Reg;
1798       break;
1799     }
1800   }
1801 
1802   Register NewAmt = Overlapped ? (Register)TRI.getSubReg(NewReg, AMDGPU::sub1)
1803                                : NewReg;
1804   Register NewAmtLo;
1805 
1806   if (Overlapped)
1807     NewAmtLo = TRI.getSubReg(NewReg, AMDGPU::sub0);
1808 
1809   DebugLoc DL = MI->getDebugLoc();
1810   MachineBasicBlock *MBB = MI->getParent();
1811   // Insert a full wait count because found register might be pending a wait.
1812   BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_WAITCNT))
1813       .addImm(0);
1814 
1815   // Insert V_SWAP_B32 instruction(s) and run hazard recognizer on them.
1816   if (Overlapped)
1817     runOnInstruction(
1818         BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmtLo)
1819             .addDef(AmtReg - 1)
1820             .addReg(AmtReg - 1, RegState::Undef)
1821             .addReg(NewAmtLo, RegState::Undef));
1822   runOnInstruction(BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmt)
1823                        .addDef(AmtReg)
1824                        .addReg(AmtReg, RegState::Undef)
1825                        .addReg(NewAmt, RegState::Undef));
1826 
1827   // Instructions emitted after the current instruction will be processed by the
1828   // parent loop of the hazard recognizer in a natural way.
1829   BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
1830           AmtReg)
1831       .addDef(NewAmt)
1832       .addReg(NewAmt)
1833       .addReg(AmtReg);
1834   if (Overlapped)
1835     BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
1836             AmtReg - 1)
1837         .addDef(NewAmtLo)
1838         .addReg(NewAmtLo)
1839         .addReg(AmtReg - 1);
1840 
1841   // Re-running hazard recognizer on the modified instruction is not necessary,
1842   // inserted V_SWAP_B32 has already both read and write new registers so
1843   // hazards related to these register has already been handled.
1844   Amt->setReg(NewAmt);
1845   Amt->setIsKill(false);
1846   // We do not update liveness, so verifier may see it as undef.
1847   Amt->setIsUndef();
1848   if (OverlappedDst)
1849     MI->getOperand(0).setReg(NewReg);
1850   if (OverlappedSrc) {
1851     Src1->setReg(NewReg);
1852     Src1->setIsKill(false);
1853     Src1->setIsUndef();
1854   }
1855 
1856   return true;
1857 }
1858 
1859 int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
1860   int NSAtoVMEMWaitStates = 1;
1861 
1862   if (!ST.hasNSAtoVMEMBug())
1863     return 0;
1864 
1865   if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isMTBUF(*MI))
1866     return 0;
1867 
1868   const SIInstrInfo *TII = ST.getInstrInfo();
1869   const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
1870   if (!Offset || (Offset->getImm() & 6) == 0)
1871     return 0;
1872 
1873   auto IsHazardFn = [TII](const MachineInstr &I) {
1874     if (!SIInstrInfo::isMIMG(I))
1875       return false;
1876     const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I.getOpcode());
1877     return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
1878            TII->getInstSizeInBytes(I) >= 16;
1879   };
1880 
1881   return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
1882 }
1883 
1884 int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
1885   int FPAtomicToDenormModeWaitStates = 3;
1886 
1887   if (!ST.hasFPAtomicToDenormModeHazard())
1888     return 0;
1889 
1890   if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
1891     return 0;
1892 
1893   auto IsHazardFn = [](const MachineInstr &I) {
1894     if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I))
1895       return false;
1896     return SIInstrInfo::isFPAtomic(I);
1897   };
1898 
1899   auto IsExpiredFn = [](const MachineInstr &MI, int WaitStates) {
1900     if (WaitStates >= 3 || SIInstrInfo::isVALU(MI))
1901       return true;
1902 
1903     switch (MI.getOpcode()) {
1904     case AMDGPU::S_WAITCNT:
1905     case AMDGPU::S_WAITCNT_VSCNT:
1906     case AMDGPU::S_WAITCNT_VMCNT:
1907     case AMDGPU::S_WAITCNT_EXPCNT:
1908     case AMDGPU::S_WAITCNT_LGKMCNT:
1909     case AMDGPU::S_WAIT_IDLE:
1910       return true;
1911     default:
1912       break;
1913     }
1914 
1915     return false;
1916   };
1917 
1918   return FPAtomicToDenormModeWaitStates -
1919          ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
1920 }
1921 
1922 int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
1923   assert(SIInstrInfo::isMAI(*MI));
1924 
1925   return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI);
1926 }
1927 
1928 int GCNHazardRecognizer::checkMFMAPadding(MachineInstr *MI) {
1929   // Early exit if no padding is requested.
1930   if (MFMAPaddingRatio == 0)
1931     return 0;
1932 
1933   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1934   if (!SIInstrInfo::isMFMA(*MI) || MFI->getOccupancy() < 2)
1935     return 0;
1936 
1937   int NeighborMFMALatency = 0;
1938   auto IsNeighboringMFMA = [&NeighborMFMALatency,
1939                             this](const MachineInstr &MI) {
1940     if (!SIInstrInfo::isMFMA(MI))
1941       return false;
1942 
1943     NeighborMFMALatency = this->getMFMAPipelineWaitStates(MI);
1944     return true;
1945   };
1946 
1947   const int MaxMFMAPipelineWaitStates = 16;
1948   int WaitStatesSinceNeighborMFMA =
1949       getWaitStatesSince(IsNeighboringMFMA, MaxMFMAPipelineWaitStates);
1950 
1951   int NeighborMFMAPaddingNeeded =
1952       (NeighborMFMALatency * MFMAPaddingRatio / 100) -
1953       WaitStatesSinceNeighborMFMA;
1954 
1955   return std::max(0, NeighborMFMAPaddingNeeded);
1956 }
1957 
1958 int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) {
1959   int WaitStatesNeeded = 0;
1960   unsigned Opc = MI->getOpcode();
1961 
1962   auto IsVALUFn = [](const MachineInstr &MI) {
1963     return SIInstrInfo::isVALU(MI) || MI.isInlineAsm();
1964   };
1965 
1966   if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) { // MFMA or v_accvgpr_write
1967     const int LegacyVALUWritesVGPRWaitStates = 2;
1968     const int VALUWritesExecWaitStates = 4;
1969     const int MaxWaitStates = 4;
1970 
1971     int WaitStatesNeededForUse = VALUWritesExecWaitStates -
1972       getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
1973     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1974 
1975     if (WaitStatesNeeded < MaxWaitStates) {
1976       for (const MachineOperand &Use : MI->explicit_uses()) {
1977         const int MaxWaitStates = 2;
1978 
1979         if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1980           continue;
1981 
1982         int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
1983           getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
1984         WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1985 
1986         if (WaitStatesNeeded == MaxWaitStates)
1987           break;
1988       }
1989     }
1990   }
1991 
1992   for (const MachineOperand &Op : MI->explicit_operands()) {
1993     if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
1994       continue;
1995 
1996     if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
1997       continue;
1998 
1999     const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
2000     const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
2001     const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
2002     const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
2003     const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
2004     const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
2005     const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
2006     const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
2007     const int MaxWaitStates = 18;
2008     Register Reg = Op.getReg();
2009     unsigned HazardDefLatency = 0;
2010 
2011     auto IsOverlappedMFMAFn = [Reg, &HazardDefLatency,
2012                                this](const MachineInstr &MI) {
2013       if (!SIInstrInfo::isMFMA(MI))
2014         return false;
2015       Register DstReg = MI.getOperand(0).getReg();
2016       if (DstReg == Reg)
2017         return false;
2018       HazardDefLatency =
2019           std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
2020       return TRI.regsOverlap(DstReg, Reg);
2021     };
2022 
2023     int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
2024                                                    MaxWaitStates);
2025     int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
2026     int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
2027     int OpNo = Op.getOperandNo();
2028     if (OpNo == SrcCIdx) {
2029       NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
2030     } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
2031       switch (HazardDefLatency) {
2032       case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
2033                break;
2034       case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
2035                break;
2036       case 16: [[fallthrough]];
2037       default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
2038                break;
2039       }
2040     } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2041       switch (HazardDefLatency) {
2042       case 2:  NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
2043                break;
2044       case 8:  NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
2045                break;
2046       case 16: [[fallthrough]];
2047       default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
2048                break;
2049       }
2050     }
2051 
2052     int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2053     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2054 
2055     if (WaitStatesNeeded == MaxWaitStates)
2056       return WaitStatesNeeded; // Early exit.
2057 
2058     auto IsAccVgprWriteFn = [Reg, this](const MachineInstr &MI) {
2059       if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2060         return false;
2061       Register DstReg = MI.getOperand(0).getReg();
2062       return TRI.regsOverlap(Reg, DstReg);
2063     };
2064 
2065     const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
2066     const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
2067     const int AccVGPRWriteAccVgprReadWaitStates = 3;
2068     NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
2069     if (OpNo == SrcCIdx)
2070       NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
2071     else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
2072       NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;
2073 
2074     WaitStatesNeededForUse = NeedWaitStates -
2075       getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
2076     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2077 
2078     if (WaitStatesNeeded == MaxWaitStates)
2079       return WaitStatesNeeded; // Early exit.
2080   }
2081 
2082   if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2083     const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
2084     const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
2085     const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
2086     const int MaxWaitStates = 13;
2087     Register DstReg = MI->getOperand(0).getReg();
2088     unsigned HazardDefLatency = 0;
2089 
2090     auto IsSrcCMFMAFn = [DstReg, &HazardDefLatency,
2091                          this](const MachineInstr &MI) {
2092       if (!SIInstrInfo::isMFMA(MI))
2093         return false;
2094       Register Reg = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
2095       HazardDefLatency =
2096           std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
2097       return TRI.regsOverlap(Reg, DstReg);
2098     };
2099 
2100     int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
2101     int NeedWaitStates;
2102     switch (HazardDefLatency) {
2103     case 2:  NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
2104              break;
2105     case 8:  NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
2106              break;
2107     case 16: [[fallthrough]];
2108     default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
2109              break;
2110     }
2111 
2112     int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
2113     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2114   }
2115 
2116   // Pad neighboring MFMA with noops for better inter-wave performance.
2117   WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));
2118 
2119   return WaitStatesNeeded;
2120 }
2121 
2122 int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
2123   int WaitStatesNeeded = 0;
2124   unsigned Opc = MI->getOpcode();
2125 
2126   auto IsLegacyVALUFn = [](const MachineInstr &MI) {
2127     return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI);
2128   };
2129 
2130   auto IsLegacyVALUNotDotFn = [](const MachineInstr &MI) {
2131     return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI) &&
2132            !SIInstrInfo::isDOT(MI);
2133   };
2134 
2135   if (!SIInstrInfo::isMFMA(*MI))
2136     return WaitStatesNeeded;
2137 
2138   const int VALUWritesExecWaitStates = 4;
2139   int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2140     getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn,
2141                           VALUWritesExecWaitStates);
2142   WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2143 
2144   int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
2145 
2146   // Loop for both DGEMM and S/HGEMM 2nd instruction.
2147   for (const MachineOperand &Use : MI->explicit_uses()) {
2148     const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
2149     const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
2150     const int GFX940_XDL2PassWritesVGPROverlappedSMFMASrcCWaitStates = 3;
2151     const int GFX940_XDL4PassWritesVGPROverlappedSMFMASrcCWaitStates = 5;
2152     const int GFX940_SMFMA4PassWritesVGPROverlappedSMFMASrcCWaitStates = 4;
2153     const int GFX940_XDL8PassWritesVGPROverlappedSMFMASrcCWaitStates = 9;
2154     const int GFX940_SMFMA8PassWritesVGPROverlappedSMFMASrcCWaitStates = 8;
2155     const int GFX940_XDL16PassWritesVGPROverlappedSMFMASrcCWaitStates = 17;
2156     const int GFX940_SMFMA16PassWritesVGPROverlappedSMFMASrcCWaitStates = 16;
2157     const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
2158     const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
2159     const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
2160     const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
2161     const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
2162     const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
2163     const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
2164     const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
2165     const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
2166     const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
2167     const int GFX940_SMFMA2PassWritesVGPROverlappedSrcABWaitStates = 4;
2168     const int GFX940_SMFMA4PassWritesVGPROverlappedSrcABWaitStates = 6;
2169     const int GFX940_SMFMA8PassWritesVGPROverlappedSrcABWaitStates = 10;
2170     const int GFX940_SMFMA16PassWritesVGPROverlappedSrcABWaitStates = 18;
2171     const int GFX940_XDL2PassWritesVGPROverlappedSrcABWaitStates = 5;
2172     const int GFX940_XDL4PassWritesVGPROverlappedSrcABWaitStates = 7;
2173     const int GFX940_XDL8PassWritesVGPROverlappedSrcABWaitStates = 11;
2174     const int GFX940_XDL16PassWritesVGPROverlappedSrcABWaitStates = 19;
2175     const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
2176     const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
2177     const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
2178     const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2;
2179     const int MaxWaitStates = 19;
2180 
2181     if (!Use.isReg())
2182       continue;
2183     Register Reg = Use.getReg();
2184     bool FullReg;
2185     const MachineInstr *MI1;
2186 
2187     auto IsOverlappedMFMAFn = [Reg, &FullReg, &MI1,
2188                                this](const MachineInstr &MI) {
2189       if (!SIInstrInfo::isMFMA(MI))
2190         return false;
2191       Register DstReg = MI.getOperand(0).getReg();
2192       FullReg = (DstReg == Reg);
2193       MI1 = &MI;
2194       return TRI.regsOverlap(DstReg, Reg);
2195     };
2196 
2197     WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
2198       getWaitStatesSinceDef(Reg, IsLegacyVALUNotDotFn, MaxWaitStates);
2199     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2200 
2201     int NumWaitStates =
2202         getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn, MaxWaitStates);
2203     if (NumWaitStates == std::numeric_limits<int>::max())
2204       continue;
2205 
2206     int OpNo = Use.getOperandNo();
2207     unsigned Opc1 = MI1->getOpcode();
2208     int NeedWaitStates = 0;
2209     if (OpNo == SrcCIdx) {
2210       if (!isDGEMM(Opc) && (!ST.hasGFX940Insts() && isDGEMM(Opc1))) {
2211         NeedWaitStates = 0;
2212       } else if (FullReg) {
2213         if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2214              Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
2215             (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2216              Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
2217           NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
2218         else if (ST.hasGFX940Insts() &&
2219                  TSchedModel.computeInstrLatency(MI1) == 2)
2220           NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates;
2221       } else {
2222         switch (Opc1) {
2223         case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2224         case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2225         case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2226         case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2227           if (!isXDL(ST, *MI))
2228             NeedWaitStates = DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
2229           break;
2230         case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2231         case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2232           if (!isXDL(ST, *MI))
2233             NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
2234           break;
2235         default:
2236           if (ST.hasGFX940Insts() && isXDL(ST, *MI) && !isXDL(ST, *MI1))
2237             break;
2238           switch (TSchedModel.computeInstrLatency(MI1)) {
2239           case 2:
2240             NeedWaitStates = ST.hasGFX940Insts()
2241               ? isXDL(ST, *MI1)
2242                 ? GFX940_XDL2PassWritesVGPROverlappedSMFMASrcCWaitStates
2243                 : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates
2244               : isDGEMM(Opc)
2245                 ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
2246                 : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
2247             break;
2248           case 4:
2249             assert(ST.hasGFX940Insts());
2250             NeedWaitStates = isXDL(ST, *MI1)
2251               ? GFX940_XDL4PassWritesVGPROverlappedSMFMASrcCWaitStates
2252               : GFX940_SMFMA4PassWritesVGPROverlappedSMFMASrcCWaitStates;
2253             break;
2254           case 8:
2255             NeedWaitStates = ST.hasGFX940Insts()
2256               ? isXDL(ST, *MI1)
2257                 ? GFX940_XDL8PassWritesVGPROverlappedSMFMASrcCWaitStates
2258                 : GFX940_SMFMA8PassWritesVGPROverlappedSMFMASrcCWaitStates
2259               : isDGEMM(Opc)
2260                 ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
2261                 : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
2262             break;
2263           case 16: [[fallthrough]];
2264           default:
2265             NeedWaitStates = ST.hasGFX940Insts()
2266               ? isXDL(ST, *MI1)
2267                 ? GFX940_XDL16PassWritesVGPROverlappedSMFMASrcCWaitStates
2268                 : GFX940_SMFMA16PassWritesVGPROverlappedSMFMASrcCWaitStates
2269               : isDGEMM(Opc)
2270                 ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
2271                 : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
2272           }
2273         }
2274       }
2275     } else {
2276       switch (Opc1) {
2277       case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2278       case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2279       case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2280       case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2281         NeedWaitStates = DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
2282         break;
2283       case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2284       case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2285         NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;
2286         break;
2287       default:
2288         switch (TSchedModel.computeInstrLatency(MI1)) {
2289         case 2:
2290           NeedWaitStates = ST.hasGFX940Insts()
2291             ? isXDL(ST, *MI1)
2292               ? GFX940_XDL2PassWritesVGPROverlappedSrcABWaitStates
2293               : GFX940_SMFMA2PassWritesVGPROverlappedSrcABWaitStates
2294             : SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
2295           break;
2296         case 4:
2297           assert(ST.hasGFX940Insts());
2298           NeedWaitStates = isXDL(ST, *MI1)
2299             ? GFX940_XDL4PassWritesVGPROverlappedSrcABWaitStates
2300             : GFX940_SMFMA4PassWritesVGPROverlappedSrcABWaitStates;
2301           break;
2302         case 8:
2303           NeedWaitStates = ST.hasGFX940Insts()
2304             ? isXDL(ST, *MI1)
2305               ? GFX940_XDL8PassWritesVGPROverlappedSrcABWaitStates
2306               : GFX940_SMFMA8PassWritesVGPROverlappedSrcABWaitStates
2307             : SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
2308           break;
2309         case 16: [[fallthrough]];
2310         default:
2311           NeedWaitStates = ST.hasGFX940Insts()
2312             ? isXDL(ST, *MI1)
2313               ? GFX940_XDL16PassWritesVGPROverlappedSrcABWaitStates
2314               : GFX940_SMFMA16PassWritesVGPROverlappedSrcABWaitStates
2315             : SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
2316         }
2317       }
2318     }
2319     if (WaitStatesNeeded >= NeedWaitStates)
2320       continue;
2321 
2322     WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
2323     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2324 
2325     if (WaitStatesNeeded == MaxWaitStates)
2326       break;
2327   }
2328 
2329   return WaitStatesNeeded;
2330 }
2331 
2332 int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
2333   // On gfx90a+ relevant hazards are checked in checkMAIVALUHazards()
2334   if (!ST.hasMAIInsts() || ST.hasGFX90AInsts())
2335     return 0;
2336 
2337   int WaitStatesNeeded = 0;
2338 
2339   auto IsAccVgprReadFn = [](const MachineInstr &MI) {
2340     return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
2341   };
2342 
2343   for (const MachineOperand &Op : MI->explicit_uses()) {
2344     if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
2345       continue;
2346 
2347     Register Reg = Op.getReg();
2348 
2349     const int AccVgprReadLdStWaitStates = 2;
2350     const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
2351     const int MaxWaitStates = 2;
2352 
2353     int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
2354       getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
2355     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2356 
2357     if (WaitStatesNeeded == MaxWaitStates)
2358       return WaitStatesNeeded; // Early exit.
2359 
2360     auto IsVALUAccVgprRdWrCheckFn = [Reg, this](const MachineInstr &MI) {
2361       if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
2362           MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2363         return false;
2364       auto IsVALUFn = [](const MachineInstr &MI) {
2365         return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMAI(MI);
2366       };
2367       return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
2368              std::numeric_limits<int>::max();
2369     };
2370 
2371     WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
2372       getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
2373     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2374   }
2375 
2376   return WaitStatesNeeded;
2377 }
2378 
2379 int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
2380   if (!ST.hasGFX90AInsts())
2381     return 0;
2382 
2383   auto IsDGEMMFn = [](const MachineInstr &MI) -> bool {
2384     return isDGEMM(MI.getOpcode());
2385   };
2386 
2387   // This is checked in checkMAIHazards90A()
2388   if (SIInstrInfo::isMFMA(*MI))
2389     return 0;
2390 
2391   const MachineRegisterInfo &MRI = MF.getRegInfo();
2392 
2393   int WaitStatesNeeded = 0;
2394 
2395   bool IsMem = SIInstrInfo::isVMEM(*MI) ||
2396                SIInstrInfo::isFLAT(*MI) ||
2397                SIInstrInfo::isDS(*MI);
2398   bool IsMemOrExport = IsMem || SIInstrInfo::isEXP(*MI);
2399   bool IsVALU = SIInstrInfo::isVALU(*MI);
2400 
2401   const MachineInstr *MFMA = nullptr;
2402   unsigned Reg;
2403   auto IsMFMAWriteFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
2404     if (!SIInstrInfo::isMFMA(MI) ||
2405         !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
2406       return false;
2407     MFMA = &MI;
2408     return true;
2409   };
2410 
2411   const MachineInstr *DOT = nullptr;
2412   auto IsDotWriteFn = [&Reg, &DOT, this](const MachineInstr &MI) {
2413     if (!SIInstrInfo::isDOT(MI) ||
2414         !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
2415       return false;
2416     DOT = &MI;
2417     return true;
2418   };
2419 
2420   bool DGEMMAfterVALUWrite = false;
2421   auto IsDGEMMHazard = [&DGEMMAfterVALUWrite, this](const MachineInstr &MI) {
2422     // Found DGEMM on reverse traversal to def.
2423     if (isDGEMM(MI.getOpcode()))
2424       DGEMMAfterVALUWrite = true;
2425 
2426     // Only hazard if register is defined by a VALU and a DGEMM is found after
2427     // after the def.
2428     if (!TII.isVALU(MI) || !DGEMMAfterVALUWrite)
2429       return false;
2430 
2431     return true;
2432   };
2433 
2434   int SrcCIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
2435                                            AMDGPU::OpName::src2);
2436 
2437   if (IsMemOrExport || IsVALU) {
2438     const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
2439     const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
2440     const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
2441     const int GFX940_SMFMA2PassWriteVgprVALUMemExpReadWaitStates = 4;
2442     const int GFX940_SMFMA4PassWriteVgprVALUMemExpReadWaitStates = 6;
2443     const int GFX940_SMFMA8PassWriteVgprVALUMemExpReadWaitStates = 10;
2444     const int GFX940_SMFMA16PassWriteVgprVALUMemExpReadWaitStates = 18;
2445     const int GFX940_XDL2PassWriteVgprVALUMemExpReadWaitStates = 5;
2446     const int GFX940_XDL4PassWriteVgprVALUMemExpReadWaitStates = 7;
2447     const int GFX940_XDL8PassWriteVgprVALUMemExpReadWaitStates = 11;
2448     const int GFX940_XDL16PassWriteVgprVALUMemExpReadWaitStates = 19;
2449     const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
2450     const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
2451     const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
2452     const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
2453     const int DotWriteSameDotReadSrcAB = 3;
2454     const int DotWriteDifferentVALURead = 3;
2455     const int DMFMABetweenVALUWriteVMEMRead = 2;
2456     const int MaxWaitStates = 19;
2457 
2458     for (const MachineOperand &Use : MI->explicit_uses()) {
2459       if (!Use.isReg())
2460         continue;
2461       Reg = Use.getReg();
2462 
2463       DOT = nullptr;
2464       int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
2465                                                      MaxWaitStates);
2466       if (DOT) {
2467         int NeedWaitStates = 0;
2468         if (DOT->getOpcode() == MI->getOpcode()) {
2469           if (&Use - &MI->getOperand(0) != SrcCIdx)
2470             NeedWaitStates = DotWriteSameDotReadSrcAB;
2471         } else {
2472           NeedWaitStates = DotWriteDifferentVALURead;
2473         }
2474 
2475         int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2476         WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2477       }
2478 
2479       // Workaround for HW data hazard bug observed only in GFX90A. When there
2480       // is a DGEMM instruction in-between a VALU and a VMEM instruction it
2481       // causes the SQ to incorrectly not insert two wait states between the two
2482       // instructions needed to avoid data hazard.
2483       if (IsMem && ST.hasGFX90AInsts() && !ST.hasGFX940Insts()) {
2484         DGEMMAfterVALUWrite = false;
2485         if (TRI.isVectorRegister(MRI, Reg)) {
2486           int WaitStatesNeededForUse =
2487                 DMFMABetweenVALUWriteVMEMRead -
2488                 getWaitStatesSinceDef(Reg, IsDGEMMHazard,
2489                                       DMFMABetweenVALUWriteVMEMRead);
2490 
2491           WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2492         }
2493       }
2494 
2495       MFMA = nullptr;
2496       WaitStatesSinceDef =
2497           getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
2498       if (!MFMA)
2499         continue;
2500 
2501       unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
2502       int NeedWaitStates = MaxWaitStates;
2503       switch (HazardDefLatency) {
2504       case 2:
2505         NeedWaitStates =
2506           ST.hasGFX940Insts()
2507             ? isXDL(ST, *MFMA)
2508               ? GFX940_XDL2PassWriteVgprVALUMemExpReadWaitStates
2509               : GFX940_SMFMA2PassWriteVgprVALUMemExpReadWaitStates
2510             : SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
2511         break;
2512       case 4:
2513         assert(isDGEMM(MFMA->getOpcode()) || ST.hasGFX940Insts());
2514         NeedWaitStates =
2515           isDGEMM(MFMA->getOpcode())
2516             ? IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
2517                             : DMFMA4x4WriteVgprVALUReadWaitStates
2518             : isXDL(ST, *MFMA)
2519               ? GFX940_XDL4PassWriteVgprVALUMemExpReadWaitStates
2520               : GFX940_SMFMA4PassWriteVgprVALUMemExpReadWaitStates;
2521         break;
2522       case 8:
2523         NeedWaitStates =
2524           ST.hasGFX940Insts()
2525             ? isXDL(ST, *MFMA)
2526               ? GFX940_XDL8PassWriteVgprVALUMemExpReadWaitStates
2527               : GFX940_SMFMA8PassWriteVgprVALUMemExpReadWaitStates
2528             : SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
2529         break;
2530       case 16: [[fallthrough]];
2531       default:
2532         NeedWaitStates =
2533           isDGEMM(MFMA->getOpcode())
2534             ? IsMemOrExport ? DMFMA16x16WriteVgprMemExpReadWaitStates
2535                             : DMFMA16x16WriteVgprVALUReadWaitStates
2536             : ST.hasGFX940Insts()
2537               ? isXDL(ST, *MFMA)
2538                 ? GFX940_XDL16PassWriteVgprVALUMemExpReadWaitStates
2539                 : GFX940_SMFMA16PassWriteVgprVALUMemExpReadWaitStates
2540               : SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
2541         break;
2542       }
2543 
2544       int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2545       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2546 
2547       if (WaitStatesNeeded == MaxWaitStates)
2548         break;
2549     }
2550   }
2551 
2552   unsigned Opc = MI->getOpcode();
2553   const int DMFMAToFMA64WaitStates = 2;
2554   if ((Opc == AMDGPU::V_FMA_F64_e64 ||
2555        Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 ||
2556        Opc == AMDGPU::V_FMAC_F64_dpp) &&
2557       WaitStatesNeeded < DMFMAToFMA64WaitStates) {
2558     int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
2559       getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates);
2560     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2561   }
2562 
2563   if (!IsVALU && !IsMemOrExport)
2564     return WaitStatesNeeded;
2565 
2566   for (const MachineOperand &Def : MI->defs()) {
2567     const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
2568     const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
2569     const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
2570     const int GFX940_SMFMA2PassWriteVgprVALUWawWaitStates = 4;
2571     const int GFX940_SMFMA4PassWriteVgprVALUWawWaitStates = 6;
2572     const int GFX940_SMFMA8PassWriteVgprVALUWawWaitStates = 10;
2573     const int GFX940_SMFMA16PassWriteVgprVALUWawWaitStates = 18;
2574     const int GFX940_XDL2PassWriteVgprVALUWawWaitStates = 5;
2575     const int GFX940_XDL4PassWriteVgprVALUWawWaitStates = 7;
2576     const int GFX940_XDL8PassWriteVgprVALUWawWaitStates = 11;
2577     const int GFX940_XDL16PassWriteVgprVALUWawWaitStates = 19;
2578     const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
2579     const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3;
2580     const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
2581     const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
2582     const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
2583     const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
2584     const int DotWriteDifferentVALUWrite = 3;
2585     const int MaxWaitStates = 19;
2586     const int MaxWarWaitStates = 15;
2587 
2588     Reg = Def.getReg();
2589 
2590     DOT = nullptr;
2591     int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
2592                                                    MaxWaitStates);
2593     if (DOT && DOT->getOpcode() != MI->getOpcode())
2594       WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite -
2595                                                     WaitStatesSinceDef);
2596 
2597     MFMA = nullptr;
2598     WaitStatesSinceDef =
2599         getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
2600     if (MFMA) {
2601       int NeedWaitStates = MaxWaitStates;
2602       switch (TSchedModel.computeInstrLatency(MFMA)) {
2603       case 2:
2604         NeedWaitStates = ST.hasGFX940Insts()
2605           ? isXDL(ST, *MFMA)
2606             ? GFX940_XDL2PassWriteVgprVALUWawWaitStates
2607             : GFX940_SMFMA2PassWriteVgprVALUWawWaitStates
2608           : SMFMA4x4WriteVgprVALUWawWaitStates;
2609         break;
2610       case 4:
2611         assert(isDGEMM(MFMA->getOpcode()) || ST.hasGFX940Insts());
2612         NeedWaitStates = isDGEMM(MFMA->getOpcode())
2613             ? DMFMA4x4WriteVgprVALUWriteWaitStates
2614             : isXDL(ST, *MFMA)
2615               ? GFX940_XDL4PassWriteVgprVALUWawWaitStates
2616               : GFX940_SMFMA4PassWriteVgprVALUWawWaitStates;
2617         break;
2618       case 8:
2619         NeedWaitStates = ST.hasGFX940Insts()
2620           ? isXDL(ST, *MFMA)
2621             ? GFX940_XDL8PassWriteVgprVALUWawWaitStates
2622             : GFX940_SMFMA8PassWriteVgprVALUWawWaitStates
2623           : SMFMA16x16WriteVgprVALUWawWaitStates;
2624         break;
2625       case 16: [[fallthrough]];
2626       default:
2627         NeedWaitStates = isDGEMM(MFMA->getOpcode())
2628                    ? DMFMA16x16WriteVgprVALUWriteWaitStates
2629                    : ST.hasGFX940Insts()
2630                      ? isXDL(ST, *MFMA)
2631                        ? GFX940_XDL16PassWriteVgprVALUWawWaitStates
2632                        : GFX940_SMFMA16PassWriteVgprVALUWawWaitStates
2633                    : SMFMA32x32WriteVgprVALUWawWaitStates;
2634         break;
2635       }
2636 
2637       int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2638       WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2639 
2640       if (WaitStatesNeeded == MaxWaitStates)
2641         break;
2642     }
2643 
2644     auto IsSMFMAReadAsCFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
2645       if (!SIInstrInfo::isMFMA(MI) || isDGEMM(MI.getOpcode()) ||
2646           !MI.readsRegister(Reg, &TRI))
2647         return false;
2648 
2649       if (ST.hasGFX940Insts() && !isXDL(ST, MI))
2650         return false;
2651 
2652       const MachineOperand *SrcC =
2653           TII.getNamedOperand(MI, AMDGPU::OpName::src2);
2654       assert(SrcC);
2655       if (!SrcC->isReg() || !TRI.regsOverlap(SrcC->getReg(), Reg))
2656         return false;
2657 
2658       MFMA = &MI;
2659       return true;
2660     };
2661 
2662     MFMA = nullptr;
2663     int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn,
2664                                                 MaxWarWaitStates);
2665     if (!MFMA)
2666       continue;
2667 
2668     unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
2669     int NeedWaitStates = MaxWaitStates;
2670     switch (HazardDefLatency) {
2671     case 2:  NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
2672              break;
2673     case 4:  assert(ST.hasGFX940Insts());
2674              NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates;
2675              break;
2676     case 8:  NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
2677              break;
2678     case 16: [[fallthrough]];
2679     default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
2680              break;
2681     }
2682 
2683     int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
2684     WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2685   }
2686 
2687   return WaitStatesNeeded;
2688 }
2689 
2690 bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
2691   if (!SU->isInstr())
2692     return false;
2693 
2694   const MachineInstr *MAI = nullptr;
2695 
2696   auto IsMFMAFn = [&MAI](const MachineInstr &MI) {
2697     MAI = nullptr;
2698     if (SIInstrInfo::isMFMA(MI))
2699       MAI = &MI;
2700     return MAI != nullptr;
2701   };
2702 
2703   MachineInstr *MI = SU->getInstr();
2704   if (IsMFMAFn(*MI)) {
2705     int W = getWaitStatesSince(IsMFMAFn, 16);
2706     if (MAI)
2707       return W < (int)TSchedModel.computeInstrLatency(MAI);
2708   }
2709 
2710   return false;
2711 }
2712 
2713 bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
2714   if (!ST.isWave64())
2715     return false;
2716   if (!ST.hasVALUMaskWriteHazard())
2717     return false;
2718   if (!SIInstrInfo::isSALU(*MI))
2719     return false;
2720 
2721   // The hazard sequence is three instructions:
2722   //   1. VALU reads SGPR as mask
2723   //   2. SALU writes SGPR
2724   //   3. SALU reads SGPR
2725   // The hazard can expire if the distance between 2 and 3 is sufficient.
2726   // In practice this happens <10% of the time, hence this always assumes
2727   // the hazard exists if 1 and 2 are present to avoid searching.
2728 
2729   const MachineOperand *SDSTOp = TII.getNamedOperand(*MI, AMDGPU::OpName::sdst);
2730   if (!SDSTOp || !SDSTOp->isReg())
2731     return false;
2732 
2733   const Register HazardReg = SDSTOp->getReg();
2734   if (HazardReg == AMDGPU::EXEC ||
2735       HazardReg == AMDGPU::EXEC_LO ||
2736       HazardReg == AMDGPU::EXEC_HI ||
2737       HazardReg == AMDGPU::M0)
2738     return false;
2739 
2740   auto IsHazardFn = [HazardReg, this](const MachineInstr &I) {
2741     switch (I.getOpcode()) {
2742     case AMDGPU::V_ADDC_U32_e32:
2743     case AMDGPU::V_ADDC_U32_dpp:
2744     case AMDGPU::V_CNDMASK_B16_e32:
2745     case AMDGPU::V_CNDMASK_B16_dpp:
2746     case AMDGPU::V_CNDMASK_B32_e32:
2747     case AMDGPU::V_CNDMASK_B32_dpp:
2748     case AMDGPU::V_DIV_FMAS_F32_e64:
2749     case AMDGPU::V_DIV_FMAS_F64_e64:
2750     case AMDGPU::V_SUBB_U32_e32:
2751     case AMDGPU::V_SUBB_U32_dpp:
2752     case AMDGPU::V_SUBBREV_U32_e32:
2753     case AMDGPU::V_SUBBREV_U32_dpp:
2754       // These implicitly read VCC as mask source.
2755       return HazardReg == AMDGPU::VCC ||
2756              HazardReg == AMDGPU::VCC_LO ||
2757              HazardReg == AMDGPU::VCC_HI;
2758     case AMDGPU::V_ADDC_U32_e64:
2759     case AMDGPU::V_ADDC_U32_e64_dpp:
2760     case AMDGPU::V_CNDMASK_B16_e64:
2761     case AMDGPU::V_CNDMASK_B16_e64_dpp:
2762     case AMDGPU::V_CNDMASK_B32_e64:
2763     case AMDGPU::V_CNDMASK_B32_e64_dpp:
2764     case AMDGPU::V_SUBB_U32_e64:
2765     case AMDGPU::V_SUBB_U32_e64_dpp:
2766     case AMDGPU::V_SUBBREV_U32_e64:
2767     case AMDGPU::V_SUBBREV_U32_e64_dpp: {
2768       // Only check mask register overlaps.
2769       const MachineOperand *SSRCOp = TII.getNamedOperand(I, AMDGPU::OpName::src2);
2770       assert(SSRCOp);
2771       return TRI.regsOverlap(SSRCOp->getReg(), HazardReg);
2772     }
2773     default:
2774       return false;
2775     }
2776   };
2777 
2778   const MachineRegisterInfo &MRI = MF.getRegInfo();
2779   auto IsExpiredFn = [&MRI, this](const MachineInstr &I, int) {
2780     // s_waitcnt_depctr sa_sdst(0) mitigates hazard.
2781     if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
2782         AMDGPU::DepCtr::decodeFieldSaSdst(I.getOperand(0).getImm()) == 0)
2783       return true;
2784 
2785     // VALU access to any SGPR or literal constant other than HazardReg
2786     // mitigates hazard. No need to check HazardReg here as this will
2787     // only be called when !IsHazardFn.
2788     if (!SIInstrInfo::isVALU(I))
2789       return false;
2790     for (int OpNo = 0, End = I.getNumOperands(); OpNo < End; ++OpNo) {
2791       const MachineOperand &Op = I.getOperand(OpNo);
2792       if (Op.isReg()) {
2793         Register OpReg = Op.getReg();
2794         // Only consider uses
2795         if (!Op.isUse())
2796           continue;
2797         // Ignore EXEC
2798         if (OpReg == AMDGPU::EXEC ||
2799             OpReg == AMDGPU::EXEC_LO ||
2800             OpReg == AMDGPU::EXEC_HI)
2801           continue;
2802         // Ignore all implicit uses except VCC
2803         if (Op.isImplicit()) {
2804           if (OpReg == AMDGPU::VCC ||
2805               OpReg == AMDGPU::VCC_LO ||
2806               OpReg == AMDGPU::VCC_HI)
2807             return true;
2808           continue;
2809         }
2810         if (TRI.isSGPRReg(MRI, OpReg))
2811           return true;
2812       } else {
2813         const MCInstrDesc &InstDesc = I.getDesc();
2814         const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
2815         if (!TII.isInlineConstant(Op, OpInfo))
2816           return true;
2817       }
2818     }
2819     return false;
2820   };
2821 
2822   // Check for hazard
2823   if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
2824       std::numeric_limits<int>::max())
2825     return false;
2826 
2827   auto NextMI = std::next(MI->getIterator());
2828 
2829   // Add s_waitcnt_depctr sa_sdst(0) after SALU write.
2830   BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
2831           TII.get(AMDGPU::S_WAITCNT_DEPCTR))
2832       .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
2833 
2834   // SALU write may be s_getpc in a bundle.
2835   if (MI->getOpcode() == AMDGPU::S_GETPC_B64) {
2836     // Update offsets of any references in the bundle.
2837     while (NextMI != MI->getParent()->end() &&
2838            NextMI->isBundledWithPred()) {
2839       for (auto &Operand : NextMI->operands()) {
2840         if (Operand.isGlobal())
2841           Operand.setOffset(Operand.getOffset() + 4);
2842       }
2843       NextMI++;
2844     }
2845   }
2846 
2847   return true;
2848 }
2849