//===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements hazard recognizers for scheduling on GCN processors.
//
//===----------------------------------------------------------------------===//

#include "GCNHazardRecognizer.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/Support/TargetParser.h"

using namespace llvm;

namespace {

struct MFMAPaddingRatioParser : public cl::parser<unsigned> {
  MFMAPaddingRatioParser(cl::Option &O) : cl::parser<unsigned>(O) {}

  bool parse(cl::Option &O, StringRef ArgName, StringRef Arg, unsigned &Value) {
    if (Arg.getAsInteger(0, Value))
      return O.error("'" + Arg + "' value invalid for uint argument!");

    if (Value > 100)
      return O.error("'" + Arg + "' value must be in the range [0, 100]!");

    return false;
  }
};

} // end anonymous namespace

static cl::opt<unsigned, false, MFMAPaddingRatioParser>
    MFMAPaddingRatio("amdgpu-mfma-padding-ratio", cl::init(0), cl::Hidden,
                     cl::desc("Fill a percentage of the latency between "
                              "neighboring MFMA with s_nops."));

//===----------------------------------------------------------------------===//
// Hazard Recognizer Implementation
//===----------------------------------------------------------------------===//

static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
                                                 const GCNSubtarget &ST);

GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) :
  IsHazardRecognizerMode(false),
  CurrCycleInstr(nullptr),
  MF(MF),
  ST(MF.getSubtarget<GCNSubtarget>()),
  TII(*ST.getInstrInfo()),
  TRI(TII.getRegisterInfo()),
  ClauseUses(TRI.getNumRegUnits()),
  ClauseDefs(TRI.getNumRegUnits()) {
  MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5;
  TSchedModel.init(&ST);
  RunLdsBranchVmemWARHazardFixup = shouldRunLdsBranchVmemWARHazardFixup(MF, ST);
}

void GCNHazardRecognizer::Reset() {
  EmittedInstrs.clear();
}

void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
  EmitInstruction(SU->getInstr());
}

void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
  CurrCycleInstr = MI;
}

static bool isDivFMas(unsigned Opcode) {
  return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 || Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
}

static bool isSGetReg(unsigned Opcode) {
  return Opcode == AMDGPU::S_GETREG_B32;
}

static bool isSSetReg(unsigned Opcode) {
  switch (Opcode) {
  case AMDGPU::S_SETREG_B32:
  case AMDGPU::S_SETREG_B32_mode:
  case AMDGPU::S_SETREG_IMM32_B32:
  case AMDGPU::S_SETREG_IMM32_B32_mode:
    return true;
  }
  return false;
}

static bool isRWLane(unsigned Opcode) {
  return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
}

static bool isRFE(unsigned Opcode) {
  return Opcode == AMDGPU::S_RFE_B64;
}

static bool isSMovRel(unsigned Opcode) {
  switch (Opcode) {
  case AMDGPU::S_MOVRELS_B32:
  case AMDGPU::S_MOVRELS_B64:
  case AMDGPU::S_MOVRELD_B32:
  case AMDGPU::S_MOVRELD_B64:
    return true;
  default:
    return false;
  }
}

static bool isDGEMM(unsigned Opcode) {
  return AMDGPU::getMAIIsDGEMM(Opcode);
}

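// Checks whether \p MI is an MFMA that executes on the XDL pipeline. DGEMM
// instructions and the accumulator-VGPR read/write pseudos are excluded; on
// pre-gfx940 targets every remaining MFMA is treated as XDL.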
static bool isXDL(const GCNSubtarget &ST, const MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();

  if (!SIInstrInfo::isMAI(MI) ||
      isDGEMM(Opcode) ||
      Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
      Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
    return false;

  if (!ST.hasGFX940Insts())
    return true;

  return AMDGPU::getMAIIsGFX940XDL(Opcode);
}

static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
                                    const MachineInstr &MI) {
  if (TII.isAlwaysGDS(MI.getOpcode()))
    return true;

  switch (MI.getOpcode()) {
  case AMDGPU::S_SENDMSG:
  case AMDGPU::S_SENDMSGHALT:
  case AMDGPU::S_TTRACEDATA:
    return true;
  // These DS opcodes don't support GDS.
  case AMDGPU::DS_NOP:
  case AMDGPU::DS_PERMUTE_B32:
  case AMDGPU::DS_BPERMUTE_B32:
    return false;
  default:
    if (TII.isDS(MI.getOpcode())) {
      int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                           AMDGPU::OpName::gds);
      if (MI.getOperand(GDS).getImm())
        return true;
    }
    return false;
  }
}

static bool isPermlane(const MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();
  return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANEX16_B32_e64;
}

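// LDS DMA operations (buffer/flat loads that write their data directly to
// LDS) are identified here as instructions carrying both the VALU and the
// MUBUF/FLAT flags.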
static bool isLdsDma(const MachineInstr &MI) {
  return SIInstrInfo::isVALU(MI) &&
         (SIInstrInfo::isMUBUF(MI) || SIInstrInfo::isFLAT(MI));
}

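// Returns the hardware register ID encoded in the simm16 operand of an
// S_GETREG/S_SETREG instruction.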
static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
  const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
                                                     AMDGPU::OpName::simm16);
  return RegOp->getImm() & AMDGPU::Hwreg::ID_MASK_;
}

ScheduleHazardRecognizer::HazardType
GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
  MachineInstr *MI = SU->getInstr();
  // If we are not in "HazardRecognizerMode" and therefore not being run from
  // the scheduler, track possible stalls from hazards but don't insert noops.
  auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard;

  if (MI->isBundle())
    return NoHazard;

  if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
    return HazardType;

  if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
    return HazardType;

  if (checkFPAtomicToDenormModeHazard(MI) > 0)
    return HazardType;

  if (ST.hasNoDataDepHazard())
    return NoHazard;

  // FIXME: Should flat be considered vmem?
  if ((SIInstrInfo::isVMEM(*MI) ||
       SIInstrInfo::isFLAT(*MI))
      && checkVMEMHazards(MI) > 0)
    return HazardType;

  if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
    return HazardType;

  if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
    return HazardType;

  if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
    return HazardType;

  if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
    return HazardType;

  if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
       SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) ||
       SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0)
    return HazardType;

  if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
    return HazardType;

  if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
    return HazardType;

  if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
    return HazardType;

  if (((ST.hasReadM0MovRelInterpHazard() &&
        (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
         MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
         MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
       (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
       (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
       (ST.hasReadM0LdsDirectHazard() &&
        MI->readsRegister(AMDGPU::LDS_DIRECT))) &&
      checkReadM0Hazards(MI) > 0)
    return HazardType;

  if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
    return HazardType;

  if ((SIInstrInfo::isVMEM(*MI) ||
       SIInstrInfo::isFLAT(*MI) ||
       SIInstrInfo::isDS(*MI)) && checkMAILdStHazards(MI) > 0)
    return HazardType;

  if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
    return HazardType;

  return NoHazard;
}

static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII,
                                unsigned Quantity) {
  while (Quantity > 0) {
    unsigned Arg = std::min(Quantity, 8u);
    Quantity -= Arg;
    BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
        .addImm(Arg - 1);
  }
}

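// Number of wait states an MFMA occupies in its execution pipeline, taken
// from the first write-resource entry of the instruction's scheduling class.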
unsigned
GCNHazardRecognizer::getMFMAPipelineWaitStates(const MachineInstr &MI) const {
  const MCSchedClassDesc *SC = TSchedModel.resolveSchedClass(&MI);
  assert(TSchedModel.getWriteProcResBegin(SC) !=
         TSchedModel.getWriteProcResEnd(SC));
  return TSchedModel.getWriteProcResBegin(SC)->Cycles;
}

void GCNHazardRecognizer::processBundle() {
  MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator());
  MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
  // Check bundled MachineInstrs for hazards.
  for (; MI != E && MI->isInsideBundle(); ++MI) {
    CurrCycleInstr = &*MI;
    unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);

    if (IsHazardRecognizerMode) {
      fixHazards(CurrCycleInstr);

      insertNoopsInBundle(CurrCycleInstr, TII, WaitStates);
    }

    // It's unnecessary to track more than MaxLookAhead instructions. Since we
    // include the bundled MI directly after, only add a maximum of
    // (MaxLookAhead - 1) noops to EmittedInstrs.
    for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
      EmittedInstrs.push_front(nullptr);

    EmittedInstrs.push_front(CurrCycleInstr);
    EmittedInstrs.resize(MaxLookAhead);
  }
  CurrCycleInstr = nullptr;
}

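// Process a single instruction while running in hazard recognizer mode:
// insert any required no-ops before \p MI, then record \p MI and advance the
// internal wait-state tracking.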
void GCNHazardRecognizer::runOnInstruction(MachineInstr *MI) {
  assert(IsHazardRecognizerMode);

  unsigned NumPreNoops = PreEmitNoops(MI);
  EmitNoops(NumPreNoops);
  if (MI->isInsideBundle())
    insertNoopsInBundle(MI, TII, NumPreNoops);
  else
    TII.insertNoops(*MI->getParent(), MachineBasicBlock::iterator(MI),
                    NumPreNoops);
  EmitInstruction(MI);
  AdvanceCycle();
}

unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
  IsHazardRecognizerMode = true;
  CurrCycleInstr = MI;
  unsigned W = PreEmitNoopsCommon(MI);
  fixHazards(MI);
  CurrCycleInstr = nullptr;
  return W;
}

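// Compute the number of wait states required before \p MI by taking the
// maximum over all hazard checks that apply to its instruction class.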
unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
  if (MI->isBundle())
    return 0;

  int WaitStates = 0;

  if (SIInstrInfo::isSMRD(*MI))
    return std::max(WaitStates, checkSMRDHazards(MI));

  if (ST.hasNSAtoVMEMBug())
    WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));

  WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));

  if (ST.hasNoDataDepHazard())
    return WaitStates;

  if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI))
    WaitStates = std::max(WaitStates, checkVMEMHazards(MI));

  if (SIInstrInfo::isVALU(*MI))
    WaitStates = std::max(WaitStates, checkVALUHazards(MI));

  if (SIInstrInfo::isDPP(*MI))
    WaitStates = std::max(WaitStates, checkDPPHazards(MI));

  if (isDivFMas(MI->getOpcode()))
    WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));

  if (isRWLane(MI->getOpcode()))
    WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));

  if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
       SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) ||
       SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0)
    WaitStates = std::max(WaitStates, checkMAIVALUHazards(MI));

  if (MI->isInlineAsm())
    return std::max(WaitStates, checkInlineAsmHazards(MI));

  if (isSGetReg(MI->getOpcode()))
    return std::max(WaitStates, checkGetRegHazards(MI));

  if (isSSetReg(MI->getOpcode()))
    return std::max(WaitStates, checkSetRegHazards(MI));

  if (isRFE(MI->getOpcode()))
    return std::max(WaitStates, checkRFEHazards(MI));

  if ((ST.hasReadM0MovRelInterpHazard() &&
       (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()) ||
        MI->getOpcode() == AMDGPU::DS_WRITE_ADDTID_B32 ||
        MI->getOpcode() == AMDGPU::DS_READ_ADDTID_B32)) ||
      (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI)) ||
      (ST.hasReadM0LdsDmaHazard() && isLdsDma(*MI)) ||
      (ST.hasReadM0LdsDirectHazard() && MI->readsRegister(AMDGPU::LDS_DIRECT)))
    return std::max(WaitStates, checkReadM0Hazards(MI));

  if (SIInstrInfo::isMAI(*MI))
    return std::max(WaitStates, checkMAIHazards(MI));

  if (SIInstrInfo::isVMEM(*MI) ||
      SIInstrInfo::isFLAT(*MI) ||
      SIInstrInfo::isDS(*MI))
    return std::max(WaitStates, checkMAILdStHazards(MI));

  return WaitStates;
}

void GCNHazardRecognizer::EmitNoop() {
  EmittedInstrs.push_front(nullptr);
}

void GCNHazardRecognizer::AdvanceCycle() {
  // When the scheduler detects a stall, it will call AdvanceCycle() without
  // emitting any instructions.
  if (!CurrCycleInstr) {
    EmittedInstrs.push_front(nullptr);
    return;
  }

  if (CurrCycleInstr->isBundle()) {
    processBundle();
    return;
  }

  unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
  if (!NumWaitStates) {
    CurrCycleInstr = nullptr;
    return;
  }

  // Keep track of emitted instructions
  EmittedInstrs.push_front(CurrCycleInstr);

  // Add a nullptr for each additional wait state after the first. Make sure
  // not to add more than getMaxLookAhead() items to the list, since we
  // truncate the list to that size right after this loop.
  for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
       i < e; ++i) {
    EmittedInstrs.push_front(nullptr);
  }

  // getMaxLookahead() is the largest number of wait states we will ever need
  // to insert, so there is no point in keeping track of more than that many
  // wait states.
  EmittedInstrs.resize(getMaxLookAhead());

  CurrCycleInstr = nullptr;
}

void GCNHazardRecognizer::RecedeCycle() {
  llvm_unreachable("hazard recognizer does not support bottom-up scheduling.");
}

//===----------------------------------------------------------------------===//
// Helper Functions
//===----------------------------------------------------------------------===//

typedef enum { HazardFound, HazardExpired, NoHazardFound } HazardFnResult;

typedef function_ref<bool(const MachineInstr &, int WaitStates)> IsExpiredFn;
typedef function_ref<unsigned int(const MachineInstr &)> GetNumWaitStatesFn;

// Search for a hazard in a block and its predecessors.
template <typename StateT>
static bool
hasHazard(StateT State,
          function_ref<HazardFnResult(StateT &, const MachineInstr &)> IsHazard,
          function_ref<void(StateT &, const MachineInstr &)> UpdateState,
          const MachineBasicBlock *MBB,
          MachineBasicBlock::const_reverse_instr_iterator I,
          DenseSet<const MachineBasicBlock *> &Visited) {
  for (auto E = MBB->instr_rend(); I != E; ++I) {
    // No need to look at parent BUNDLE instructions.
    if (I->isBundle())
      continue;

    switch (IsHazard(State, *I)) {
    case HazardFound:
      return true;
    case HazardExpired:
      return false;
    default:
      // Continue search
      break;
    }

    if (I->isInlineAsm() || I->isMetaInstruction())
      continue;

    UpdateState(State, *I);
  }

  for (MachineBasicBlock *Pred : MBB->predecessors()) {
    if (!Visited.insert(Pred).second)
      continue;

    if (hasHazard(State, IsHazard, UpdateState, Pred, Pred->instr_rbegin(),
                  Visited))
      return true;
  }

  return false;
}

// Returns the minimum number of wait states since \p I, walking all
// predecessors. Only scans until \p IsExpired returns true.
// Can only be run in hazard recognizer mode.
static int getWaitStatesSince(
    GCNHazardRecognizer::IsHazardFn IsHazard, const MachineBasicBlock *MBB,
    MachineBasicBlock::const_reverse_instr_iterator I, int WaitStates,
    IsExpiredFn IsExpired, DenseSet<const MachineBasicBlock *> &Visited,
    GetNumWaitStatesFn GetNumWaitStates = SIInstrInfo::getNumWaitStates) {
  for (auto E = MBB->instr_rend(); I != E; ++I) {
    // Don't add WaitStates for parent BUNDLE instructions.
    if (I->isBundle())
      continue;

    if (IsHazard(*I))
      return WaitStates;

    if (I->isInlineAsm())
      continue;

    WaitStates += GetNumWaitStates(*I);

    if (IsExpired(*I, WaitStates))
      return std::numeric_limits<int>::max();
  }

  int MinWaitStates = std::numeric_limits<int>::max();
  for (MachineBasicBlock *Pred : MBB->predecessors()) {
    if (!Visited.insert(Pred).second)
      continue;

    int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(), WaitStates,
                               IsExpired, Visited, GetNumWaitStates);

    MinWaitStates = std::min(MinWaitStates, W);
  }

  return MinWaitStates;
}

static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
                              const MachineInstr *MI, IsExpiredFn IsExpired) {
  DenseSet<const MachineBasicBlock *> Visited;
  return getWaitStatesSince(IsHazard, MI->getParent(),
                            std::next(MI->getReverseIterator()),
                            0, IsExpired, Visited);
}

int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
  if (IsHazardRecognizerMode) {
    auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) {
      return WaitStates >= Limit;
    };
    return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);
  }

  int WaitStates = 0;
  for (MachineInstr *MI : EmittedInstrs) {
    if (MI) {
      if (IsHazard(*MI))
        return WaitStates;

      if (MI->isInlineAsm())
        continue;
    }
    ++WaitStates;

    if (WaitStates >= Limit)
      break;
  }
  return std::numeric_limits<int>::max();
}

int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
                                               IsHazardFn IsHazardDef,
                                               int Limit) {
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsHazardFn = [IsHazardDef, TRI, Reg](const MachineInstr &MI) {
    return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI);
  };

  return getWaitStatesSince(IsHazardFn, Limit);
}

int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
                                                  int Limit) {
  auto IsHazardFn = [IsHazard](const MachineInstr &MI) {
    return isSSetReg(MI.getOpcode()) && IsHazard(MI);
  };

  return getWaitStatesSince(IsHazardFn, Limit);
}

//===----------------------------------------------------------------------===//
// No-op Hazard Detection
//===----------------------------------------------------------------------===//

static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV,
                        MCRegister Reg) {
  for (MCRegUnitIterator RUI(Reg, &TRI); RUI.isValid(); ++RUI)
    BV.set(*RUI);
}

static void addRegsToSet(const SIRegisterInfo &TRI,
                         iterator_range<MachineInstr::const_mop_iterator> Ops,
                         BitVector &Set) {
  for (const MachineOperand &Op : Ops) {
    if (Op.isReg())
      addRegUnits(TRI, Set, Op.getReg().asMCReg());
  }
}

void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
  // XXX: Do we need to worry about implicit operands
  addRegsToSet(TRI, MI.defs(), ClauseDefs);
  addRegsToSet(TRI, MI.uses(), ClauseUses);
}

static bool breaksSMEMSoftClause(MachineInstr *MI) {
  return !SIInstrInfo::isSMRD(*MI);
}

static bool breaksVMEMSoftClause(MachineInstr *MI) {
  return !SIInstrInfo::isVMEM(*MI) && !SIInstrInfo::isFLAT(*MI);
}

int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
  // SMEM soft clauses are only present on VI+, and only matter if xnack is
  // enabled.
  if (!ST.isXNACKEnabled())
    return 0;

  bool IsSMRD = TII.isSMRD(*MEM);

  resetClause();

  // A soft-clause is any group of consecutive SMEM instructions. The
  // instructions in this group may return out of order and/or may be
  // replayed (i.e. the same instruction issued more than once).
  //
  // In order to handle these situations correctly we need to make sure that
  // when a clause has more than one instruction, no instruction in the clause
  // writes to a register that is read by another instruction in the clause
  // (including itself). If we encounter this situation, we need to break the
  // clause by inserting a non-SMEM instruction.

  for (MachineInstr *MI : EmittedInstrs) {
    // When we hit a non-SMEM instruction then we have passed the start of the
    // clause and we can stop.
    if (!MI)
      break;

    if (IsSMRD ? breaksSMEMSoftClause(MI) : breaksVMEMSoftClause(MI))
      break;

    addClauseInst(*MI);
  }

  if (ClauseDefs.none())
    return 0;

  // We need to make sure not to put loads and stores in the same clause if
  // they use the same address. For now, just start a new clause whenever we
  // see a store.
  if (MEM->mayStore())
    return 1;

  addClauseInst(*MEM);

  // If the set of defs and uses intersect then we cannot add this instruction
  // to the clause, so we have a hazard.
  return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
}

int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
  int WaitStatesNeeded = 0;

  WaitStatesNeeded = checkSoftClauseHazards(SMRD);

  // This SMRD hazard only affects SI.
  if (!ST.hasSMRDReadVALUDefHazard())
    return WaitStatesNeeded;

  // A read of an SGPR by an SMRD instruction requires 4 wait states when the
  // SGPR was written by a VALU instruction.
  int SmrdSgprWaitStates = 4;
  auto IsHazardDefFn = [this](const MachineInstr &MI) {
    return TII.isVALU(MI);
  };
  auto IsBufferHazardDefFn = [this](const MachineInstr &MI) {
    return TII.isSALU(MI);
  };

  bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);

  for (const MachineOperand &Use : SMRD->uses()) {
    if (!Use.isReg())
      continue;
    int WaitStatesNeededForUse =
        SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
                                                   SmrdSgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    // This fixes what appears to be undocumented hardware behavior in SI where
    // s_mov writing a descriptor and s_buffer_load_dword reading the
    // descriptor needs some number of nops in between. We don't know how many
    // we need, but let's use 4. This wasn't discovered before probably because
    // the only case when this happens is when we expand a 64-bit pointer into
    // a full descriptor and use s_buffer_load_dword instead of s_load_dword,
    // which was probably never encountered in the closed-source land.
    if (IsBufferSMRD) {
      int WaitStatesNeededForUse =
          SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
                                                     IsBufferHazardDefFn,
                                                     SmrdSgprWaitStates);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
    }
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
  if (!ST.hasVMEMReadSGPRVALUDefHazard())
    return 0;

  int WaitStatesNeeded = checkSoftClauseHazards(VMEM);

  // A read of an SGPR by a VMEM instruction requires 5 wait states when the
  // SGPR was written by a VALU instruction.
  const int VmemSgprWaitStates = 5;
  auto IsHazardDefFn = [this](const MachineInstr &MI) {
    return TII.isVALU(MI);
  };
  for (const MachineOperand &Use : VMEM->uses()) {
    if (!Use.isReg() || TRI.isVectorRegister(MF.getRegInfo(), Use.getReg()))
      continue;

    int WaitStatesNeededForUse =
        VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
                                                   VmemSgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }
  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();

  // Check for DPP VGPR read after VALU VGPR write and EXEC write.
  int DppVgprWaitStates = 2;
  int DppExecWaitStates = 5;
  int WaitStatesNeeded = 0;
  auto IsHazardDefFn = [TII](const MachineInstr &MI) {
    return TII->isVALU(MI);
  };

  for (const MachineOperand &Use : DPP->uses()) {
    if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
      continue;
    int WaitStatesNeededForUse =
        DppVgprWaitStates - getWaitStatesSinceDef(
                                Use.getReg(),
                                [](const MachineInstr &) { return true; },
                                DppVgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  WaitStatesNeeded = std::max(
      WaitStatesNeeded,
      DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
                                                DppExecWaitStates));

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
  const SIInstrInfo *TII = ST.getInstrInfo();

  // v_div_fmas requires 4 wait states after a write to vcc from a VALU
  // instruction.
  const int DivFMasWaitStates = 4;
  auto IsHazardDefFn = [TII](const MachineInstr &MI) {
    return TII->isVALU(MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
                                               DivFMasWaitStates);

  return DivFMasWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);

  const int GetRegWaitStates = 2;
  auto IsHazardFn = [TII, GetRegHWReg](const MachineInstr &MI) {
    return GetRegHWReg == getHWReg(TII, MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);

  return GetRegWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned HWReg = getHWReg(TII, *SetRegInstr);

  const int SetRegWaitStates = ST.getSetRegWaitStates();
  auto IsHazardFn = [TII, HWReg](const MachineInstr &MI) {
    return HWReg == getHWReg(TII, MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
  return SetRegWaitStates - WaitStatesNeeded;
}

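// If \p MI is a store whose store-data operand is wide enough to be subject
// to the data-overwrite hazard (more than 64 bits of vdata), return the index
// of that data operand; otherwise return -1.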
int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
  if (!MI.mayStore())
    return -1;

  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned Opcode = MI.getOpcode();
  const MCInstrDesc &Desc = MI.getDesc();

  int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
  int VDataRCID = -1;
  if (VDataIdx != -1)
    VDataRCID = Desc.operands()[VDataIdx].RegClass;

  if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
    // There is no hazard if the instruction does not use vector regs
    // (like wbinvl1)
    if (VDataIdx == -1)
      return -1;
    // For MUBUF/MTBUF instructions this hazard only exists if the
    // instruction is not using a register in the soffset field.
    const MachineOperand *SOffset =
        TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
    // If we have no soffset operand, then assume this field has been
    // hardcoded to zero.
    if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
        (!SOffset || !SOffset->isReg()))
      return VDataIdx;
  }

  // MIMG instructions create a hazard if they don't use a 256-bit T# and
  // the store size is greater than 8 bytes and they have more than two bits
  // of their dmask set.
  // All our MIMG definitions use a 256-bit T#, so we can skip checking for
  // them.
  if (TII->isMIMG(MI)) {
    int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
    assert(SRsrcIdx != -1 &&
           AMDGPU::getRegBitWidth(Desc.operands()[SRsrcIdx].RegClass) == 256);
    (void)SRsrcIdx;
  }

  if (TII->isFLAT(MI)) {
    int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
    if (AMDGPU::getRegBitWidth(Desc.operands()[DataIdx].RegClass) > 64)
      return DataIdx;
  }

  return -1;
}

int
GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
                                            const MachineRegisterInfo &MRI) {
  // Helper to check for the hazard where VMEM instructions that store more
  // than 8 bytes can have their store data overwritten by the next
  // instruction.
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  const int VALUWaitStates = ST.hasGFX940Insts() ? 2 : 1;
  int WaitStatesNeeded = 0;

  if (!TRI->isVectorRegister(MRI, Def.getReg()))
    return WaitStatesNeeded;
  Register Reg = Def.getReg();
  auto IsHazardFn = [this, Reg, TRI](const MachineInstr &MI) {
    int DataIdx = createsVALUHazard(MI);
    return DataIdx >= 0 &&
           TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg);
  };
  int WaitStatesNeededForDef =
      VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

  return WaitStatesNeeded;
}

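// Check all VALU-related hazards for \p VALU: TRANS result forwarding,
// dst_sel/op_sel forwarding, VALU writes of SGPR/VCC/EXEC/lane-select
// operands, and the wide-store data-overwrite hazard.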
int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
  int WaitStatesNeeded = 0;

  if (ST.hasTransForwardingHazard() && !SIInstrInfo::isTRANS(*VALU)) {
    const int TransDefWaitstates = 1;

    auto IsTransDefFn = [this, VALU](const MachineInstr &MI) {
      if (!SIInstrInfo::isTRANS(MI))
        return false;
      const SIRegisterInfo *TRI = ST.getRegisterInfo();
      const SIInstrInfo *TII = ST.getInstrInfo();
      Register Def = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)->getReg();

      for (const MachineOperand &Use : VALU->explicit_uses()) {
        if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
          return true;
      }

      return false;
    };

    int WaitStatesNeededForDef =
        TransDefWaitstates -
        getWaitStatesSince(IsTransDefFn, TransDefWaitstates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
  }

  if (ST.hasDstSelForwardingHazard()) {
    const int Shift16DefWaitstates = 1;

    auto IsShift16BitDefFn = [this, VALU](const MachineInstr &MI) {
      if (!SIInstrInfo::isVALU(MI))
        return false;
      const SIInstrInfo *TII = ST.getInstrInfo();
      if (SIInstrInfo::isSDWA(MI)) {
        if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
          if (DstSel->getImm() == AMDGPU::SDWA::DWORD)
            return false;
      } else {
        if (!AMDGPU::hasNamedOperand(MI.getOpcode(), AMDGPU::OpName::op_sel) ||
            !(TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)
                  ->getImm() &
              SISrcMods::DST_OP_SEL))
          return false;
      }
      const SIRegisterInfo *TRI = ST.getRegisterInfo();
      if (auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
        Register Def = Dst->getReg();

        for (const MachineOperand &Use : VALU->explicit_uses()) {
          if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
            return true;
        }
      }

      return false;
    };

    int WaitStatesNeededForDef =
        Shift16DefWaitstates -
        getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
  }

  if (ST.hasVDecCoExecHazard()) {
    const int VALUWriteSGPRVALUReadWaitstates = 2;
    const int VALUWriteEXECRWLane = 4;
    const int VALUWriteVGPRReadlaneRead = 1;

    const SIRegisterInfo *TRI = ST.getRegisterInfo();
    const MachineRegisterInfo &MRI = MF.getRegInfo();
    Register UseReg;
    auto IsVALUDefSGPRFn = [&UseReg, TRI](const MachineInstr &MI) {
      if (!SIInstrInfo::isVALU(MI))
        return false;
      return MI.modifiesRegister(UseReg, TRI);
    };

    for (const MachineOperand &Use : VALU->explicit_uses()) {
      if (!Use.isReg())
        continue;

      UseReg = Use.getReg();
      if (TRI->isSGPRReg(MRI, UseReg)) {
        int WaitStatesNeededForDef =
            VALUWriteSGPRVALUReadWaitstates -
            getWaitStatesSince(IsVALUDefSGPRFn,
                               VALUWriteSGPRVALUReadWaitstates);
        WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
      }
    }

    if (VALU->readsRegister(AMDGPU::VCC, TRI)) {
      UseReg = AMDGPU::VCC;
      int WaitStatesNeededForDef =
          VALUWriteSGPRVALUReadWaitstates -
          getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteSGPRVALUReadWaitstates);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
    }

    switch (VALU->getOpcode()) {
    case AMDGPU::V_READLANE_B32:
    case AMDGPU::V_READFIRSTLANE_B32: {
      MachineOperand *Src = TII.getNamedOperand(*VALU, AMDGPU::OpName::src0);
      UseReg = Src->getReg();
      int WaitStatesNeededForDef =
          VALUWriteVGPRReadlaneRead -
          getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteVGPRReadlaneRead);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
    }
      [[fallthrough]];
    case AMDGPU::V_WRITELANE_B32: {
      UseReg = AMDGPU::EXEC;
      int WaitStatesNeededForDef =
          VALUWriteEXECRWLane -
          getWaitStatesSince(IsVALUDefSGPRFn, VALUWriteEXECRWLane);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
      break;
    }
    default:
      break;
    }
  }

  // This checks for the hazard where VMEM instructions that store more than
  // 8 bytes can have their store data overwritten by the next instruction.
  if (!ST.has12DWordStoreHazard())
    return WaitStatesNeeded;

  const MachineRegisterInfo &MRI = MF.getRegInfo();

  for (const MachineOperand &Def : VALU->defs()) {
    WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
  // This checks for hazards associated with inline asm statements.
  // Since inline asms can contain just about anything, we use this
  // to call/leverage other check*Hazard routines. Note that
  // this function doesn't attempt to address all possible inline asm
  // hazards (good luck), but is a collection of what has been
  // problematic thus far.

  // see checkVALUHazards()
  if (!ST.has12DWordStoreHazard())
    return 0;

  const MachineRegisterInfo &MRI = MF.getRegInfo();
  int WaitStatesNeeded = 0;

  for (unsigned I = InlineAsm::MIOp_FirstOperand, E = IA->getNumOperands();
       I != E; ++I) {
    const MachineOperand &Op = IA->getOperand(I);
    if (Op.isReg() && Op.isDef()) {
      WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
    }
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();

  const MachineOperand *LaneSelectOp =
      TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);

  if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
    return 0;

  Register LaneSelectReg = LaneSelectOp->getReg();
  auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVALU(MI); };

  const int RWLaneWaitStates = 4;
  int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
                                              RWLaneWaitStates);
  return RWLaneWaitStates - WaitStatesSince;
}

int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
  if (!ST.hasRFEHazards())
    return 0;

  const SIInstrInfo *TII = ST.getInstrInfo();

  const int RFEWaitStates = 1;

  auto IsHazardFn = [TII](const MachineInstr &MI) {
    return getHWReg(TII, MI) == AMDGPU::Hwreg::ID_TRAPSTS;
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
  return RFEWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const int ReadM0WaitStates = 1;
  auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); };
  return ReadM0WaitStates -
         getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn, ReadM0WaitStates);
}

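// Dispatch to the individual hazard fixups that are resolved by inserting new
// instructions (or rewriting operands) rather than by emitting wait states.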
void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
  fixVMEMtoScalarWriteHazards(MI);
  fixVcmpxPermlaneHazards(MI);
  fixSMEMtoVectorWriteHazards(MI);
  fixVcmpxExecWARHazard(MI);
  fixLdsBranchVmemWARHazard(MI);
  if (ST.hasLdsDirect()) {
    fixLdsDirectVALUHazard(MI);
    fixLdsDirectVMEMHazard(MI);
  }
  fixVALUPartialForwardingHazard(MI);
  fixVALUTransUseHazard(MI);
  fixWMMAHazards(MI);
  fixShift64HighRegBug(MI);
  fixVALUMaskWriteHazard(MI);
}

bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
  if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  auto IsHazardFn = [TII, TRI](const MachineInstr &MI) {
    return (TII->isVOPC(MI) ||
            ((TII->isVOP3(MI) || TII->isSDWA(MI)) && MI.isCompare())) &&
           MI.modifiesRegister(AMDGPU::EXEC, TRI);
  };

  auto IsExpiredFn = [](const MachineInstr &MI, int) {
    unsigned Opc = MI.getOpcode();
    return SIInstrInfo::isVALU(MI) && Opc != AMDGPU::V_NOP_e32 &&
           Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  // V_NOP will be discarded by SQ.
  // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
  // which is always a VGPR and available.
  auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
  Register Reg = Src0->getReg();
  bool IsUndef = Src0->isUndef();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::V_MOV_B32_e32))
      .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0))
      .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill);

  return true;
}

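// A scalar (SALU/SMEM) write to a register that an outstanding VMEM/DS/FLAT
// instruction still reads is a write-after-read hazard; resolve it by
// inserting an S_WAITCNT_DEPCTR before the scalar instruction.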
bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
  if (!ST.hasVMEMtoScalarWriteHazard())
    return false;

  if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI))
    return false;

  if (MI->getNumDefs() == 0)
    return false;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsHazardFn = [TRI, MI](const MachineInstr &I) {
    if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isDS(I) &&
        !SIInstrInfo::isFLAT(I))
      return false;

    for (const MachineOperand &Def : MI->defs()) {
      const MachineOperand *Op =
          I.findRegisterUseOperand(Def.getReg(), false, TRI);
      if (!Op)
        continue;
      return true;
    }
    return false;
  };

  auto IsExpiredFn = [](const MachineInstr &MI, int) {
    return SIInstrInfo::isVALU(MI) ||
           (MI.getOpcode() == AMDGPU::S_WAITCNT &&
            !MI.getOperand(0).getImm()) ||
           (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
            MI.getOperand(0).getImm() == 0xffe3);
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(0xffe3);
  return true;
}

bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
  if (!ST.hasSMEMtoVectorWriteHazard())
    return false;

  if (!SIInstrInfo::isVALU(*MI))
    return false;

  unsigned SDSTName;
  switch (MI->getOpcode()) {
  case AMDGPU::V_READLANE_B32:
  case AMDGPU::V_READFIRSTLANE_B32:
    SDSTName = AMDGPU::OpName::vdst;
    break;
  default:
    SDSTName = AMDGPU::OpName::sdst;
    break;
  }

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
  const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
  if (!SDST) {
    for (const auto &MO : MI->implicit_operands()) {
      if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg()))) {
        SDST = &MO;
        break;
      }
    }
  }

  if (!SDST)
    return false;

  const Register SDSTReg = SDST->getReg();
  auto IsHazardFn = [SDSTReg, TRI](const MachineInstr &I) {
    return SIInstrInfo::isSMRD(I) && I.readsRegister(SDSTReg, TRI);
  };

  auto IsExpiredFn = [TII, IV](const MachineInstr &MI, int) {
    if (TII->isSALU(MI)) {
      switch (MI.getOpcode()) {
      case AMDGPU::S_SETVSKIP:
      case AMDGPU::S_VERSION:
      case AMDGPU::S_WAITCNT_VSCNT:
      case AMDGPU::S_WAITCNT_VMCNT:
      case AMDGPU::S_WAITCNT_EXPCNT:
        // These instructions cannot mitigate the hazard.
        return false;
      case AMDGPU::S_WAITCNT_LGKMCNT:
        // Reducing lgkmcnt count to 0 always mitigates the hazard.
        return (MI.getOperand(1).getImm() == 0) &&
               (MI.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
      case AMDGPU::S_WAITCNT: {
        const int64_t Imm = MI.getOperand(0).getImm();
        AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
        return (Decoded.LgkmCnt == 0);
      }
      default:
        // SOPP instructions cannot mitigate the hazard.
        if (TII->isSOPP(MI))
          return false;
        // At this point the SALU can be assumed to mitigate the hazard
        // because either:
        // (a) it is independent of the at risk SMEM (breaking chain),
        // or
        // (b) it is dependent on the SMEM, in which case an appropriate
        //     s_waitcnt lgkmcnt _must_ exist between it and the at risk
        //     SMEM instruction.
        return true;
      }
    }
    return false;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
      .addImm(0);
  return true;
}

bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
  if (!ST.hasVcmpxExecWARHazard() || !SIInstrInfo::isVALU(*MI))
    return false;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
    return false;

  auto IsHazardFn = [TRI](const MachineInstr &I) {
    if (SIInstrInfo::isVALU(I))
      return false;
    return I.readsRegister(AMDGPU::EXEC, TRI);
  };

  const SIInstrInfo *TII = ST.getInstrInfo();
  auto IsExpiredFn = [TII, TRI](const MachineInstr &MI, int) {
    if (SIInstrInfo::isVALU(MI)) {
      if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst))
        return true;
      for (auto MO : MI.implicit_operands())
        if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegBaseClass(MO.getReg())))
          return true;
    }
    if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
        (MI.getOperand(0).getImm() & 0xfffe) == 0xfffe)
      return true;
    return false;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(0xfffe);
  return true;
}

static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
                                                 const GCNSubtarget &ST) {
  if (!ST.hasLdsBranchVmemWARHazard())
    return false;

  // Check if the necessary condition for the hazard is met: both LDS and VMEM
  // instructions need to appear in the same function.
  bool HasLds = false;
  bool HasVmem = false;
  for (auto &MBB : MF) {
    for (auto &MI : MBB) {
      HasLds |= SIInstrInfo::isDS(MI);
      HasVmem |=
          SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI);
      if (HasLds && HasVmem)
        return true;
    }
  }
  return false;
}

static bool isStoreCountWaitZero(const MachineInstr &I) {
  return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
         I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
         !I.getOperand(1).getImm();
}

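// On targets with the LDS-branch-VMEM WAR hazard, an LDS access and a VMEM
// access on opposite sides of a branch must be separated by an
// s_waitcnt_vscnt null, 0; insert one before \p MI if that pattern is found.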
bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
  if (!RunLdsBranchVmemWARHazardFixup)
    return false;

  assert(ST.hasLdsBranchVmemWARHazard());

  auto IsHazardInst = [](const MachineInstr &MI) {
    if (SIInstrInfo::isDS(MI))
      return 1;
    if (SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI))
      return 2;
    return 0;
  };

  auto InstType = IsHazardInst(*MI);
  if (!InstType)
    return false;

  auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) {
    return IsHazardInst(I) || isStoreCountWaitZero(I);
  };

  auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) {
    if (!I.isBranch())
      return false;

    auto IsHazardFn = [InstType, IsHazardInst](const MachineInstr &I) {
      auto InstType2 = IsHazardInst(I);
      return InstType2 && InstType != InstType2;
    };

    auto IsExpiredFn = [InstType, &IsHazardInst](const MachineInstr &I, int) {
      auto InstType2 = IsHazardInst(I);
      if (InstType == InstType2)
        return true;

      return isStoreCountWaitZero(I);
    };

    return ::getWaitStatesSince(IsHazardFn, &I, IsExpiredFn) !=
           std::numeric_limits<int>::max();
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_VSCNT))
      .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
      .addImm(0);

  return true;
}

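// LDS direct loads must not land in a VGPR that an in-flight VALU still reads
// or writes. Count the VALU instructions since the last such access and store
// the distance (clamped to 15) in the LDSDIR instruction's waitvdst operand.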
fixLdsDirectVALUHazard(MachineInstr * MI)1396 bool GCNHazardRecognizer::fixLdsDirectVALUHazard(MachineInstr *MI) {
1397 if (!SIInstrInfo::isLDSDIR(*MI))
1398 return false;
1399
1400 const int NoHazardWaitStates = 15;
1401 const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
1402 const Register VDSTReg = VDST->getReg();
1403
1404 bool VisitedTrans = false;
1405 auto IsHazardFn = [this, VDSTReg, &VisitedTrans](const MachineInstr &I) {
1406 if (!SIInstrInfo::isVALU(I))
1407 return false;
1408 VisitedTrans = VisitedTrans || SIInstrInfo::isTRANS(I);
1409 // Cover both WAR and WAW
1410 return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
1411 };
1412 auto IsExpiredFn = [&](const MachineInstr &I, int WaitStates) {
1413 if (WaitStates >= NoHazardWaitStates)
1414 return true;
1415 // Instructions which cause va_vdst==0 expire hazard
1416 return SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
1417 SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I);
1418 };
1419 auto GetWaitStatesFn = [](const MachineInstr &MI) {
1420 return SIInstrInfo::isVALU(MI) ? 1 : 0;
1421 };
1422
1423 DenseSet<const MachineBasicBlock *> Visited;
1424 auto Count = ::getWaitStatesSince(IsHazardFn, MI->getParent(),
1425 std::next(MI->getReverseIterator()), 0,
1426 IsExpiredFn, Visited, GetWaitStatesFn);
1427
1428 // Transcendentals can execute in parallel to other VALUs.
1429 // This makes va_vdst count unusable with a mixture of VALU and TRANS.
1430 if (VisitedTrans)
1431 Count = 0;
1432
1433 MachineOperand *WaitVdstOp =
1434 TII.getNamedOperand(*MI, AMDGPU::OpName::waitvdst);
1435 WaitVdstOp->setImm(std::min(Count, NoHazardWaitStates));
1436
1437 return true;
1438 }
1439
fixLdsDirectVMEMHazard(MachineInstr * MI)1440 bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) {
1441 if (!SIInstrInfo::isLDSDIR(*MI))
1442 return false;
1443
1444 const MachineOperand *VDST = TII.getNamedOperand(*MI, AMDGPU::OpName::vdst);
1445 const Register VDSTReg = VDST->getReg();
1446
1447 auto IsHazardFn = [this, VDSTReg](const MachineInstr &I) {
1448 if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I) &&
1449 !SIInstrInfo::isDS(I))
1450 return false;
1451 return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
1452 };
1453 auto IsExpiredFn = [](const MachineInstr &I, int) {
1454 return SIInstrInfo::isVALU(I) || SIInstrInfo::isEXP(I) ||
1455 (I.getOpcode() == AMDGPU::S_WAITCNT && !I.getOperand(0).getImm()) ||
1456 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1457 I.getOperand(0).getImm() == 0xffe3);
1458 };
1459
1460 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1461 std::numeric_limits<int>::max())
1462 return false;
1463
1464 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1465 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1466 .addImm(0xffe3);
1467
1468 return true;
1469 }
1470
fixVALUPartialForwardingHazard(MachineInstr * MI)1471 bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) {
1472 if (!ST.isWave64())
1473 return false;
1474 if (!ST.hasVALUPartialForwardingHazard())
1475 return false;
1476 if (!SIInstrInfo::isVALU(*MI))
1477 return false;
1478
1479 SmallSetVector<Register, 4> SrcVGPRs;
1480
1481 for (const MachineOperand &Use : MI->explicit_uses()) {
1482 if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1483 SrcVGPRs.insert(Use.getReg());
1484 }
1485
1486 // Only applies with >= 2 unique VGPR sources
1487 if (SrcVGPRs.size() <= 1)
1488 return false;
1489
1490 // Look for the following pattern:
1491 // Va <- VALU [PreExecPos]
1492 // intv1
1493 // Exec <- SALU [ExecPos]
1494 // intv2
1495 // Vb <- VALU [PostExecPos]
1496 // intv3
1497 // MI Va, Vb (WaitState = 0)
1498 //
1499 // Where:
1500 // intv1 + intv2 <= 2 VALUs
1501 // intv3 <= 4 VALUs
1502 //
1503 // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
1504
1505 const int Intv1plus2MaxVALUs = 2;
1506 const int Intv3MaxVALUs = 4;
1507 const int IntvMaxVALUs = 6;
1508 const int NoHazardVALUWaitStates = IntvMaxVALUs + 2;
1509
1510 struct StateType {
1511 SmallDenseMap<Register, int, 4> DefPos;
1512 int ExecPos = std::numeric_limits<int>::max();
1513 int VALUs = 0;
1514 };
1515
1516 StateType State;
1517
1518 // This overloads expiry testing with all the hazard detection
1519 auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
1520 // Too many VALU states have passed
1521 if (State.VALUs > NoHazardVALUWaitStates)
1522 return HazardExpired;
1523
1524 // Instructions which cause va_vdst==0 expire hazard
1525 if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
1526 SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) ||
1527 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1528 I.getOperand(0).getImm() == 0x0fff))
1529 return HazardExpired;
1530
1531 // Track registers writes
1532 bool Changed = false;
1533 if (SIInstrInfo::isVALU(I)) {
1534 for (Register Src : SrcVGPRs) {
1535 if (!State.DefPos.count(Src) && I.modifiesRegister(Src, &TRI)) {
1536 State.DefPos[Src] = State.VALUs;
1537 Changed = true;
1538 }
1539 }
1540 } else if (SIInstrInfo::isSALU(I)) {
1541 if (State.ExecPos == std::numeric_limits<int>::max()) {
1542 if (!State.DefPos.empty() && I.modifiesRegister(AMDGPU::EXEC, &TRI)) {
1543 State.ExecPos = State.VALUs;
1544 Changed = true;
1545 }
1546 }
1547 }
1548
1549 // Early expiration: too many VALUs in intv3
1550 if (State.VALUs > Intv3MaxVALUs && State.DefPos.empty())
1551 return HazardExpired;
1552
1553 // Only evaluate state if something changed
1554 if (!Changed)
1555 return NoHazardFound;
1556
1557 // Determine positions of VALUs pre/post exec change
1558 if (State.ExecPos == std::numeric_limits<int>::max())
1559 return NoHazardFound;
1560
1561 int PreExecPos = std::numeric_limits<int>::max();
1562 int PostExecPos = std::numeric_limits<int>::max();
1563
1564 for (auto Entry : State.DefPos) {
1565 int DefVALUs = Entry.second;
1566 if (DefVALUs != std::numeric_limits<int>::max()) {
1567 if (DefVALUs >= State.ExecPos)
1568 PreExecPos = std::min(PreExecPos, DefVALUs);
1569 else if (DefVALUs < State.ExecPos)
1570 PostExecPos = std::min(PostExecPos, DefVALUs);
1571 }
1572 }
1573
1574 // Need a VALUs post exec change
1575 if (PostExecPos == std::numeric_limits<int>::max())
1576 return NoHazardFound;
1577
1578 // Too many VALUs in intv3?
1579 int Intv3VALUs = PostExecPos;
1580 if (Intv3VALUs > Intv3MaxVALUs)
1581 return HazardExpired;
1582
1583 // Too many VALUs in intv2?
1584 int Intv2VALUs = (State.ExecPos - PostExecPos) - 1;
1585 if (Intv2VALUs > Intv1plus2MaxVALUs)
1586 return HazardExpired;
1587
1588 // Need a VALUs pre exec change
1589 if (PreExecPos == std::numeric_limits<int>::max())
1590 return NoHazardFound;
1591
1592 // Too many VALUs in intv1?
1593 int Intv1VALUs = PreExecPos - State.ExecPos;
1594 if (Intv1VALUs > Intv1plus2MaxVALUs)
1595 return HazardExpired;
1596
1597 // Too many VALUs in intv1 + intv2
1598 if (Intv1VALUs + Intv2VALUs > Intv1plus2MaxVALUs)
1599 return HazardExpired;
1600
1601 return HazardFound;
1602 };
1603 auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
1604 if (SIInstrInfo::isVALU(MI))
1605 State.VALUs += 1;
1606 };
1607
1608 DenseSet<const MachineBasicBlock *> Visited;
1609 if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
1610 std::next(MI->getReverseIterator()), Visited))
1611 return false;
1612
1613 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1614 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1615 .addImm(0x0fff);
1616
1617 return true;
1618 }
1619
fixVALUTransUseHazard(MachineInstr * MI)1620 bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
1621 if (!ST.hasVALUTransUseHazard())
1622 return false;
1623 if (!SIInstrInfo::isVALU(*MI))
1624 return false;
1625
1626 SmallSet<Register, 4> SrcVGPRs;
1627
1628 for (const MachineOperand &Use : MI->explicit_uses()) {
1629 if (Use.isReg() && TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1630 SrcVGPRs.insert(Use.getReg());
1631 }
1632
1633 // Look for the following pattern:
1634 // Va <- TRANS VALU
1635 // intv
1636 // MI Va (WaitState = 0)
1637 //
1638 // Where:
1639 // intv <= 5 VALUs / 1 TRANS
1640 //
1641 // If found, insert an appropriate S_WAITCNT_DEPCTR before MI.
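  // A minimal sketch of the hazard (hypothetical registers):
  //   v_exp_f32 v1, v0      ; TRANS VALU writes v1
  //   v_add_f32 v2, v1, v3  ; MI reads v1 within the interval above -> hazard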
1642
1643 const int IntvMaxVALUs = 5;
1644 const int IntvMaxTRANS = 1;
1645
1646 struct StateType {
1647 int VALUs = 0;
1648 int TRANS = 0;
1649 };
1650
1651 StateType State;
1652
1653   // This lambda performs both hazard detection and expiry testing.
1654 auto IsHazardFn = [&, this](StateType &State, const MachineInstr &I) {
1655 // Too many VALU states have passed
1656 if (State.VALUs > IntvMaxVALUs || State.TRANS > IntvMaxTRANS)
1657 return HazardExpired;
1658
1659   // Instructions which cause va_vdst==0 expire the hazard.
1660 if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
1661 SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) ||
1662 (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1663 I.getOperand(0).getImm() == 0x0fff))
1664 return HazardExpired;
1665
1666   // Hazard if a TRANS instruction writes one of MI's source VGPRs.
1667 if (SIInstrInfo::isTRANS(I)) {
1668 for (Register Src : SrcVGPRs) {
1669 if (I.modifiesRegister(Src, &TRI)) {
1670 return HazardFound;
1671 }
1672 }
1673 }
1674
1675 return NoHazardFound;
1676 };
1677 auto UpdateStateFn = [](StateType &State, const MachineInstr &MI) {
1678 if (SIInstrInfo::isVALU(MI))
1679 State.VALUs += 1;
1680 if (SIInstrInfo::isTRANS(MI))
1681 State.TRANS += 1;
1682 };
1683
1684 DenseSet<const MachineBasicBlock *> Visited;
1685 if (!hasHazard<StateType>(State, IsHazardFn, UpdateStateFn, MI->getParent(),
1686 std::next(MI->getReverseIterator()), Visited))
1687 return false;
1688
1689 // Hazard is observed - insert a wait on the va_vdst counter to ensure the
1690 // hazard is avoided (mask 0x0fff achieves this).
1691 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1692 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
1693 .addImm(0x0fff);
1694
1695 return true;
1696 }
1697
1698 bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
1699 if (!SIInstrInfo::isWMMA(*MI))
1700 return false;
1701
1702 const SIInstrInfo *TII = ST.getInstrInfo();
1703 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1704
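  // The hazard arises when a WMMA consumes a result produced by the
  // immediately preceding WMMA, e.g. (a sketch, hypothetical registers):
  //   v_wmma_f32_16x16x16_f16 v[0:7], v[8:15], v[16:23], v[0:7]
  //   v_wmma_f32_16x16x16_f16 v[24:31], v[0:7], v[16:23], v[24:31] ; reads v[0:7]
  // Any intervening VALU (such as the inserted V_NOP) resolves it.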
1705 auto IsHazardFn = [MI, TII, TRI](const MachineInstr &I) {
1706 if (!SIInstrInfo::isWMMA(I))
1707 return false;
1708
1709 // Src0 or Src1 of the current wmma instruction overlaps with the dest of
1710 // the previous wmma.
1711 const Register CurSrc0Reg =
1712 TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg();
1713 const Register CurSrc1Reg =
1714 TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();
1715
1716 const Register PrevDstReg =
1717 TII->getNamedOperand(I, AMDGPU::OpName::vdst)->getReg();
1718
1719 if (TRI->regsOverlap(PrevDstReg, CurSrc0Reg) ||
1720 TRI->regsOverlap(PrevDstReg, CurSrc1Reg)) {
1721 return true;
1722 }
1723
1724 // Src2 of the current wmma instruction overlaps with the dest of the
1725 // previous wmma.
1726 const MachineOperand *Src2 =
1727 TII->getNamedOperand(*MI, AMDGPU::OpName::src2);
1728 const Register CurSrc2Reg = Src2->isReg() ? Src2->getReg() : Register();
1729
1730 if (CurSrc2Reg != AMDGPU::NoRegister &&
1731 TRI->regsOverlap(PrevDstReg, CurSrc2Reg)) {
1732
1733 const MachineOperand *Src2Mods =
1734 TII->getNamedOperand(*MI, AMDGPU::OpName::src2_modifiers);
1735 const bool NoSrc2Mods =
1736 (Src2Mods->getImm() & (SISrcMods::NEG | SISrcMods::NEG_HI)) == 0;
1737 // Exception: there is no hazard if the wmma instructions are of the same
1738 // type and there is no input modifier on src2 of the current instruction.
1739 return !(NoSrc2Mods && (TII->pseudoToMCOpcode(I.getOpcode()) ==
1740 TII->pseudoToMCOpcode(MI->getOpcode())));
1741 }
1742
1743 return false;
1744 };
1745
1746 auto IsExpiredFn = [](const MachineInstr &I, int) {
1747 return SIInstrInfo::isVALU(I);
1748 };
1749
1750 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1751 std::numeric_limits<int>::max())
1752 return false;
1753
1754 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
1755
1756 return true;
1757 }
1758
1759 bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
1760 if (!ST.hasShift64HighRegBug())
1761 return false;
1762
1763 switch (MI->getOpcode()) {
1764 default:
1765 return false;
1766 case AMDGPU::V_LSHLREV_B64_e64:
1767 case AMDGPU::V_LSHRREV_B64_e64:
1768 case AMDGPU::V_ASHRREV_I64_e64:
1769 break;
1770 }
1771
1772 MachineOperand *Amt = TII.getNamedOperand(*MI, AMDGPU::OpName::src0);
1773 if (!Amt->isReg())
1774 return false;
1775
1776 Register AmtReg = Amt->getReg();
1777 const MachineRegisterInfo &MRI = MF.getRegInfo();
1778   // Check if this is the last VGPR in its allocation block.
1779 if (!TRI.isVGPR(MRI, AmtReg) || ((AmtReg - AMDGPU::VGPR0) & 7) != 7)
1780 return false;
1781
1782 if (AmtReg != AMDGPU::VGPR255 && MRI.isPhysRegUsed(AmtReg + 1))
1783 return false;
1784
1785 MachineOperand *Src1 = TII.getNamedOperand(*MI, AMDGPU::OpName::src1);
1786 bool OverlappedSrc = Src1->isReg() && TRI.regsOverlap(Src1->getReg(), AmtReg);
1787 bool OverlappedDst = MI->modifiesRegister(AmtReg, &TRI);
1788 bool Overlapped = OverlappedSrc || OverlappedDst;
1789
1790 assert(!OverlappedDst || !OverlappedSrc ||
1791 Src1->getReg() == MI->getOperand(0).getReg());
1792 assert(ST.needsAlignedVGPRs());
1793 static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1);
1794
1795 Register NewReg;
1796 for (MCRegister Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass
1797 : AMDGPU::VGPR_32RegClass) {
1798 if (!MI->modifiesRegister(Reg, &TRI) && !MI->readsRegister(Reg, &TRI)) {
1799 NewReg = Reg;
1800 break;
1801 }
1802 }
1803
1804 Register NewAmt = Overlapped ? (Register)TRI.getSubReg(NewReg, AMDGPU::sub1)
1805 : NewReg;
1806 Register NewAmtLo;
1807
1808 if (Overlapped)
1809 NewAmtLo = TRI.getSubReg(NewReg, AMDGPU::sub0);
1810
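  // The workaround: temporarily V_SWAP the shift amount (and, in the
  // overlapped case, the full 64-bit pair) into the free register found above,
  // execute the shift on the substitute, then swap the original values back.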
1811 DebugLoc DL = MI->getDebugLoc();
1812 MachineBasicBlock *MBB = MI->getParent();
1813   // Insert a full wait count because the found register might be pending a wait.
1814 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_WAITCNT))
1815 .addImm(0);
1816
1817 // Insert V_SWAP_B32 instruction(s) and run hazard recognizer on them.
1818 if (Overlapped)
1819 runOnInstruction(
1820 BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmtLo)
1821 .addDef(AmtReg - 1)
1822 .addReg(AmtReg - 1, RegState::Undef)
1823 .addReg(NewAmtLo, RegState::Undef));
1824 runOnInstruction(BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmt)
1825 .addDef(AmtReg)
1826 .addReg(AmtReg, RegState::Undef)
1827 .addReg(NewAmt, RegState::Undef));
1828
1829 // Instructions emitted after the current instruction will be processed by the
1830 // parent loop of the hazard recognizer in a natural way.
1831 BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
1832 AmtReg)
1833 .addDef(NewAmt)
1834 .addReg(NewAmt)
1835 .addReg(AmtReg);
1836 if (Overlapped)
1837 BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
1838 AmtReg - 1)
1839 .addDef(NewAmtLo)
1840 .addReg(NewAmtLo)
1841 .addReg(AmtReg - 1);
1842
1843 // Re-running the hazard recognizer on the modified instruction is not needed:
1844 // the inserted V_SWAP_B32s have already both read and written the new
1845 // registers, so hazards related to those registers have already been handled.
1846 Amt->setReg(NewAmt);
1847 Amt->setIsKill(false);
1848   // We do not update liveness, so the verifier may see it as undef.
1849 Amt->setIsUndef();
1850 if (OverlappedDst)
1851 MI->getOperand(0).setReg(NewReg);
1852 if (OverlappedSrc) {
1853 Src1->setReg(NewReg);
1854 Src1->setIsKill(false);
1855 Src1->setIsUndef();
1856 }
1857
1858 return true;
1859 }
1860
1861 int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
1862 int NSAtoVMEMWaitStates = 1;
1863
1864 if (!ST.hasNSAtoVMEMBug())
1865 return 0;
1866
1867 if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isMTBUF(*MI))
1868 return 0;
1869
1870 const SIInstrInfo *TII = ST.getInstrInfo();
1871 const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
1872 if (!Offset || (Offset->getImm() & 6) == 0)
1873 return 0;
1874
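  // One wait state is needed between an NSA-encoded MIMG instruction (whose
  // encoding is 16 bytes or larger) and a MUBUF/MTBUF whose immediate offset
  // has bit 1 or bit 2 set.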
1875 auto IsHazardFn = [TII](const MachineInstr &I) {
1876 if (!SIInstrInfo::isMIMG(I))
1877 return false;
1878 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I.getOpcode());
1879 return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
1880 TII->getInstSizeInBytes(I) >= 16;
1881 };
1882
1883 return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
1884 }
1885
1886 int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
1887 int FPAtomicToDenormModeWaitStates = 3;
1888
1889 if (!ST.hasFPAtomicToDenormModeHazard())
1890 return 0;
1891
1892 if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
1893 return 0;
1894
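  // S_DENORM_MODE needs up to 3 wait states after an FP atomic VMEM/FLAT
  // instruction; an intervening VALU or explicit wait also clears the hazard.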
1895 auto IsHazardFn = [](const MachineInstr &I) {
1896 if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I))
1897 return false;
1898 return SIInstrInfo::isFPAtomic(I);
1899 };
1900
1901 auto IsExpiredFn = [](const MachineInstr &MI, int WaitStates) {
1902 if (WaitStates >= 3 || SIInstrInfo::isVALU(MI))
1903 return true;
1904
1905 switch (MI.getOpcode()) {
1906 case AMDGPU::S_WAITCNT:
1907 case AMDGPU::S_WAITCNT_VSCNT:
1908 case AMDGPU::S_WAITCNT_VMCNT:
1909 case AMDGPU::S_WAITCNT_EXPCNT:
1910 case AMDGPU::S_WAITCNT_LGKMCNT:
1911 case AMDGPU::S_WAIT_IDLE:
1912 return true;
1913 default:
1914 break;
1915 }
1916
1917 return false;
1918 };
1919
1920 return FPAtomicToDenormModeWaitStates -
1921 ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
1922 }
1923
1924 int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
1925 assert(SIInstrInfo::isMAI(*MI));
1926
1927 return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI);
1928 }
1929
1930 int GCNHazardRecognizer::checkMFMAPadding(MachineInstr *MI) {
1931 // Early exit if no padding is requested.
1932 if (MFMAPaddingRatio == 0)
1933 return 0;
1934
1935 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1936 if (!SIInstrInfo::isMFMA(*MI) || MFI->getOccupancy() < 2)
1937 return 0;
1938
1939 int NeighborMFMALatency = 0;
1940 auto IsNeighboringMFMA = [&NeighborMFMALatency,
1941 this](const MachineInstr &MI) {
1942 if (!SIInstrInfo::isMFMA(MI))
1943 return false;
1944
1945 NeighborMFMALatency = this->getMFMAPipelineWaitStates(MI);
1946 return true;
1947 };
1948
1949 const int MaxMFMAPipelineWaitStates = 16;
1950 int WaitStatesSinceNeighborMFMA =
1951 getWaitStatesSince(IsNeighboringMFMA, MaxMFMAPipelineWaitStates);
1952
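  // E.g. with a padding ratio of 50 and a neighboring MFMA of latency 16, pad
  // up to 16 * 50 / 100 = 8 wait states, minus any that have already elapsed
  // since that MFMA.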
1953 int NeighborMFMAPaddingNeeded =
1954 (NeighborMFMALatency * MFMAPaddingRatio / 100) -
1955 WaitStatesSinceNeighborMFMA;
1956
1957 return std::max(0, NeighborMFMAPaddingNeeded);
1958 }
1959
1960 int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) {
1961 int WaitStatesNeeded = 0;
1962 unsigned Opc = MI->getOpcode();
1963
1964 auto IsVALUFn = [](const MachineInstr &MI) {
1965 return SIInstrInfo::isVALU(MI) || MI.isInlineAsm();
1966 };
1967
1968 if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) { // MFMA or v_accvgpr_write
1969 const int LegacyVALUWritesVGPRWaitStates = 2;
1970 const int VALUWritesExecWaitStates = 4;
1971 const int MaxWaitStates = 4;
1972
1973 int WaitStatesNeededForUse = VALUWritesExecWaitStates -
1974 getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
1975 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1976
1977 if (WaitStatesNeeded < MaxWaitStates) {
1978 for (const MachineOperand &Use : MI->explicit_uses()) {
1979 const int MaxWaitStates = 2;
1980
1981 if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1982 continue;
1983
1984 int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
1985 getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
1986 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1987
1988 if (WaitStatesNeeded == MaxWaitStates)
1989 break;
1990 }
1991 }
1992 }
1993
1994 for (const MachineOperand &Op : MI->explicit_operands()) {
1995 if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
1996 continue;
1997
1998 if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
1999 continue;
2000
2001 const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
2002 const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
2003 const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
2004 const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
2005 const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
2006 const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
2007 const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
2008 const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
2009 const int MaxWaitStates = 18;
2010 Register Reg = Op.getReg();
2011 unsigned HazardDefLatency = 0;
2012
2013 auto IsOverlappedMFMAFn = [Reg, &HazardDefLatency,
2014 this](const MachineInstr &MI) {
2015 if (!SIInstrInfo::isMFMA(MI))
2016 return false;
2017 Register DstReg = MI.getOperand(0).getReg();
2018 if (DstReg == Reg)
2019 return false;
2020 HazardDefLatency =
2021 std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
2022 return TRI.regsOverlap(DstReg, Reg);
2023 };
2024
2025 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
2026 MaxWaitStates);
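    // The defining MFMA's pipeline latency identifies its shape (2 cycles for
    // the 4x4, 8 for the 16x16, 16 for the 32x32 variants), which selects the
    // required wait count below.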
2027 int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
2028 int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
2029 int OpNo = MI->getOperandNo(&Op);
2030 if (OpNo == SrcCIdx) {
2031 NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
2032 } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
2033 switch (HazardDefLatency) {
2034 case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
2035 break;
2036 case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
2037 break;
2038 case 16: [[fallthrough]];
2039 default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
2040 break;
2041 }
2042 } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2043 switch (HazardDefLatency) {
2044 case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
2045 break;
2046 case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
2047 break;
2048 case 16: [[fallthrough]];
2049 default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
2050 break;
2051 }
2052 }
2053
2054 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2055 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2056
2057 if (WaitStatesNeeded == MaxWaitStates)
2058 return WaitStatesNeeded; // Early exit.
2059
2060 auto IsAccVgprWriteFn = [Reg, this](const MachineInstr &MI) {
2061 if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2062 return false;
2063 Register DstReg = MI.getOperand(0).getReg();
2064 return TRI.regsOverlap(Reg, DstReg);
2065 };
2066
2067 const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
2068 const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
2069 const int AccVGPRWriteAccVgprReadWaitStates = 3;
2070 NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
2071 if (OpNo == SrcCIdx)
2072 NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
2073 else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
2074 NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;
2075
2076 WaitStatesNeededForUse = NeedWaitStates -
2077 getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
2078 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2079
2080 if (WaitStatesNeeded == MaxWaitStates)
2081 return WaitStatesNeeded; // Early exit.
2082 }
2083
2084 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
2085 const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
2086 const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
2087 const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
2088 const int MaxWaitStates = 13;
2089 Register DstReg = MI->getOperand(0).getReg();
2090 unsigned HazardDefLatency = 0;
2091
2092 auto IsSrcCMFMAFn = [DstReg, &HazardDefLatency,
2093 this](const MachineInstr &MI) {
2094 if (!SIInstrInfo::isMFMA(MI))
2095 return false;
2096 Register Reg = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
2097 HazardDefLatency =
2098 std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
2099 return TRI.regsOverlap(Reg, DstReg);
2100 };
2101
2102 int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
2103 int NeedWaitStates;
2104 switch (HazardDefLatency) {
2105 case 2: NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
2106 break;
2107 case 8: NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
2108 break;
2109 case 16: [[fallthrough]];
2110 default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
2111 break;
2112 }
2113
2114 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
2115 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2116 }
2117
2118 // Pad neighboring MFMA with noops for better inter-wave performance.
2119 WaitStatesNeeded = std::max(WaitStatesNeeded, checkMFMAPadding(MI));
2120
2121 return WaitStatesNeeded;
2122 }
2123
2124 int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
2125 int WaitStatesNeeded = 0;
2126 unsigned Opc = MI->getOpcode();
2127
2128 auto IsLegacyVALUFn = [](const MachineInstr &MI) {
2129 return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI);
2130 };
2131
2132 auto IsLegacyVALUNotDotFn = [](const MachineInstr &MI) {
2133 return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMFMA(MI) &&
2134 !SIInstrInfo::isDOT(MI);
2135 };
2136
2137 if (!SIInstrInfo::isMFMA(*MI))
2138 return WaitStatesNeeded;
2139
2140 const int VALUWritesExecWaitStates = 4;
2141 int WaitStatesNeededForUse = VALUWritesExecWaitStates -
2142 getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn,
2143 VALUWritesExecWaitStates);
2144 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2145
2146 int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
2147
2148   // Loop over the uses; this handles both DGEMM and S/HGEMM as the second
  // instruction of the hazard pair.
2149 for (const MachineOperand &Use : MI->explicit_uses()) {
2150 const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
2151 const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
2152 const int GFX940_XDL2PassWritesVGPROverlappedSMFMASrcCWaitStates = 3;
2153 const int GFX940_XDL4PassWritesVGPROverlappedSMFMASrcCWaitStates = 5;
2154 const int GFX940_SMFMA4PassWritesVGPROverlappedSMFMASrcCWaitStates = 4;
2155 const int GFX940_XDL8PassWritesVGPROverlappedSMFMASrcCWaitStates = 9;
2156 const int GFX940_SMFMA8PassWritesVGPROverlappedSMFMASrcCWaitStates = 8;
2157 const int GFX940_XDL16PassWritesVGPROverlappedSMFMASrcCWaitStates = 17;
2158 const int GFX940_SMFMA16PassWritesVGPROverlappedSMFMASrcCWaitStates = 16;
2159 const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
2160 const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
2161 const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
2162 const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
2163 const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
2164 const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
2165 const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
2166 const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
2167 const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
2168 const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
2169 const int GFX940_SMFMA2PassWritesVGPROverlappedSrcABWaitStates = 4;
2170 const int GFX940_SMFMA4PassWritesVGPROverlappedSrcABWaitStates = 6;
2171 const int GFX940_SMFMA8PassWritesVGPROverlappedSrcABWaitStates = 10;
2172 const int GFX940_SMFMA16PassWritesVGPROverlappedSrcABWaitStates = 18;
2173 const int GFX940_XDL2PassWritesVGPROverlappedSrcABWaitStates = 5;
2174 const int GFX940_XDL4PassWritesVGPROverlappedSrcABWaitStates = 7;
2175 const int GFX940_XDL8PassWritesVGPROverlappedSrcABWaitStates = 11;
2176 const int GFX940_XDL16PassWritesVGPROverlappedSrcABWaitStates = 19;
2177 const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
2178 const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
2179 const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
2180 const int GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates = 2;
2181 const int MaxWaitStates = 19;
2182
2183 if (!Use.isReg())
2184 continue;
2185 Register Reg = Use.getReg();
2186 bool FullReg;
2187 const MachineInstr *MI1;
2188
2189 auto IsOverlappedMFMAFn = [Reg, &FullReg, &MI1,
2190 this](const MachineInstr &MI) {
2191 if (!SIInstrInfo::isMFMA(MI))
2192 return false;
2193 Register DstReg = MI.getOperand(0).getReg();
2194 FullReg = (DstReg == Reg);
2195 MI1 = &MI;
2196 return TRI.regsOverlap(DstReg, Reg);
2197 };
2198
2199 WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
2200 getWaitStatesSinceDef(Reg, IsLegacyVALUNotDotFn, MaxWaitStates);
2201 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2202
2203 int NumWaitStates =
2204 getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn, MaxWaitStates);
2205 if (NumWaitStates == std::numeric_limits<int>::max())
2206 continue;
2207
2208 int OpNo = MI->getOperandNo(&Use);
2209 unsigned Opc1 = MI1->getOpcode();
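    // MI1 is the most recent MFMA whose dst overlaps this use. The required
    // separation depends on which operand is read (SrcC vs SrcA/B), whether
    // either instruction is a DGEMM or XDL op, and MI1's pass count (latency).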
2210 int NeedWaitStates = 0;
2211 if (OpNo == SrcCIdx) {
2212 if (!isDGEMM(Opc) && (!ST.hasGFX940Insts() && isDGEMM(Opc1))) {
2213 NeedWaitStates = 0;
2214 } else if (FullReg) {
2215 if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2216 Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
2217 (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
2218 Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
2219 NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
2220 else if (ST.hasGFX940Insts() &&
2221 TSchedModel.computeInstrLatency(MI1) == 2)
2222 NeedWaitStates = GFX940_SMFMA4x4WritesVGPRFullSrcCWaitStates;
2223 } else {
2224 switch (Opc1) {
2225 case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2226 case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2227 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2228 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2229 if (!isXDL(ST, *MI))
2230 NeedWaitStates = DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
2231 break;
2232 case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2233 case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2234 if (!isXDL(ST, *MI))
2235 NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
2236 break;
2237 default:
2238 if (ST.hasGFX940Insts() && isXDL(ST, *MI) && !isXDL(ST, *MI1))
2239 break;
2240 switch (TSchedModel.computeInstrLatency(MI1)) {
2241 case 2:
2242 NeedWaitStates = ST.hasGFX940Insts()
2243 ? isXDL(ST, *MI1)
2244 ? GFX940_XDL2PassWritesVGPROverlappedSMFMASrcCWaitStates
2245 : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates
2246 : isDGEMM(Opc)
2247 ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
2248 : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
2249 break;
2250 case 4:
2251 assert(ST.hasGFX940Insts());
2252 NeedWaitStates = isXDL(ST, *MI1)
2253 ? GFX940_XDL4PassWritesVGPROverlappedSMFMASrcCWaitStates
2254 : GFX940_SMFMA4PassWritesVGPROverlappedSMFMASrcCWaitStates;
2255 break;
2256 case 8:
2257 NeedWaitStates = ST.hasGFX940Insts()
2258 ? isXDL(ST, *MI1)
2259 ? GFX940_XDL8PassWritesVGPROverlappedSMFMASrcCWaitStates
2260 : GFX940_SMFMA8PassWritesVGPROverlappedSMFMASrcCWaitStates
2261 : isDGEMM(Opc)
2262 ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
2263 : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
2264 break;
2265 case 16: [[fallthrough]];
2266 default:
2267 NeedWaitStates = ST.hasGFX940Insts()
2268 ? isXDL(ST, *MI1)
2269 ? GFX940_XDL16PassWritesVGPROverlappedSMFMASrcCWaitStates
2270 : GFX940_SMFMA16PassWritesVGPROverlappedSMFMASrcCWaitStates
2271 : isDGEMM(Opc)
2272 ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
2273 : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
2274 }
2275 }
2276 }
2277 } else {
2278 switch (Opc1) {
2279 case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
2280 case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
2281 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_e64:
2282 case AMDGPU::V_MFMA_F64_16X16X4F64_mac_vgprcd_e64:
2283 NeedWaitStates = DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
2284 break;
2285 case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
2286 case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
2287 NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;
2288 break;
2289 default:
2290 switch (TSchedModel.computeInstrLatency(MI1)) {
2291 case 2:
2292 NeedWaitStates = ST.hasGFX940Insts()
2293 ? isXDL(ST, *MI1)
2294 ? GFX940_XDL2PassWritesVGPROverlappedSrcABWaitStates
2295 : GFX940_SMFMA2PassWritesVGPROverlappedSrcABWaitStates
2296 : SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
2297 break;
2298 case 4:
2299 assert(ST.hasGFX940Insts());
2300 NeedWaitStates = isXDL(ST, *MI1)
2301 ? GFX940_XDL4PassWritesVGPROverlappedSrcABWaitStates
2302 : GFX940_SMFMA4PassWritesVGPROverlappedSrcABWaitStates;
2303 break;
2304 case 8:
2305 NeedWaitStates = ST.hasGFX940Insts()
2306 ? isXDL(ST, *MI1)
2307 ? GFX940_XDL8PassWritesVGPROverlappedSrcABWaitStates
2308 : GFX940_SMFMA8PassWritesVGPROverlappedSrcABWaitStates
2309 : SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
2310 break;
2311 case 16: [[fallthrough]];
2312 default:
2313 NeedWaitStates = ST.hasGFX940Insts()
2314 ? isXDL(ST, *MI1)
2315 ? GFX940_XDL16PassWritesVGPROverlappedSrcABWaitStates
2316 : GFX940_SMFMA16PassWritesVGPROverlappedSrcABWaitStates
2317 : SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
2318 }
2319 }
2320 }
2321 if (WaitStatesNeeded >= NeedWaitStates)
2322 continue;
2323
2324 WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
2325 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2326
2327 if (WaitStatesNeeded == MaxWaitStates)
2328 break;
2329 }
2330
2331 return WaitStatesNeeded;
2332 }
2333
2334 int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
2335 // On gfx90a+ relevant hazards are checked in checkMAIVALUHazards()
2336 if (!ST.hasMAIInsts() || ST.hasGFX90AInsts())
2337 return 0;
2338
2339 int WaitStatesNeeded = 0;
2340
2341 auto IsAccVgprReadFn = [](const MachineInstr &MI) {
2342 return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
2343 };
2344
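  // A VGPR use recently defined by v_accvgpr_read needs up to 2 wait states;
  // a use defined by v_accvgpr_read/write that itself depends on a plain VALU
  // write needs 1 wait state.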
2345 for (const MachineOperand &Op : MI->explicit_uses()) {
2346 if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
2347 continue;
2348
2349 Register Reg = Op.getReg();
2350
2351 const int AccVgprReadLdStWaitStates = 2;
2352 const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
2353 const int MaxWaitStates = 2;
2354
2355 int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
2356 getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
2357 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2358
2359 if (WaitStatesNeeded == MaxWaitStates)
2360 return WaitStatesNeeded; // Early exit.
2361
2362 auto IsVALUAccVgprRdWrCheckFn = [Reg, this](const MachineInstr &MI) {
2363 if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
2364 MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
2365 return false;
2366 auto IsVALUFn = [](const MachineInstr &MI) {
2367 return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMAI(MI);
2368 };
2369 return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
2370 std::numeric_limits<int>::max();
2371 };
2372
2373 WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
2374 getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
2375 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2376 }
2377
2378 return WaitStatesNeeded;
2379 }
2380
2381 int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
2382 if (!ST.hasGFX90AInsts())
2383 return 0;
2384
2385 auto IsDGEMMFn = [](const MachineInstr &MI) -> bool {
2386 return isDGEMM(MI.getOpcode());
2387 };
2388
2389 // This is checked in checkMAIHazards90A()
2390 if (SIInstrInfo::isMFMA(*MI))
2391 return 0;
2392
2393 const MachineRegisterInfo &MRI = MF.getRegInfo();
2394
2395 int WaitStatesNeeded = 0;
2396
2397 bool IsMem = SIInstrInfo::isVMEM(*MI) ||
2398 SIInstrInfo::isFLAT(*MI) ||
2399 SIInstrInfo::isDS(*MI);
2400 bool IsMemOrExport = IsMem || SIInstrInfo::isEXP(*MI);
2401 bool IsVALU = SIInstrInfo::isVALU(*MI);
2402
2403 const MachineInstr *MFMA = nullptr;
2404 unsigned Reg;
2405 auto IsMFMAWriteFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
2406 if (!SIInstrInfo::isMFMA(MI) ||
2407 !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
2408 return false;
2409 MFMA = &MI;
2410 return true;
2411 };
2412
2413 const MachineInstr *DOT = nullptr;
2414 auto IsDotWriteFn = [&Reg, &DOT, this](const MachineInstr &MI) {
2415 if (!SIInstrInfo::isDOT(MI) ||
2416 !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
2417 return false;
2418 DOT = &MI;
2419 return true;
2420 };
2421
2422 bool DGEMMAfterVALUWrite = false;
2423 auto IsDGEMMHazard = [&DGEMMAfterVALUWrite, this](const MachineInstr &MI) {
2424 // Found DGEMM on reverse traversal to def.
2425 if (isDGEMM(MI.getOpcode()))
2426 DGEMMAfterVALUWrite = true;
2427
2428 // Only a hazard if the register is defined by a VALU and a DGEMM is found
2429 // after the def.
2430 if (!TII.isVALU(MI) || !DGEMMAfterVALUWrite)
2431 return false;
2432
2433 return true;
2434 };
2435
2436 int SrcCIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
2437 AMDGPU::OpName::src2);
2438
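  // Check RAW hazards on the uses: reading a VGPR recently written by a DOT or
  // MFMA (or, for memory instructions, by a VALU with a DGEMM in-between)
  // needs a latency-dependent number of wait states.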
2439 if (IsMemOrExport || IsVALU) {
2440 const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
2441 const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
2442 const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
2443 const int GFX940_SMFMA2PassWriteVgprVALUMemExpReadWaitStates = 4;
2444 const int GFX940_SMFMA4PassWriteVgprVALUMemExpReadWaitStates = 6;
2445 const int GFX940_SMFMA8PassWriteVgprVALUMemExpReadWaitStates = 10;
2446 const int GFX940_SMFMA16PassWriteVgprVALUMemExpReadWaitStates = 18;
2447 const int GFX940_XDL2PassWriteVgprVALUMemExpReadWaitStates = 5;
2448 const int GFX940_XDL4PassWriteVgprVALUMemExpReadWaitStates = 7;
2449 const int GFX940_XDL8PassWriteVgprVALUMemExpReadWaitStates = 11;
2450 const int GFX940_XDL16PassWriteVgprVALUMemExpReadWaitStates = 19;
2451 const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
2452 const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
2453 const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
2454 const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
2455 const int DotWriteSameDotReadSrcAB = 3;
2456 const int DotWriteDifferentVALURead = 3;
2457 const int DMFMABetweenVALUWriteVMEMRead = 2;
2458 const int MaxWaitStates = 19;
2459
2460 for (const MachineOperand &Use : MI->explicit_uses()) {
2461 if (!Use.isReg())
2462 continue;
2463 Reg = Use.getReg();
2464
2465 DOT = nullptr;
2466 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
2467 MaxWaitStates);
2468 if (DOT) {
2469 int NeedWaitStates = 0;
2470 if (DOT->getOpcode() == MI->getOpcode()) {
2471 if (&Use - &MI->getOperand(0) != SrcCIdx)
2472 NeedWaitStates = DotWriteSameDotReadSrcAB;
2473 } else {
2474 NeedWaitStates = DotWriteDifferentVALURead;
2475 }
2476
2477 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2478 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2479 }
2480
2481 // Workaround for a HW data hazard bug observed only in GFX90A. When there
2482 // is a DGEMM instruction in-between a VALU and a VMEM instruction, it
2483 // causes the SQ to incorrectly omit the two wait states needed between the
2484 // two instructions to avoid the data hazard.
2485 if (IsMem && ST.hasGFX90AInsts() && !ST.hasGFX940Insts()) {
2486 DGEMMAfterVALUWrite = false;
2487 if (TRI.isVectorRegister(MRI, Reg)) {
2488 int WaitStatesNeededForUse =
2489 DMFMABetweenVALUWriteVMEMRead -
2490 getWaitStatesSinceDef(Reg, IsDGEMMHazard,
2491 DMFMABetweenVALUWriteVMEMRead);
2492
2493 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2494 }
2495 }
2496
2497 MFMA = nullptr;
2498 WaitStatesSinceDef =
2499 getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
2500 if (!MFMA)
2501 continue;
2502
2503 unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
2504 int NeedWaitStates = MaxWaitStates;
2505 switch (HazardDefLatency) {
2506 case 2:
2507 NeedWaitStates =
2508 ST.hasGFX940Insts()
2509 ? isXDL(ST, *MFMA)
2510 ? GFX940_XDL2PassWriteVgprVALUMemExpReadWaitStates
2511 : GFX940_SMFMA2PassWriteVgprVALUMemExpReadWaitStates
2512 : SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
2513 break;
2514 case 4:
2515 assert(isDGEMM(MFMA->getOpcode()) || ST.hasGFX940Insts());
2516 NeedWaitStates =
2517 isDGEMM(MFMA->getOpcode())
2518 ? IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
2519 : DMFMA4x4WriteVgprVALUReadWaitStates
2520 : isXDL(ST, *MFMA)
2521 ? GFX940_XDL4PassWriteVgprVALUMemExpReadWaitStates
2522 : GFX940_SMFMA4PassWriteVgprVALUMemExpReadWaitStates;
2523 break;
2524 case 8:
2525 NeedWaitStates =
2526 ST.hasGFX940Insts()
2527 ? isXDL(ST, *MFMA)
2528 ? GFX940_XDL8PassWriteVgprVALUMemExpReadWaitStates
2529 : GFX940_SMFMA8PassWriteVgprVALUMemExpReadWaitStates
2530 : SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
2531 break;
2532 case 16: [[fallthrough]];
2533 default:
2534 NeedWaitStates =
2535 isDGEMM(MFMA->getOpcode())
2536 ? IsMemOrExport ? DMFMA16x16WriteVgprMemExpReadWaitStates
2537 : DMFMA16x16WriteVgprVALUReadWaitStates
2538 : ST.hasGFX940Insts()
2539 ? isXDL(ST, *MFMA)
2540 ? GFX940_XDL16PassWriteVgprVALUMemExpReadWaitStates
2541 : GFX940_SMFMA16PassWriteVgprVALUMemExpReadWaitStates
2542 : SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
2543 break;
2544 }
2545
2546 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2547 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2548
2549 if (WaitStatesNeeded == MaxWaitStates)
2550 break;
2551 }
2552 }
2553
2554 unsigned Opc = MI->getOpcode();
2555 const int DMFMAToFMA64WaitStates = 2;
2556 if ((Opc == AMDGPU::V_FMA_F64_e64 ||
2557 Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 ||
2558 Opc == AMDGPU::V_FMAC_F64_dpp) &&
2559 WaitStatesNeeded < DMFMAToFMA64WaitStates) {
2560 int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
2561 getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates);
2562 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2563 }
2564
2565 if (!IsVALU && !IsMemOrExport)
2566 return WaitStatesNeeded;
2567
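  // Check WAW/WAR hazards on the defs: a def that overlaps a register recently
  // written by an MFMA/DOT (WAW) or read as SrcC by an SMFMA (WAR) needs
  // additional wait states.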
2568 for (const MachineOperand &Def : MI->defs()) {
2569 const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
2570 const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
2571 const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
2572 const int GFX940_SMFMA2PassWriteVgprVALUWawWaitStates = 4;
2573 const int GFX940_SMFMA4PassWriteVgprVALUWawWaitStates = 6;
2574 const int GFX940_SMFMA8PassWriteVgprVALUWawWaitStates = 10;
2575 const int GFX940_SMFMA16PassWriteVgprVALUWawWaitStates = 18;
2576 const int GFX940_XDL2PassWriteVgprVALUWawWaitStates = 5;
2577 const int GFX940_XDL4PassWriteVgprVALUWawWaitStates = 7;
2578 const int GFX940_XDL8PassWriteVgprVALUWawWaitStates = 11;
2579 const int GFX940_XDL16PassWriteVgprVALUWawWaitStates = 19;
2580 const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
2581 const int GFX940_XDL4PassReadVgprVALUWarWaitStates = 3;
2582 const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
2583 const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
2584 const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
2585 const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
2586 const int DotWriteDifferentVALUWrite = 3;
2587 const int MaxWaitStates = 19;
2588 const int MaxWarWaitStates = 15;
2589
2590 Reg = Def.getReg();
2591
2592 DOT = nullptr;
2593 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
2594 MaxWaitStates);
2595 if (DOT && DOT->getOpcode() != MI->getOpcode())
2596 WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite -
2597 WaitStatesSinceDef);
2598
2599 MFMA = nullptr;
2600 WaitStatesSinceDef =
2601 getWaitStatesSinceDef(Reg, IsMFMAWriteFn, MaxWaitStates);
2602 if (MFMA) {
2603 int NeedWaitStates = MaxWaitStates;
2604 switch (TSchedModel.computeInstrLatency(MFMA)) {
2605 case 2:
2606 NeedWaitStates = ST.hasGFX940Insts()
2607 ? isXDL(ST, *MFMA)
2608 ? GFX940_XDL2PassWriteVgprVALUWawWaitStates
2609 : GFX940_SMFMA2PassWriteVgprVALUWawWaitStates
2610 : SMFMA4x4WriteVgprVALUWawWaitStates;
2611 break;
2612 case 4:
2613 assert(isDGEMM(MFMA->getOpcode()) || ST.hasGFX940Insts());
2614 NeedWaitStates = isDGEMM(MFMA->getOpcode())
2615 ? DMFMA4x4WriteVgprVALUWriteWaitStates
2616 : isXDL(ST, *MFMA)
2617 ? GFX940_XDL4PassWriteVgprVALUWawWaitStates
2618 : GFX940_SMFMA4PassWriteVgprVALUWawWaitStates;
2619 break;
2620 case 8:
2621 NeedWaitStates = ST.hasGFX940Insts()
2622 ? isXDL(ST, *MFMA)
2623 ? GFX940_XDL8PassWriteVgprVALUWawWaitStates
2624 : GFX940_SMFMA8PassWriteVgprVALUWawWaitStates
2625 : SMFMA16x16WriteVgprVALUWawWaitStates;
2626 break;
2627 case 16: [[fallthrough]];
2628 default:
2629 NeedWaitStates = isDGEMM(MFMA->getOpcode())
2630 ? DMFMA16x16WriteVgprVALUWriteWaitStates
2631 : ST.hasGFX940Insts()
2632 ? isXDL(ST, *MFMA)
2633 ? GFX940_XDL16PassWriteVgprVALUWawWaitStates
2634 : GFX940_SMFMA16PassWriteVgprVALUWawWaitStates
2635 : SMFMA32x32WriteVgprVALUWawWaitStates;
2636 break;
2637 }
2638
2639 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
2640 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2641
2642 if (WaitStatesNeeded == MaxWaitStates)
2643 break;
2644 }
2645
2646 auto IsSMFMAReadAsCFn = [&Reg, &MFMA, this](const MachineInstr &MI) {
2647 if (!SIInstrInfo::isMFMA(MI) || isDGEMM(MI.getOpcode()) ||
2648 !MI.readsRegister(Reg, &TRI))
2649 return false;
2650
2651 if (ST.hasGFX940Insts() && !isXDL(ST, MI))
2652 return false;
2653
2654 const MachineOperand *SrcC =
2655 TII.getNamedOperand(MI, AMDGPU::OpName::src2);
2656 assert(SrcC);
2657 if (!SrcC->isReg() || !TRI.regsOverlap(SrcC->getReg(), Reg))
2658 return false;
2659
2660 MFMA = &MI;
2661 return true;
2662 };
2663
2664 MFMA = nullptr;
2665 int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn,
2666 MaxWarWaitStates);
2667 if (!MFMA)
2668 continue;
2669
2670 unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
2671 int NeedWaitStates = MaxWaitStates;
2672 switch (HazardDefLatency) {
2673 case 2: NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
2674 break;
2675 case 4: assert(ST.hasGFX940Insts());
2676 NeedWaitStates = GFX940_XDL4PassReadVgprVALUWarWaitStates;
2677 break;
2678 case 8: NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
2679 break;
2680 case 16: [[fallthrough]];
2681 default: NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
2682 break;
2683 }
2684
2685 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
2686 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
2687 }
2688
2689 return WaitStatesNeeded;
2690 }
2691
2692 bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
2693 if (!SU->isInstr())
2694 return false;
2695
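  // If this SUnit is an MFMA, prefer another candidate when a previous MFMA
  // was issued within this one's pipeline latency window.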
2696 const MachineInstr *MAI = nullptr;
2697
2698 auto IsMFMAFn = [&MAI](const MachineInstr &MI) {
2699 MAI = nullptr;
2700 if (SIInstrInfo::isMFMA(MI))
2701 MAI = &MI;
2702 return MAI != nullptr;
2703 };
2704
2705 MachineInstr *MI = SU->getInstr();
2706 if (IsMFMAFn(*MI)) {
2707 int W = getWaitStatesSince(IsMFMAFn, 16);
2708 if (MAI)
2709 return W < (int)TSchedModel.computeInstrLatency(MAI);
2710 }
2711
2712 return false;
2713 }
2714
2715 bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
2716 if (!ST.isWave64())
2717 return false;
2718 if (!ST.hasVALUMaskWriteHazard())
2719 return false;
2720 if (!SIInstrInfo::isSALU(*MI))
2721 return false;
2722
2723 // The hazard sequence is three instructions:
2724 // 1. VALU reads SGPR as mask
2725 // 2. SALU writes SGPR
2726 // 3. SALU reads SGPR
2727 // The hazard can expire if the distance between 2 and 3 is sufficient.
2728 // In practice this happens less than 10% of the time, so to avoid the search
2729 // we conservatively assume the hazard exists whenever 1 and 2 are present.
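  // A minimal sketch of the sequence (hypothetical registers):
  //   v_cndmask_b32 v0, v1, v2, s[0:1]  ; (1) VALU reads s[0:1] as mask
  //   s_mov_b64     s[0:1], exec        ; (2) MI: SALU writes s[0:1]
  //   s_cmp_lg_u64  s[0:1], 0           ; (3) SALU reads s[0:1]
  // The fix adds s_waitcnt_depctr sa_sdst(0) directly after (2).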
2730
2731 const MachineOperand *SDSTOp = TII.getNamedOperand(*MI, AMDGPU::OpName::sdst);
2732 if (!SDSTOp || !SDSTOp->isReg())
2733 return false;
2734
2735 const Register HazardReg = SDSTOp->getReg();
2736 if (HazardReg == AMDGPU::EXEC ||
2737 HazardReg == AMDGPU::EXEC_LO ||
2738 HazardReg == AMDGPU::EXEC_HI ||
2739 HazardReg == AMDGPU::M0)
2740 return false;
2741
2742 auto IsHazardFn = [HazardReg, this](const MachineInstr &I) {
2743 switch (I.getOpcode()) {
2744 case AMDGPU::V_ADDC_U32_e32:
2745 case AMDGPU::V_ADDC_U32_dpp:
2746 case AMDGPU::V_CNDMASK_B16_e32:
2747 case AMDGPU::V_CNDMASK_B16_dpp:
2748 case AMDGPU::V_CNDMASK_B32_e32:
2749 case AMDGPU::V_CNDMASK_B32_dpp:
2750 case AMDGPU::V_DIV_FMAS_F32_e64:
2751 case AMDGPU::V_DIV_FMAS_F64_e64:
2752 case AMDGPU::V_SUBB_U32_e32:
2753 case AMDGPU::V_SUBB_U32_dpp:
2754 case AMDGPU::V_SUBBREV_U32_e32:
2755 case AMDGPU::V_SUBBREV_U32_dpp:
2756 // These implicitly read VCC as mask source.
2757 return HazardReg == AMDGPU::VCC ||
2758 HazardReg == AMDGPU::VCC_LO ||
2759 HazardReg == AMDGPU::VCC_HI;
2760 case AMDGPU::V_ADDC_U32_e64:
2761 case AMDGPU::V_ADDC_U32_e64_dpp:
2762 case AMDGPU::V_CNDMASK_B16_e64:
2763 case AMDGPU::V_CNDMASK_B16_e64_dpp:
2764 case AMDGPU::V_CNDMASK_B32_e64:
2765 case AMDGPU::V_CNDMASK_B32_e64_dpp:
2766 case AMDGPU::V_SUBB_U32_e64:
2767 case AMDGPU::V_SUBB_U32_e64_dpp:
2768 case AMDGPU::V_SUBBREV_U32_e64:
2769 case AMDGPU::V_SUBBREV_U32_e64_dpp: {
2770 // Only check mask register overlaps.
2771 const MachineOperand *SSRCOp = TII.getNamedOperand(I, AMDGPU::OpName::src2);
2772 assert(SSRCOp);
2773 return TRI.regsOverlap(SSRCOp->getReg(), HazardReg);
2774 }
2775 default:
2776 return false;
2777 }
2778 };
2779
2780 const MachineRegisterInfo &MRI = MF.getRegInfo();
2781 auto IsExpiredFn = [&MRI, this](const MachineInstr &I, int) {
2782 // s_waitcnt_depctr sa_sdst(0) mitigates hazard.
2783 if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
2784 !(I.getOperand(0).getImm() & 0x1))
2785 return true;
2786
2787 // VALU access to any SGPR or literal constant other than HazardReg
2788 // mitigates hazard. No need to check HazardReg here as this will
2789 // only be called when !IsHazardFn.
2790 if (!SIInstrInfo::isVALU(I))
2791 return false;
2792 for (int OpNo = 0, End = I.getNumOperands(); OpNo < End; ++OpNo) {
2793 const MachineOperand &Op = I.getOperand(OpNo);
2794 if (Op.isReg()) {
2795 Register OpReg = Op.getReg();
2796 // Only consider uses
2797 if (!Op.isUse())
2798 continue;
2799 // Ignore EXEC
2800 if (OpReg == AMDGPU::EXEC ||
2801 OpReg == AMDGPU::EXEC_LO ||
2802 OpReg == AMDGPU::EXEC_HI)
2803 continue;
2804 // Ignore all implicit uses except VCC
2805 if (Op.isImplicit()) {
2806 if (OpReg == AMDGPU::VCC ||
2807 OpReg == AMDGPU::VCC_LO ||
2808 OpReg == AMDGPU::VCC_HI)
2809 return true;
2810 continue;
2811 }
2812 if (TRI.isSGPRReg(MRI, OpReg))
2813 return true;
2814 } else {
2815 const MCInstrDesc &InstDesc = I.getDesc();
2816 const MCOperandInfo &OpInfo = InstDesc.operands()[OpNo];
2817 if (!TII.isInlineConstant(Op, OpInfo))
2818 return true;
2819 }
2820 }
2821 return false;
2822 };
2823
2824 // Check for hazard
2825 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
2826 std::numeric_limits<int>::max())
2827 return false;
2828
2829 auto NextMI = std::next(MI->getIterator());
2830
2831 // Add s_waitcnt_depctr sa_sdst(0) after SALU write.
2832 BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
2833 TII.get(AMDGPU::S_WAITCNT_DEPCTR))
2834 .addImm(0xfffe);
2835
2836 // SALU write may be s_getpc in a bundle.
2837 if (MI->getOpcode() == AMDGPU::S_GETPC_B64) {
2838 // Update offsets of any references in the bundle.
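      // The inserted s_waitcnt_depctr is 4 bytes and sits between s_getpc and
      // the bundled users of its result, so their PC-relative offsets grow by 4.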
2839 while (NextMI != MI->getParent()->end() &&
2840 NextMI->isBundledWithPred()) {
2841 for (auto &Operand : NextMI->operands()) {
2842 if (Operand.isGlobal())
2843 Operand.setOffset(Operand.getOffset() + 4);
2844 }
2845 NextMI++;
2846 }
2847 }
2848
2849 return true;
2850 }
2851