//===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements hazard recognizers for scheduling on GCN processors.
//
//===----------------------------------------------------------------------===//

#include "GCNHazardRecognizer.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/Support/TargetParser.h"

using namespace llvm;

//===----------------------------------------------------------------------===//
// Hazard Recognizer Implementation
//===----------------------------------------------------------------------===//

static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
                                                 const GCNSubtarget &ST);

GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) :
  IsHazardRecognizerMode(false),
  CurrCycleInstr(nullptr),
  MF(MF),
  ST(MF.getSubtarget<GCNSubtarget>()),
  TII(*ST.getInstrInfo()),
  TRI(TII.getRegisterInfo()),
  ClauseUses(TRI.getNumRegUnits()),
  ClauseDefs(TRI.getNumRegUnits()) {
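  // Functions that use AGPRs are subject to MFMA hazards that can require up
  // to 19 wait states (see the checkMAIHazards* routines below), so use a
  // wider lookahead window for them; 5 wait states suffice otherwise.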
  MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 19 : 5;
  TSchedModel.init(&ST);
  RunLdsBranchVmemWARHazardFixup = shouldRunLdsBranchVmemWARHazardFixup(MF, ST);
}

void GCNHazardRecognizer::Reset() {
  EmittedInstrs.clear();
}

void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
  EmitInstruction(SU->getInstr());
}

void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
  CurrCycleInstr = MI;
}

static bool isDivFMas(unsigned Opcode) {
  return Opcode == AMDGPU::V_DIV_FMAS_F32_e64 ||
         Opcode == AMDGPU::V_DIV_FMAS_F64_e64;
}

static bool isSGetReg(unsigned Opcode) {
  return Opcode == AMDGPU::S_GETREG_B32;
}

static bool isSSetReg(unsigned Opcode) {
  switch (Opcode) {
  case AMDGPU::S_SETREG_B32:
  case AMDGPU::S_SETREG_B32_mode:
  case AMDGPU::S_SETREG_IMM32_B32:
  case AMDGPU::S_SETREG_IMM32_B32_mode:
    return true;
  }
  return false;
}

static bool isRWLane(unsigned Opcode) {
  return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
}

static bool isRFE(unsigned Opcode) {
  return Opcode == AMDGPU::S_RFE_B64;
}

static bool isSMovRel(unsigned Opcode) {
  switch (Opcode) {
  case AMDGPU::S_MOVRELS_B32:
  case AMDGPU::S_MOVRELS_B64:
  case AMDGPU::S_MOVRELD_B32:
  case AMDGPU::S_MOVRELD_B64:
    return true;
  default:
    return false;
  }
}

static bool isDGEMM(unsigned Opcode) {
  return Opcode == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
         Opcode == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64 ||
         Opcode == AMDGPU::V_MFMA_F64_16X16X4F64_e64 ||
         Opcode == AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64;
}

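// For hazard purposes, every MAI instruction other than a DGEMM and the
// v_accvgpr read/write pseudos is treated as an XDL instruction.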
static bool isXDL(const GCNSubtarget &ST, const MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();

  if (!SIInstrInfo::isMAI(MI) ||
      isDGEMM(Opcode) ||
      Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
      Opcode == AMDGPU::V_ACCVGPR_READ_B32_e64)
    return false;

  return true;
}

static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
                                    const MachineInstr &MI) {
  if (TII.isAlwaysGDS(MI.getOpcode()))
    return true;

  switch (MI.getOpcode()) {
  case AMDGPU::S_SENDMSG:
  case AMDGPU::S_SENDMSGHALT:
  case AMDGPU::S_TTRACEDATA:
    return true;
  // These DS opcodes don't support GDS.
  case AMDGPU::DS_NOP:
  case AMDGPU::DS_PERMUTE_B32:
  case AMDGPU::DS_BPERMUTE_B32:
    return false;
  default:
    if (TII.isDS(MI.getOpcode())) {
      int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
                                           AMDGPU::OpName::gds);
      if (MI.getOperand(GDS).getImm())
        return true;
    }
    return false;
  }
}

static bool isPermlane(const MachineInstr &MI) {
  unsigned Opcode = MI.getOpcode();
  return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
         Opcode == AMDGPU::V_PERMLANEX16_B32_e64;
}

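// Extract the hardware register ID from the simm16 operand of an
// s_getreg/s_setreg instruction.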
static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
  const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
                                                     AMDGPU::OpName::simm16);
  return RegOp->getImm() & AMDGPU::Hwreg::ID_MASK_;
}

ScheduleHazardRecognizer::HazardType
GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
  MachineInstr *MI = SU->getInstr();
  // If we are not in "HazardRecognizerMode" and therefore not being run from
  // the scheduler, track possible stalls from hazards but don't insert noops.
  auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard;

  if (MI->isBundle())
    return NoHazard;

  if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
    return HazardType;

  if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
    return HazardType;

  if (checkFPAtomicToDenormModeHazard(MI) > 0)
    return HazardType;

  if (ST.hasNoDataDepHazard())
    return NoHazard;

  // FIXME: Should flat be considered vmem?
  if ((SIInstrInfo::isVMEM(*MI) ||
       SIInstrInfo::isFLAT(*MI))
      && checkVMEMHazards(MI) > 0)
    return HazardType;

  if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
    return HazardType;

  if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
    return HazardType;

  if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
    return HazardType;

  if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
    return HazardType;

  if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
       SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) ||
       SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0)
    return HazardType;

  if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
    return HazardType;

  if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
    return HazardType;

  if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
    return HazardType;

  if (ST.hasReadM0MovRelInterpHazard() &&
      (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode())) &&
      checkReadM0Hazards(MI) > 0)
    return HazardType;

  if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI) &&
      checkReadM0Hazards(MI) > 0)
    return HazardType;

  if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
    return HazardType;

  if ((SIInstrInfo::isVMEM(*MI) ||
       SIInstrInfo::isFLAT(*MI) ||
       SIInstrInfo::isDS(*MI)) && checkMAILdStHazards(MI) > 0)
    return HazardType;

  if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
    return HazardType;

  return NoHazard;
}

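// Emit the requested number of wait states as s_nop instructions. Each s_nop
// waits (imm + 1) cycles, so a single instruction here covers at most 8 wait
// states.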
static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII,
                                unsigned Quantity) {
  while (Quantity > 0) {
    unsigned Arg = std::min(Quantity, 8u);
    Quantity -= Arg;
    BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
        .addImm(Arg - 1);
  }
}

void GCNHazardRecognizer::processBundle() {
  MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator());
  MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
  // Check bundled MachineInstr's for hazards.
  for (; MI != E && MI->isInsideBundle(); ++MI) {
    CurrCycleInstr = &*MI;
    unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);

    if (IsHazardRecognizerMode) {
      fixHazards(CurrCycleInstr);

      insertNoopsInBundle(CurrCycleInstr, TII, WaitStates);
    }

    // It's unnecessary to track more than MaxLookAhead instructions. Since we
    // include the bundled MI directly after, only add a maximum of
    // (MaxLookAhead - 1) noops to EmittedInstrs.
    for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
      EmittedInstrs.push_front(nullptr);

    EmittedInstrs.push_front(CurrCycleInstr);
    EmittedInstrs.resize(MaxLookAhead);
  }
  CurrCycleInstr = nullptr;
}

unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
  IsHazardRecognizerMode = true;
  CurrCycleInstr = MI;
  unsigned W = PreEmitNoopsCommon(MI);
  fixHazards(MI);
  CurrCycleInstr = nullptr;
  return W;
}

unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
  if (MI->isBundle())
    return 0;

  int WaitStates = 0;

  if (SIInstrInfo::isSMRD(*MI))
    return std::max(WaitStates, checkSMRDHazards(MI));

  if (ST.hasNSAtoVMEMBug())
    WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));

  WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));

  if (ST.hasNoDataDepHazard())
    return WaitStates;

  if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI))
    WaitStates = std::max(WaitStates, checkVMEMHazards(MI));

  if (SIInstrInfo::isVALU(*MI))
    WaitStates = std::max(WaitStates, checkVALUHazards(MI));

  if (SIInstrInfo::isDPP(*MI))
    WaitStates = std::max(WaitStates, checkDPPHazards(MI));

  if (isDivFMas(MI->getOpcode()))
    WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));

  if (isRWLane(MI->getOpcode()))
    WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));

  if ((SIInstrInfo::isVALU(*MI) || SIInstrInfo::isVMEM(*MI) ||
       SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI) ||
       SIInstrInfo::isEXP(*MI)) && checkMAIVALUHazards(MI) > 0)
    WaitStates = std::max(WaitStates, checkMAIVALUHazards(MI));

  if (MI->isInlineAsm())
    return std::max(WaitStates, checkInlineAsmHazards(MI));

  if (isSGetReg(MI->getOpcode()))
    return std::max(WaitStates, checkGetRegHazards(MI));

  if (isSSetReg(MI->getOpcode()))
    return std::max(WaitStates, checkSetRegHazards(MI));

  if (isRFE(MI->getOpcode()))
    return std::max(WaitStates, checkRFEHazards(MI));

  if (ST.hasReadM0MovRelInterpHazard() && (TII.isVINTRP(*MI) ||
                                           isSMovRel(MI->getOpcode())))
    return std::max(WaitStates, checkReadM0Hazards(MI));

  if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI))
    return std::max(WaitStates, checkReadM0Hazards(MI));

  if (SIInstrInfo::isMAI(*MI))
    return std::max(WaitStates, checkMAIHazards(MI));

  if (SIInstrInfo::isVMEM(*MI) ||
      SIInstrInfo::isFLAT(*MI) ||
      SIInstrInfo::isDS(*MI))
    return std::max(WaitStates, checkMAILdStHazards(MI));

  return WaitStates;
}

void GCNHazardRecognizer::EmitNoop() {
  EmittedInstrs.push_front(nullptr);
}

void GCNHazardRecognizer::AdvanceCycle() {
  // When the scheduler detects a stall, it will call AdvanceCycle() without
  // emitting any instructions.
  if (!CurrCycleInstr) {
    EmittedInstrs.push_front(nullptr);
    return;
  }

  // Do not track non-instructions which do not affect the wait states.
  // If included, these instructions can lead to buffer overflow such that
  // detectable hazards are missed.
  if (CurrCycleInstr->isMetaInstruction()) {
    CurrCycleInstr = nullptr;
    return;
  }

  if (CurrCycleInstr->isBundle()) {
    processBundle();
    return;
  }

  unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);

  // Keep track of emitted instructions.
  EmittedInstrs.push_front(CurrCycleInstr);

  // Add a nullptr for each additional wait state after the first. Make sure
  // not to add more than getMaxLookAhead() items to the list, since we
  // truncate the list to that size right after this loop.
  for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
       i < e; ++i) {
    EmittedInstrs.push_front(nullptr);
  }

  // getMaxLookahead() is the largest number of wait states we will ever need
  // to insert, so there is no point in keeping track of more than that many
  // wait states.
  EmittedInstrs.resize(getMaxLookAhead());

  CurrCycleInstr = nullptr;
}

void GCNHazardRecognizer::RecedeCycle() {
  llvm_unreachable("hazard recognizer does not support bottom-up scheduling.");
}

//===----------------------------------------------------------------------===//
// Helper Functions
//===----------------------------------------------------------------------===//

typedef function_ref<bool(const MachineInstr &, int WaitStates)> IsExpiredFn;

// Returns the minimum number of wait states since \p I, walking all
// predecessors. Scans only until \p IsExpired returns true.
// Can only be run in hazard recognizer mode.
static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
                              const MachineBasicBlock *MBB,
                              MachineBasicBlock::const_reverse_instr_iterator I,
                              int WaitStates, IsExpiredFn IsExpired,
                              DenseSet<const MachineBasicBlock *> &Visited) {
  for (auto E = MBB->instr_rend(); I != E; ++I) {
    // Don't add WaitStates for parent BUNDLE instructions.
    if (I->isBundle())
      continue;

    if (IsHazard(*I))
      return WaitStates;

    if (I->isInlineAsm() || I->isMetaInstruction())
      continue;

    WaitStates += SIInstrInfo::getNumWaitStates(*I);

    if (IsExpired(*I, WaitStates))
      return std::numeric_limits<int>::max();
  }

  int MinWaitStates = std::numeric_limits<int>::max();
  for (MachineBasicBlock *Pred : MBB->predecessors()) {
    if (!Visited.insert(Pred).second)
      continue;

    int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(),
                               WaitStates, IsExpired, Visited);

    MinWaitStates = std::min(MinWaitStates, W);
  }

  return MinWaitStates;
}

static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
                              const MachineInstr *MI, IsExpiredFn IsExpired) {
  DenseSet<const MachineBasicBlock *> Visited;
  return getWaitStatesSince(IsHazard, MI->getParent(),
                            std::next(MI->getReverseIterator()),
                            0, IsExpired, Visited);
}

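// Find the number of wait states since the last instruction for which
// \p IsHazard returns true. In hazard recognizer mode this walks the CFG
// backwards from the current instruction; otherwise it scans the window of
// already-emitted instructions.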
int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
  if (IsHazardRecognizerMode) {
    auto IsExpiredFn = [Limit](const MachineInstr &, int WaitStates) {
      return WaitStates >= Limit;
    };
    return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);
  }

  int WaitStates = 0;
  for (MachineInstr *MI : EmittedInstrs) {
    if (MI) {
      if (IsHazard(*MI))
        return WaitStates;

      if (MI->isInlineAsm())
        continue;
    }
    ++WaitStates;

    if (WaitStates >= Limit)
      break;
  }
  return std::numeric_limits<int>::max();
}

int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
                                               IsHazardFn IsHazardDef,
                                               int Limit) {
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsHazardFn = [IsHazardDef, TRI, Reg](const MachineInstr &MI) {
    return IsHazardDef(MI) && MI.modifiesRegister(Reg, TRI);
  };

  return getWaitStatesSince(IsHazardFn, Limit);
}

int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
                                                  int Limit) {
  auto IsHazardFn = [IsHazard](const MachineInstr &MI) {
    return isSSetReg(MI.getOpcode()) && IsHazard(MI);
  };

  return getWaitStatesSince(IsHazardFn, Limit);
}

//===----------------------------------------------------------------------===//
// No-op Hazard Detection
//===----------------------------------------------------------------------===//

static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV,
                        MCRegister Reg) {
  for (MCRegUnitIterator RUI(Reg, &TRI); RUI.isValid(); ++RUI)
    BV.set(*RUI);
}

static void addRegsToSet(const SIRegisterInfo &TRI,
                         iterator_range<MachineInstr::const_mop_iterator> Ops,
                         BitVector &Set) {
  for (const MachineOperand &Op : Ops) {
    if (Op.isReg())
      addRegUnits(TRI, Set, Op.getReg().asMCReg());
  }
}

void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
  // XXX: Do we need to worry about implicit operands?
  addRegsToSet(TRI, MI.defs(), ClauseDefs);
  addRegsToSet(TRI, MI.uses(), ClauseUses);
}

static bool breaksSMEMSoftClause(MachineInstr *MI) {
  return !SIInstrInfo::isSMRD(*MI);
}

static bool breaksVMEMSoftClause(MachineInstr *MI) {
  return !SIInstrInfo::isVMEM(*MI) && !SIInstrInfo::isFLAT(*MI);
}

int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
  // SMEM soft clauses are only present on VI+, and only matter if xnack is
  // enabled.
  if (!ST.isXNACKEnabled())
    return 0;

  bool IsSMRD = TII.isSMRD(*MEM);

  resetClause();

  // A soft-clause is any group of consecutive SMEM instructions. The
  // instructions in this group may return out of order and/or may be
  // replayed (i.e. the same instruction issued more than once).
  //
  // In order to handle these situations correctly we need to make sure that
  // when a clause has more than one instruction, no instruction in the clause
  // writes to a register that is read by another instruction in the clause
  // (including itself). If we encounter this situation, we need to break the
  // clause by inserting a non-SMEM instruction.

  for (MachineInstr *MI : EmittedInstrs) {
    // When we hit a non-SMEM instruction then we have passed the start of the
    // clause and we can stop.
    if (!MI)
      break;

    if (IsSMRD ? breaksSMEMSoftClause(MI) : breaksVMEMSoftClause(MI))
      break;

    addClauseInst(*MI);
  }

  if (ClauseDefs.none())
    return 0;

  // We need to make sure not to put loads and stores in the same clause if
  // they use the same address. For now, just start a new clause whenever we
  // see a store.
  if (MEM->mayStore())
    return 1;

  addClauseInst(*MEM);

  // If the set of defs and uses intersect then we cannot add this instruction
  // to the clause, so we have a hazard.
  return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
}

int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
  int WaitStatesNeeded = 0;

  WaitStatesNeeded = checkSoftClauseHazards(SMRD);

  // This SMRD hazard only affects SI.
  if (!ST.hasSMRDReadVALUDefHazard())
    return WaitStatesNeeded;

  // A read of an SGPR by an SMRD instruction requires 4 wait states when the
  // SGPR was written by a VALU instruction.
  int SmrdSgprWaitStates = 4;
  auto IsHazardDefFn = [this](const MachineInstr &MI) {
    return TII.isVALU(MI);
  };
  auto IsBufferHazardDefFn = [this](const MachineInstr &MI) {
    return TII.isSALU(MI);
  };

  bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);

  for (const MachineOperand &Use : SMRD->uses()) {
    if (!Use.isReg())
      continue;
    int WaitStatesNeededForUse =
        SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
                                                   SmrdSgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    // This fixes what appears to be undocumented hardware behavior in SI where
    // s_mov writing a descriptor and s_buffer_load_dword reading the
    // descriptor needs some number of nops in between. We don't know how many
    // we need, but let's use 4. This wasn't discovered before probably because
    // the only case when this happens is when we expand a 64-bit pointer into
    // a full descriptor and use s_buffer_load_dword instead of s_load_dword,
    // which was probably never encountered in the closed-source land.
    if (IsBufferSMRD) {
      int WaitStatesNeededForUse =
          SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
                                                     IsBufferHazardDefFn,
                                                     SmrdSgprWaitStates);
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
    }
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
  if (!ST.hasVMEMReadSGPRVALUDefHazard())
    return 0;

  int WaitStatesNeeded = checkSoftClauseHazards(VMEM);

  // A read of an SGPR by a VMEM instruction requires 5 wait states when the
  // SGPR was written by a VALU instruction.
  const int VmemSgprWaitStates = 5;
  auto IsHazardDefFn = [this](const MachineInstr &MI) {
    return TII.isVALU(MI);
  };
  for (const MachineOperand &Use : VMEM->uses()) {
    if (!Use.isReg() || TRI.isVectorRegister(MF.getRegInfo(), Use.getReg()))
      continue;

    int WaitStatesNeededForUse =
        VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
                                                   VmemSgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }
  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const SIInstrInfo *TII = ST.getInstrInfo();

  // Check for DPP VGPR read after VALU VGPR write and EXEC write.
  int DppVgprWaitStates = 2;
  int DppExecWaitStates = 5;
  int WaitStatesNeeded = 0;
  auto IsHazardDefFn = [TII](const MachineInstr &MI) {
    return TII->isVALU(MI);
  };

  for (const MachineOperand &Use : DPP->uses()) {
    if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
      continue;
    int WaitStatesNeededForUse =
        DppVgprWaitStates - getWaitStatesSinceDef(
                                Use.getReg(),
                                [](const MachineInstr &) { return true; },
                                DppVgprWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  WaitStatesNeeded = std::max(
      WaitStatesNeeded,
      DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
                                                DppExecWaitStates));

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
  const SIInstrInfo *TII = ST.getInstrInfo();

  // v_div_fmas requires 4 wait states after a write to vcc from a VALU
  // instruction.
  const int DivFMasWaitStates = 4;
  auto IsHazardDefFn = [TII](const MachineInstr &MI) {
    return TII->isVALU(MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
                                               DivFMasWaitStates);

  return DivFMasWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);

  const int GetRegWaitStates = 2;
  auto IsHazardFn = [TII, GetRegHWReg](const MachineInstr &MI) {
    return GetRegHWReg == getHWReg(TII, MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);

  return GetRegWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned HWReg = getHWReg(TII, *SetRegInstr);

  const int SetRegWaitStates = ST.getSetRegWaitStates();
  auto IsHazardFn = [TII, HWReg](const MachineInstr &MI) {
    return HWReg == getHWReg(TII, MI);
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
  return SetRegWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
  if (!MI.mayStore())
    return -1;

  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned Opcode = MI.getOpcode();
  const MCInstrDesc &Desc = MI.getDesc();

  int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
  int VDataRCID = -1;
  if (VDataIdx != -1)
    VDataRCID = Desc.OpInfo[VDataIdx].RegClass;

  if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
    // There is no hazard if the instruction does not use vector regs
    // (like wbinvl1).
    if (VDataIdx == -1)
      return -1;
    // For MUBUF/MTBUF instructions this hazard only exists if the
    // instruction is not using a register in the soffset field.
    const MachineOperand *SOffset =
        TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
    // If we have no soffset operand, then assume this field has been
    // hardcoded to zero.
    if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
        (!SOffset || !SOffset->isReg()))
      return VDataIdx;
  }

  // MIMG instructions create a hazard if they don't use a 256-bit T# and
  // the store size is greater than 8 bytes and they have more than two bits
  // of their dmask set.
  // All our MIMG definitions use a 256-bit T#, so we can skip checking for
  // them.
  if (TII->isMIMG(MI)) {
    int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
    assert(SRsrcIdx != -1 &&
           AMDGPU::getRegBitWidth(Desc.OpInfo[SRsrcIdx].RegClass) == 256);
    (void)SRsrcIdx;
  }

  if (TII->isFLAT(MI)) {
    int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
    if (AMDGPU::getRegBitWidth(Desc.OpInfo[DataIdx].RegClass) > 64)
      return DataIdx;
  }

  return -1;
}

int
GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
                                            const MachineRegisterInfo &MRI) {
  // Helper to check for the hazard where VMEM instructions that store more
  // than 8 bytes can have their store data overwritten by the next
  // instruction.
  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  const int VALUWaitStates = 1;
  int WaitStatesNeeded = 0;

  if (!TRI->isVectorRegister(MRI, Def.getReg()))
    return WaitStatesNeeded;
  Register Reg = Def.getReg();
  auto IsHazardFn = [this, Reg, TRI](const MachineInstr &MI) {
    int DataIdx = createsVALUHazard(MI);
    return DataIdx >= 0 &&
           TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg);
  };
  int WaitStatesNeededForDef =
      VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
  // This checks for the hazard where VMEM instructions that store more than
  // 8 bytes can have their store data overwritten by the next instruction.
  if (!ST.has12DWordStoreHazard())
    return 0;

  const MachineRegisterInfo &MRI = MF.getRegInfo();
  int WaitStatesNeeded = 0;

  for (const MachineOperand &Def : VALU->defs()) {
    WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
  // This checks for hazards associated with inline asm statements.
  // Since inline asms can contain just about anything, we use this
  // to call/leverage other check*Hazard routines. Note that
  // this function doesn't attempt to address all possible inline asm
  // hazards (good luck), but is a collection of what has been
  // problematic thus far.

  // See checkVALUHazards().
  if (!ST.has12DWordStoreHazard())
    return 0;

  const MachineRegisterInfo &MRI = MF.getRegInfo();
  int WaitStatesNeeded = 0;

  for (unsigned I = InlineAsm::MIOp_FirstOperand, E = IA->getNumOperands();
       I != E; ++I) {
    const MachineOperand &Op = IA->getOperand(I);
    if (Op.isReg() && Op.isDef()) {
      WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
    }
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const MachineRegisterInfo &MRI = MF.getRegInfo();

  const MachineOperand *LaneSelectOp =
      TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);

  if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
    return 0;

  Register LaneSelectReg = LaneSelectOp->getReg();
  auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVALU(MI); };

  const int RWLaneWaitStates = 4;
  int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
                                              RWLaneWaitStates);
  return RWLaneWaitStates - WaitStatesSince;
}

int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
  if (!ST.hasRFEHazards())
    return 0;

  const SIInstrInfo *TII = ST.getInstrInfo();

  const int RFEWaitStates = 1;

  auto IsHazardFn = [TII](const MachineInstr &MI) {
    return getHWReg(TII, MI) == AMDGPU::Hwreg::ID_TRAPSTS;
  };
  int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
  return RFEWaitStates - WaitStatesNeeded;
}

int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
  const SIInstrInfo *TII = ST.getInstrInfo();
  const int SMovRelWaitStates = 1;
  auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isSALU(MI); };
  return SMovRelWaitStates - getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn,
                                                   SMovRelWaitStates);
}

void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
  fixVMEMtoScalarWriteHazards(MI);
  fixVcmpxPermlaneHazards(MI);
  fixSMEMtoVectorWriteHazards(MI);
  fixVcmpxExecWARHazard(MI);
  fixLdsBranchVmemWARHazard(MI);
}

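// On subtargets with the vcmpx-permlane hazard, a VOPC instruction followed
// by a v_permlane16/v_permlanex16 requires an intervening VALU instruction
// other than v_nop. An illustrative bad sequence:
//   v_cmpx_...            ; VOPC
//   v_permlane16_b32 ...  ; hazard
// The fix below inserts a v_mov_b32 of the permlane's src0, which is always
// an available VGPR.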
bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
  if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  auto IsHazardFn = [TII](const MachineInstr &MI) { return TII->isVOPC(MI); };

  auto IsExpiredFn = [](const MachineInstr &MI, int) {
    unsigned Opc = MI.getOpcode();
    return SIInstrInfo::isVALU(MI) && Opc != AMDGPU::V_NOP_e32 &&
           Opc != AMDGPU::V_NOP_e64 && Opc != AMDGPU::V_NOP_sdwa;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  // V_NOP will be discarded by SQ.
  // Use V_MOV_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
  // which is always a VGPR and available.
  auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
  Register Reg = Src0->getReg();
  bool IsUndef = Src0->isUndef();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::V_MOV_B32_e32))
      .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0))
      .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill);

  return true;
}

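// An SALU or SMRD write to a register still being read by an in-flight VMEM,
// DS, or FLAT access is a WAR hazard. Any VALU instruction or a suitable
// s_waitcnt breaks it; otherwise the code below inserts an s_waitcnt_depctr
// with immediate 0xffe3.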
bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
  if (!ST.hasVMEMtoScalarWriteHazard())
    return false;

  if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI))
    return false;

  if (MI->getNumDefs() == 0)
    return false;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();

  auto IsHazardFn = [TRI, MI](const MachineInstr &I) {
    if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isDS(I) &&
        !SIInstrInfo::isFLAT(I))
      return false;

    for (const MachineOperand &Def : MI->defs()) {
      const MachineOperand *Op =
          I.findRegisterUseOperand(Def.getReg(), false, TRI);
      if (!Op)
        continue;
      return true;
    }
    return false;
  };

  auto IsExpiredFn = [](const MachineInstr &MI, int) {
    return SIInstrInfo::isVALU(MI) ||
           (MI.getOpcode() == AMDGPU::S_WAITCNT &&
            !MI.getOperand(0).getImm()) ||
           (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
            MI.getOperand(0).getImm() == 0xffe3);
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(0xffe3);
  return true;
}

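// A VALU write to an SGPR that an earlier SMEM instruction is still reading
// is a WAR hazard. A suitable intervening SALU instruction or an s_waitcnt
// that zeroes lgkmcnt mitigates it; failing that, the code below inserts
// "s_mov_b32 null, 0".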
bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
  if (!ST.hasSMEMtoVectorWriteHazard())
    return false;

  if (!SIInstrInfo::isVALU(*MI))
    return false;

  unsigned SDSTName;
  switch (MI->getOpcode()) {
  case AMDGPU::V_READLANE_B32:
  case AMDGPU::V_READFIRSTLANE_B32:
    SDSTName = AMDGPU::OpName::vdst;
    break;
  default:
    SDSTName = AMDGPU::OpName::sdst;
    break;
  }

  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
  const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
  if (!SDST) {
    for (const auto &MO : MI->implicit_operands()) {
      if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg()))) {
        SDST = &MO;
        break;
      }
    }
  }

  if (!SDST)
    return false;

  const Register SDSTReg = SDST->getReg();
  auto IsHazardFn = [SDSTReg, TRI](const MachineInstr &I) {
    return SIInstrInfo::isSMRD(I) && I.readsRegister(SDSTReg, TRI);
  };

  auto IsExpiredFn = [TII, IV](const MachineInstr &MI, int) {
    if (TII->isSALU(MI)) {
      switch (MI.getOpcode()) {
      case AMDGPU::S_SETVSKIP:
      case AMDGPU::S_VERSION:
      case AMDGPU::S_WAITCNT_VSCNT:
      case AMDGPU::S_WAITCNT_VMCNT:
      case AMDGPU::S_WAITCNT_EXPCNT:
        // These instructions cannot mitigate the hazard.
        return false;
      case AMDGPU::S_WAITCNT_LGKMCNT:
        // Reducing lgkmcnt count to 0 always mitigates the hazard.
        return (MI.getOperand(1).getImm() == 0) &&
               (MI.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
      case AMDGPU::S_WAITCNT: {
        const int64_t Imm = MI.getOperand(0).getImm();
        AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
        return (Decoded.LgkmCnt == 0);
      }
      default:
        // SOPP instructions cannot mitigate the hazard.
        if (TII->isSOPP(MI))
          return false;
        // At this point the SALU can be assumed to mitigate the hazard
        // because either:
        // (a) it is independent of the at risk SMEM (breaking chain),
        // or
        // (b) it is dependent on the SMEM, in which case an appropriate
        //     s_waitcnt lgkmcnt _must_ exist between it and the at risk
        //     SMEM instruction.
        return true;
      }
    }
    return false;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
      .addImm(0);
  return true;
}

bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
  if (!ST.hasVcmpxExecWARHazard() || !SIInstrInfo::isVALU(*MI))
    return false;

  const SIRegisterInfo *TRI = ST.getRegisterInfo();
  if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
    return false;

  auto IsHazardFn = [TRI](const MachineInstr &I) {
    if (SIInstrInfo::isVALU(I))
      return false;
    return I.readsRegister(AMDGPU::EXEC, TRI);
  };

  const SIInstrInfo *TII = ST.getInstrInfo();
  auto IsExpiredFn = [TII, TRI](const MachineInstr &MI, int) {
    if (SIInstrInfo::isVALU(MI)) {
      if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst))
        return true;
      for (auto MO : MI.implicit_operands())
        if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg())))
          return true;
    }
    if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
        (MI.getOperand(0).getImm() & 0xfffe) == 0xfffe)
      return true;
    return false;
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_DEPCTR))
      .addImm(0xfffe);
  return true;
}

static bool shouldRunLdsBranchVmemWARHazardFixup(const MachineFunction &MF,
                                                 const GCNSubtarget &ST) {
  if (!ST.hasLdsBranchVmemWARHazard())
    return false;

  // Check if the necessary condition for the hazard is met: both LDS and VMEM
  // instructions need to appear in the same function.
  bool HasLds = false;
  bool HasVmem = false;
  for (auto &MBB : MF) {
    for (auto &MI : MBB) {
      HasLds |= SIInstrInfo::isDS(MI);
      HasVmem |=
          SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI);
      if (HasLds && HasVmem)
        return true;
    }
  }
  return false;
}

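// The LDS-branch-VMEM WAR hazard arises when a branch separates an LDS access
// from a VMEM access (or vice versa) with no intervening
// "s_waitcnt_vscnt null, 0"; the nested scans below look back across the
// branch for an access of the opposite kind.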
bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
  if (!RunLdsBranchVmemWARHazardFixup)
    return false;

  assert(ST.hasLdsBranchVmemWARHazard());

  auto IsHazardInst = [](const MachineInstr &MI) {
    if (SIInstrInfo::isDS(MI))
      return 1;
    if (SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI))
      return 2;
    return 0;
  };

  auto InstType = IsHazardInst(*MI);
  if (!InstType)
    return false;

  auto IsExpiredFn = [&IsHazardInst](const MachineInstr &I, int) {
    return IsHazardInst(I) || (I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
                               I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
                               !I.getOperand(1).getImm());
  };

  auto IsHazardFn = [InstType, &IsHazardInst](const MachineInstr &I) {
    if (!I.isBranch())
      return false;

    auto IsHazardFn = [InstType, IsHazardInst](const MachineInstr &I) {
      auto InstType2 = IsHazardInst(I);
      return InstType2 && InstType != InstType2;
    };

    auto IsExpiredFn = [InstType, &IsHazardInst](const MachineInstr &I, int) {
      auto InstType2 = IsHazardInst(I);
      if (InstType == InstType2)
        return true;

      return I.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
             I.getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
             !I.getOperand(1).getImm();
    };

    return ::getWaitStatesSince(IsHazardFn, &I, IsExpiredFn) !=
           std::numeric_limits<int>::max();
  };

  if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
      std::numeric_limits<int>::max())
    return false;

  const SIInstrInfo *TII = ST.getInstrInfo();
  BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
          TII->get(AMDGPU::S_WAITCNT_VSCNT))
      .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
      .addImm(0);

  return true;
}

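// On subtargets with the NSA-to-VMEM bug, a MUBUF/MTBUF access whose offset
// has bit 1 or 2 set requires one wait state after a gfx10 NSA-encoded MIMG
// instruction of at least 16 bytes.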
int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
  int NSAtoVMEMWaitStates = 1;

  if (!ST.hasNSAtoVMEMBug())
    return 0;

  if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isMTBUF(*MI))
    return 0;

  const SIInstrInfo *TII = ST.getInstrInfo();
  const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
  if (!Offset || (Offset->getImm() & 6) == 0)
    return 0;

  auto IsHazardFn = [TII](const MachineInstr &I) {
    if (!SIInstrInfo::isMIMG(I))
      return false;
    const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I.getOpcode());
    return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
           TII->getInstSizeInBytes(I) >= 16;
  };

  return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
}

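// An s_denorm_mode issued within three wait states of an FP atomic VMEM or
// FLAT access needs the gap padded, unless a VALU or one of the wait
// instructions listed below intervenes.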
int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
  int FPAtomicToDenormModeWaitStates = 3;

  if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
    return 0;

  auto IsHazardFn = [](const MachineInstr &I) {
    if (!SIInstrInfo::isVMEM(I) && !SIInstrInfo::isFLAT(I))
      return false;
    return SIInstrInfo::isFPAtomic(I);
  };

  auto IsExpiredFn = [](const MachineInstr &MI, int WaitStates) {
    if (WaitStates >= 3 || SIInstrInfo::isVALU(MI))
      return true;

    switch (MI.getOpcode()) {
    case AMDGPU::S_WAITCNT:
    case AMDGPU::S_WAITCNT_VSCNT:
    case AMDGPU::S_WAITCNT_VMCNT:
    case AMDGPU::S_WAITCNT_EXPCNT:
    case AMDGPU::S_WAITCNT_LGKMCNT:
    case AMDGPU::S_WAIT_IDLE:
      return true;
    default:
      break;
    }

    return false;
  };

  return FPAtomicToDenormModeWaitStates -
         ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
}

int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
  assert(SIInstrInfo::isMAI(*MI));

  return ST.hasGFX90AInsts() ? checkMAIHazards90A(MI) : checkMAIHazards908(MI);
}

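// gfx908: an MFMA result written to AGPRs needs a latency-dependent number of
// wait states before it can be consumed. The tables below are keyed on the
// producer's scheduling latency; judging by the constant names, the 2/8/16
// cycle cases correspond to the 4x4/16x16/32x32 MFMA shapes.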
int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) {
  int WaitStatesNeeded = 0;
  unsigned Opc = MI->getOpcode();

  auto IsVALUFn = [](const MachineInstr &MI) {
    return SIInstrInfo::isVALU(MI);
  };

  if (Opc != AMDGPU::V_ACCVGPR_READ_B32_e64) { // MFMA or v_accvgpr_write
    const int LegacyVALUWritesVGPRWaitStates = 2;
    const int VALUWritesExecWaitStates = 4;
    const int MaxWaitStates = 4;

    int WaitStatesNeededForUse = VALUWritesExecWaitStates -
      getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded < MaxWaitStates) {
      for (const MachineOperand &Use : MI->explicit_uses()) {
        const int MaxWaitStates = 2;

        if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
          continue;

        int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
          getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
        WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

        if (WaitStatesNeeded == MaxWaitStates)
          break;
      }
    }
  }

  auto IsMFMAFn = [](const MachineInstr &MI) {
    return SIInstrInfo::isMAI(MI) &&
           MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
           MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64;
  };

  for (const MachineOperand &Op : MI->explicit_operands()) {
    if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
      continue;

    if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
      continue;

    const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
    const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
    const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
    const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
    const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
    const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
    const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
    const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
    const int MaxWaitStates = 18;
    Register Reg = Op.getReg();
    unsigned HazardDefLatency = 0;

    auto IsOverlappedMFMAFn = [Reg, &IsMFMAFn, &HazardDefLatency,
                               this](const MachineInstr &MI) {
      if (!IsMFMAFn(MI))
        return false;
      Register DstReg = MI.getOperand(0).getReg();
      if (DstReg == Reg)
        return false;
      HazardDefLatency =
          std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
      return TRI.regsOverlap(DstReg, Reg);
    };

    int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
                                                   MaxWaitStates);
    int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
    int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
    int OpNo = MI->getOperandNo(&Op);
    if (OpNo == SrcCIdx) {
      NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
    } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
      switch (HazardDefLatency) {
      case 2:
        NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
        break;
      case 8:
        NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
        break;
      case 16: LLVM_FALLTHROUGH;
      default:
        NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
        break;
      }
    } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
      switch (HazardDefLatency) {
      case 2:
        NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
        break;
      case 8:
        NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
        break;
      case 16: LLVM_FALLTHROUGH;
      default:
        NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
        break;
      }
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded; // Early exit.

    auto IsAccVgprWriteFn = [Reg, this](const MachineInstr &MI) {
      if (MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
        return false;
      Register DstReg = MI.getOperand(0).getReg();
      return TRI.regsOverlap(Reg, DstReg);
    };

    const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
    const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
    const int AccVGPRWriteAccVgprReadWaitStates = 3;
    NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
    if (OpNo == SrcCIdx)
      NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
    else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64)
      NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;

    WaitStatesNeededForUse = NeedWaitStates -
      getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded; // Early exit.
  }

  if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
    const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
    const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
    const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
    const int MaxWaitStates = 13;
    Register DstReg = MI->getOperand(0).getReg();
    unsigned HazardDefLatency = 0;

    auto IsSrcCMFMAFn = [DstReg, &IsMFMAFn, &HazardDefLatency,
                         this](const MachineInstr &MI) {
      if (!IsMFMAFn(MI))
        return false;
      Register Reg = TII.getNamedOperand(MI, AMDGPU::OpName::src2)->getReg();
      HazardDefLatency =
          std::max(HazardDefLatency, TSchedModel.computeInstrLatency(&MI));
      return TRI.regsOverlap(Reg, DstReg);
    };

    int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
    int NeedWaitStates;
    switch (HazardDefLatency) {
    case 2:
      NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
      break;
    case 8:
      NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
      break;
    case 16: LLVM_FALLTHROUGH;
    default:
      NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
      break;
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
  int WaitStatesNeeded = 0;
  unsigned Opc = MI->getOpcode();

  auto IsMFMAFn = [](const MachineInstr &MI) {
    return SIInstrInfo::isMAI(MI) &&
           MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
           MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64;
  };

  auto IsLegacyVALUFn = [&IsMFMAFn](const MachineInstr &MI) {
    return SIInstrInfo::isVALU(MI) && !IsMFMAFn(MI);
  };

  auto IsLegacyVALUNotDotFn = [&IsMFMAFn](const MachineInstr &MI) {
    return SIInstrInfo::isVALU(MI) && !IsMFMAFn(MI) && !SIInstrInfo::isDOT(MI);
  };

  if (!IsMFMAFn(*MI))
    return WaitStatesNeeded;

  const int VALUWritesExecWaitStates = 4;
  int WaitStatesNeededForUse = VALUWritesExecWaitStates -
    getWaitStatesSinceDef(AMDGPU::EXEC, IsLegacyVALUFn,
                          VALUWritesExecWaitStates);
  WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

  int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);

  // Loop for both DGEMM and S/HGEMM 2nd instruction.
  for (const MachineOperand &Use : MI->explicit_uses()) {
    const int LegacyVALUNotDotWritesVGPRWaitStates = 2;
    const int SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates = 2;
    const int SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates = 8;
    const int SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates = 16;
    const int SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates = 3;
    const int SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates = 9;
    const int SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates = 17;
    const int DMFMA16x16WritesVGPROverlappedSrcCWaitStates = 9;
    const int DMFMA4x4WritesVGPROverlappedSrcCWaitStates = 4;
    const int SMFMA4x4WritesVGPROverlappedSrcABWaitStates = 5;
    const int SMFMA16x16WritesVGPROverlappedSrcABWaitStates = 11;
    const int SMFMA32x32WritesVGPROverlappedSrcABWaitStates = 19;
    const int DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates = 6;
    const int DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates = 11;
    const int DMFMA4x4WritesVGPRFullSrcCWaitStates = 4;
    const int MaxWaitStates = 19;

    if (!Use.isReg())
      continue;
    unsigned Reg = Use.getReg();
    bool FullReg;
    const MachineInstr *MI1;

    auto IsOverlappedDGEMMorXDLFn = [Reg, &IsMFMAFn, &FullReg, &MI1,
                                     this](const MachineInstr &MI) {
      if (!IsMFMAFn(MI))
        return false;
      if (!isDGEMM(MI.getOpcode()) && !isXDL(ST, MI))
        return false;
      Register DstReg = MI.getOperand(0).getReg();
      FullReg = (DstReg == Reg);
      MI1 = &MI;
      return TRI.regsOverlap(DstReg, Reg);
    };

    WaitStatesNeededForUse = LegacyVALUNotDotWritesVGPRWaitStates -
      getWaitStatesSinceDef(Reg, IsLegacyVALUNotDotFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    int NumWaitStates = getWaitStatesSinceDef(Reg, IsOverlappedDGEMMorXDLFn,
                                              MaxWaitStates);
    if (NumWaitStates == std::numeric_limits<int>::max())
      continue;

    int OpNo = MI->getOperandNo(&Use);
    unsigned Opc1 = MI1->getOpcode();
    int NeedWaitStates = 0;
    if (OpNo == SrcCIdx) {
      if (!isDGEMM(Opc) && isDGEMM(Opc1)) {
        NeedWaitStates = 0;
      } else if (FullReg) {
        if ((Opc == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
             Opc == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64) &&
            (Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_e64 ||
             Opc1 == AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64))
          NeedWaitStates = DMFMA4x4WritesVGPRFullSrcCWaitStates;
      } else {
        switch (Opc1) {
        case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
        case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
          if (!isXDL(ST, *MI))
            NeedWaitStates = DMFMA16x16WritesVGPROverlappedSrcCWaitStates;
          break;
        case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
        case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
          if (!isXDL(ST, *MI))
            NeedWaitStates = DMFMA4x4WritesVGPROverlappedSrcCWaitStates;
          break;
        default:
          switch (TSchedModel.computeInstrLatency(MI1)) {
          case 2:
            NeedWaitStates = isDGEMM(Opc)
              ? SMFMA4x4WritesVGPROverlappedDMFMASrcCWaitStates
              : SMFMA4x4WritesVGPROverlappedSMFMASrcCWaitStates;
            break;
          case 8:
            NeedWaitStates = isDGEMM(Opc)
              ? SMFMA16x16WritesVGPROverlappedDMFMASrcCWaitStates
              : SMFMA16x16WritesVGPROverlappedSMFMASrcCWaitStates;
            break;
          case 16: LLVM_FALLTHROUGH;
          default:
            NeedWaitStates = isDGEMM(Opc)
              ? SMFMA32x32WritesVGPROverlappedDMFMASrcCWaitStates
              : SMFMA32x32WritesVGPROverlappedSMFMASrcCWaitStates;
          }
        }
      }
    } else {
      switch (Opc1) {
      case AMDGPU::V_MFMA_F64_16X16X4F64_e64:
      case AMDGPU::V_MFMA_F64_16X16X4F64_vgprcd_e64:
        NeedWaitStates = DMFMA16x16WritesVGPROverlappedMFMASrcABWaitStates;
        break;
      case AMDGPU::V_MFMA_F64_4X4X4F64_e64:
      case AMDGPU::V_MFMA_F64_4X4X4F64_vgprcd_e64:
        NeedWaitStates = DMFMA4x4WritesVGPROverlappedMFMASrcABWaitStates;
        break;
      default:
        switch (TSchedModel.computeInstrLatency(MI1)) {
        case 2:
          NeedWaitStates = SMFMA4x4WritesVGPROverlappedSrcABWaitStates;
          break;
        case 8:
          NeedWaitStates = SMFMA16x16WritesVGPROverlappedSrcABWaitStates;
          break;
        case 16: LLVM_FALLTHROUGH;
        default:
          NeedWaitStates = SMFMA32x32WritesVGPROverlappedSrcABWaitStates;
        }
      }
    }
    if (WaitStatesNeeded >= NeedWaitStates)
      continue;

    WaitStatesNeededForUse = NeedWaitStates - NumWaitStates;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      break;
  }

  return WaitStatesNeeded;
}

int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
  // On gfx90a+ the relevant hazards are checked in checkMAIVALUHazards().
  if (!ST.hasMAIInsts() || ST.hasGFX90AInsts())
    return 0;

  int WaitStatesNeeded = 0;

  auto IsAccVgprReadFn = [](const MachineInstr &MI) {
    return MI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64;
  };

  for (const MachineOperand &Op : MI->explicit_uses()) {
    if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
      continue;

    Register Reg = Op.getReg();

    const int AccVgprReadLdStWaitStates = 2;
    const int VALUWriteAccVgprRdWrLdStDepVALUWaitStates = 1;
    const int MaxWaitStates = 2;

    int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
      getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

    if (WaitStatesNeeded == MaxWaitStates)
      return WaitStatesNeeded; // Early exit.

    auto IsVALUAccVgprRdWrCheckFn = [Reg, this](const MachineInstr &MI) {
      if (MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64 &&
          MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
        return false;
      auto IsVALUFn = [](const MachineInstr &MI) {
        return SIInstrInfo::isVALU(MI) && !SIInstrInfo::isMAI(MI);
      };
      return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
             std::numeric_limits<int>::max();
    };

    WaitStatesNeededForUse = VALUWriteAccVgprRdWrLdStDepVALUWaitStates -
      getWaitStatesSince(IsVALUAccVgprRdWrCheckFn, MaxWaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  return WaitStatesNeeded;
}
1597
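// Check gfx90a+ hazards between MFMA/DOT instructions and ordinary VALU,
// VMEM, FLAT, DS and EXP instructions: VGPR reads shortly after a DOT or a
// DGEMM/XDL MFMA write (RAW), double-precision FMAs closely following a
// DGEMM, and VGPR writes conflicting with an earlier MFMA write (WAW) or
// with an SMFMA still reading its SrcC operand (WAR).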
int GCNHazardRecognizer::checkMAIVALUHazards(MachineInstr *MI) {
  if (!ST.hasGFX90AInsts())
    return 0;

  auto IsMFMAFn = [](const MachineInstr &MI) -> bool {
    return SIInstrInfo::isMAI(MI) &&
           MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
           MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64;
  };

  auto IsDGEMMFn = [](const MachineInstr &MI) -> bool {
    return isDGEMM(MI.getOpcode());
  };

  // This is checked in checkMAIHazards90A().
  if (IsMFMAFn(*MI))
    return 0;

  int WaitStatesNeeded = 0;

  bool IsMemOrExport = SIInstrInfo::isVMEM(*MI) ||
                       SIInstrInfo::isFLAT(*MI) ||
                       SIInstrInfo::isDS(*MI) ||
                       SIInstrInfo::isEXP(*MI);
  bool IsVALU = SIInstrInfo::isVALU(*MI);

  const MachineInstr *MFMA = nullptr;
  unsigned Reg;
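  // Matches a DGEMM or XDL MFMA whose dst overlaps Reg; the match is
  // remembered in MFMA so its latency can be queried afterwards.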
  auto IsDGEMMorXDLWriteFn = [&Reg, &IsMFMAFn, &MFMA,
                              this](const MachineInstr &MI) {
    if (!IsMFMAFn(MI) || !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
      return false;
    if (!isDGEMM(MI.getOpcode()) && !isXDL(ST, MI))
      return false;
    MFMA = &MI;
    return true;
  };

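  // Matches a DOT instruction whose dst overlaps Reg; remembered in DOT.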
  const MachineInstr *DOT = nullptr;
  auto IsDotWriteFn = [&Reg, &DOT, this](const MachineInstr &MI) {
    if (!SIInstrInfo::isDOT(MI) ||
        !TRI.regsOverlap(MI.getOperand(0).getReg(), Reg))
      return false;
    DOT = &MI;
    return true;
  };

  int SrcCIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                                           AMDGPU::OpName::src2);

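  // Check RAW hazards: this VALU, VMEM, FLAT, DS or EXP instruction must not
  // read a VGPR too soon after a DOT or a DGEMM/XDL MFMA has written it.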
  if (IsMemOrExport || IsVALU) {
    const int SMFMA4x4WriteVgprVALUMemExpReadWaitStates = 5;
    const int SMFMA16x16WriteVgprVALUMemExpReadWaitStates = 11;
    const int SMFMA32x32WriteVgprVALUMemExpReadWaitStates = 19;
    const int DMFMA4x4WriteVgprMemExpReadWaitStates = 9;
    const int DMFMA16x16WriteVgprMemExpReadWaitStates = 18;
    const int DMFMA4x4WriteVgprVALUReadWaitStates = 6;
    const int DMFMA16x16WriteVgprVALUReadWaitStates = 11;
    const int DotWriteSameDotReadSrcAB = 3;
    const int DotWriteDifferentVALURead = 3;
    const int MaxWaitStates = 19;

    for (const MachineOperand &Use : MI->explicit_uses()) {
      if (!Use.isReg())
        continue;
      Reg = Use.getReg();

      DOT = nullptr;
      int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
                                                     MaxWaitStates);
      if (DOT) {
        int NeedWaitStates = 0;
        if (DOT->getOpcode() == MI->getOpcode()) {
          if (&Use - &MI->getOperand(0) != SrcCIdx)
            NeedWaitStates = DotWriteSameDotReadSrcAB;
        } else {
          NeedWaitStates = DotWriteDifferentVALURead;
        }

        int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
        WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
      }

      MFMA = nullptr;
      WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDGEMMorXDLWriteFn,
                                                 MaxWaitStates);
      if (!MFMA)
        continue;

      unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
      int NeedWaitStates = MaxWaitStates;
      switch (HazardDefLatency) {
      case 2:
        NeedWaitStates = SMFMA4x4WriteVgprVALUMemExpReadWaitStates;
        break;
      case 4:
        assert(isDGEMM(MFMA->getOpcode()));
        NeedWaitStates =
            IsMemOrExport ? DMFMA4x4WriteVgprMemExpReadWaitStates
                          : DMFMA4x4WriteVgprVALUReadWaitStates;
        break;
      case 8:
        NeedWaitStates = SMFMA16x16WriteVgprVALUMemExpReadWaitStates;
        break;
      case 16: LLVM_FALLTHROUGH;
      default:
        NeedWaitStates =
            isDGEMM(MFMA->getOpcode())
                ? IsMemOrExport ? DMFMA16x16WriteVgprMemExpReadWaitStates
                                : DMFMA16x16WriteVgprVALUReadWaitStates
                : SMFMA32x32WriteVgprVALUMemExpReadWaitStates;
        break;
      }

      int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

      if (WaitStatesNeeded == MaxWaitStates)
        break;
    }
  }

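  // A double-precision V_FMA/V_FMAC issued too close after a DGEMM also
  // needs wait states.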
  unsigned Opc = MI->getOpcode();
  const int DMFMAToFMA64WaitStates = 2;
  if ((Opc == AMDGPU::V_FMA_F64_e64 ||
       Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64 ||
       Opc == AMDGPU::V_FMAC_F64_dpp) &&
      WaitStatesNeeded < DMFMAToFMA64WaitStates) {
    int WaitStatesNeededForUse = DMFMAToFMA64WaitStates -
      getWaitStatesSince(IsDGEMMFn, DMFMAToFMA64WaitStates);
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  if (!IsVALU && !IsMemOrExport)
    return WaitStatesNeeded;

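  // Check WAW and WAR hazards on the instruction's VGPR defs: the def must
  // not land too soon after an MFMA or DOT write (WAW), nor overwrite the
  // SrcC operand of an SMFMA that is still reading it (WAR).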
  for (const MachineOperand &Def : MI->defs()) {
    const int SMFMA4x4WriteVgprVALUWawWaitStates = 5;
    const int SMFMA16x16WriteVgprVALUWawWaitStates = 11;
    const int SMFMA32x32WriteVgprVALUWawWaitStates = 19;
    const int SMFMA4x4ReadVgprVALUWarWaitStates = 1;
    const int SMFMA16x16ReadVgprVALUWarWaitStates = 7;
    const int SMFMA32x32ReadVgprVALUWarWaitStates = 15;
    const int DMFMA4x4WriteVgprVALUWriteWaitStates = 6;
    const int DMFMA16x16WriteVgprVALUWriteWaitStates = 11;
    const int DotWriteDifferentVALUWrite = 3;
    const int MaxWaitStates = 19;
    const int MaxWarWaitStates = 15;

    Reg = Def.getReg();

    DOT = nullptr;
    int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDotWriteFn,
                                                   MaxWaitStates);
    if (DOT && DOT->getOpcode() != MI->getOpcode())
      WaitStatesNeeded = std::max(WaitStatesNeeded, DotWriteDifferentVALUWrite -
                                                    WaitStatesSinceDef);

    MFMA = nullptr;
    WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsDGEMMorXDLWriteFn,
                                               MaxWaitStates);
    if (MFMA) {
      int NeedWaitStates = MaxWaitStates;
      switch (TSchedModel.computeInstrLatency(MFMA)) {
      case 2:
        NeedWaitStates = SMFMA4x4WriteVgprVALUWawWaitStates;
        break;
      case 4:
        assert(isDGEMM(MFMA->getOpcode()));
        NeedWaitStates = DMFMA4x4WriteVgprVALUWriteWaitStates;
        break;
      case 8:
        NeedWaitStates = SMFMA16x16WriteVgprVALUWawWaitStates;
        break;
      case 16: LLVM_FALLTHROUGH;
      default:
        NeedWaitStates = isDGEMM(MFMA->getOpcode())
                             ? DMFMA16x16WriteVgprVALUWriteWaitStates
                             : SMFMA32x32WriteVgprVALUWawWaitStates;
        break;
      }

      int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
      WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);

      if (WaitStatesNeeded == MaxWaitStates)
        break;
    }

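    // Matches an SMFMA that reads Reg through its SrcC operand; remembered
    // in MFMA for the latency query below.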
    auto IsSMFMAReadAsCFn = [&Reg, &IsMFMAFn, &MFMA,
                             this](const MachineInstr &MI) {
      if (!IsMFMAFn(MI) || isDGEMM(MI.getOpcode()) ||
          !MI.readsRegister(Reg, &TRI))
        return false;

      const MachineOperand *SrcC =
          TII.getNamedOperand(MI, AMDGPU::OpName::src2);
      assert(SrcC);
      if (!SrcC->isReg() || !TRI.regsOverlap(SrcC->getReg(), Reg))
        return false;

      MFMA = &MI;
      return true;
    };

    MFMA = nullptr;
    int WaitStatesSinceUse = getWaitStatesSince(IsSMFMAReadAsCFn,
                                                MaxWarWaitStates);
    if (!MFMA)
      continue;

    unsigned HazardDefLatency = TSchedModel.computeInstrLatency(MFMA);
    int NeedWaitStates = MaxWaitStates;
    switch (HazardDefLatency) {
    case 2:
      NeedWaitStates = SMFMA4x4ReadVgprVALUWarWaitStates;
      break;
    case 8:
      NeedWaitStates = SMFMA16x16ReadVgprVALUWarWaitStates;
      break;
    case 16: LLVM_FALLTHROUGH;
    default:
      NeedWaitStates = SMFMA32x32ReadVgprVALUWarWaitStates;
      break;
    }

    int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceUse;
    WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
  }

  return WaitStatesNeeded;
}

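// Advise the scheduler to pick another candidate if issuing this MFMA now
// would place it within the completion latency of a previously issued MFMA.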
bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
  if (!SU->isInstr())
    return false;

  const MachineInstr *MAI = nullptr;
  auto IsMFMAFn = [&MAI](const MachineInstr &MI) {
    MAI = nullptr;
    if (SIInstrInfo::isMAI(MI) &&
        MI.getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 &&
        MI.getOpcode() != AMDGPU::V_ACCVGPR_READ_B32_e64)
      MAI = &MI;
    return MAI != nullptr;
  };

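  // If the previous MFMA was issued fewer wait states ago than its own
  // latency, it is still in flight; prefer scheduling something else over
  // this MFMA.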
  MachineInstr *MI = SU->getInstr();
  if (IsMFMAFn(*MI)) {
    int W = getWaitStatesSince(IsMFMAFn, 16);
    if (MAI)
      return W < (int)TSchedModel.computeInstrLatency(MAI);
  }

  return false;
}
