1 //===-- GCNHazardRecognizers.cpp - GCN Hazard Recognizer Impls ------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This file implements hazard recognizers for scheduling on GCN processors.
10 //
11 //===----------------------------------------------------------------------===//
12
13 #include "GCNHazardRecognizer.h"
14 #include "AMDGPUSubtarget.h"
15 #include "SIDefines.h"
16 #include "SIInstrInfo.h"
17 #include "SIRegisterInfo.h"
18 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
19 #include "Utils/AMDGPUBaseInfo.h"
20 #include "llvm/ADT/iterator_range.h"
21 #include "llvm/CodeGen/MachineFunction.h"
22 #include "llvm/CodeGen/MachineInstr.h"
23 #include "llvm/CodeGen/MachineInstrBuilder.h"
24 #include "llvm/CodeGen/MachineOperand.h"
25 #include "llvm/CodeGen/ScheduleDAG.h"
26 #include "llvm/MC/MCInstrDesc.h"
27 #include "llvm/Support/ErrorHandling.h"
28 #include <algorithm>
29 #include <cassert>
30 #include <limits>
31 #include <set>
32 #include <vector>
33
34 using namespace llvm;
35
36 //===----------------------------------------------------------------------===//
// Hazard Recognizer Implementation
38 //===----------------------------------------------------------------------===//
39
GCNHazardRecognizer(const MachineFunction & MF)40 GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) :
41 IsHazardRecognizerMode(false),
42 CurrCycleInstr(nullptr),
43 MF(MF),
44 ST(MF.getSubtarget<GCNSubtarget>()),
45 TII(*ST.getInstrInfo()),
46 TRI(TII.getRegisterInfo()),
47 ClauseUses(TRI.getNumRegUnits()),
48 ClauseDefs(TRI.getNumRegUnits()) {
49 MaxLookAhead = MF.getRegInfo().isPhysRegUsed(AMDGPU::AGPR0) ? 18 : 5;
50 TSchedModel.init(&ST);
51 }
52
EmitInstruction(SUnit * SU)53 void GCNHazardRecognizer::EmitInstruction(SUnit *SU) {
54 EmitInstruction(SU->getInstr());
55 }
56
EmitInstruction(MachineInstr * MI)57 void GCNHazardRecognizer::EmitInstruction(MachineInstr *MI) {
58 CurrCycleInstr = MI;
59 }
60
isDivFMas(unsigned Opcode)61 static bool isDivFMas(unsigned Opcode) {
62 return Opcode == AMDGPU::V_DIV_FMAS_F32 || Opcode == AMDGPU::V_DIV_FMAS_F64;
63 }
64
isSGetReg(unsigned Opcode)65 static bool isSGetReg(unsigned Opcode) {
66 return Opcode == AMDGPU::S_GETREG_B32;
67 }
68
isSSetReg(unsigned Opcode)69 static bool isSSetReg(unsigned Opcode) {
70 return Opcode == AMDGPU::S_SETREG_B32 || Opcode == AMDGPU::S_SETREG_IMM32_B32;
71 }
72
isRWLane(unsigned Opcode)73 static bool isRWLane(unsigned Opcode) {
74 return Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32;
75 }
76
isRFE(unsigned Opcode)77 static bool isRFE(unsigned Opcode) {
78 return Opcode == AMDGPU::S_RFE_B64;
79 }
80
isSMovRel(unsigned Opcode)81 static bool isSMovRel(unsigned Opcode) {
82 switch (Opcode) {
83 case AMDGPU::S_MOVRELS_B32:
84 case AMDGPU::S_MOVRELS_B64:
85 case AMDGPU::S_MOVRELD_B32:
86 case AMDGPU::S_MOVRELD_B64:
87 return true;
88 default:
89 return false;
90 }
91 }
92
isSendMsgTraceDataOrGDS(const SIInstrInfo & TII,const MachineInstr & MI)93 static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
94 const MachineInstr &MI) {
95 if (TII.isAlwaysGDS(MI.getOpcode()))
96 return true;
97
98 switch (MI.getOpcode()) {
99 case AMDGPU::S_SENDMSG:
100 case AMDGPU::S_SENDMSGHALT:
101 case AMDGPU::S_TTRACEDATA:
102 return true;
103 // These DS opcodes don't support GDS.
104 case AMDGPU::DS_NOP:
105 case AMDGPU::DS_PERMUTE_B32:
106 case AMDGPU::DS_BPERMUTE_B32:
107 return false;
108 default:
109 if (TII.isDS(MI.getOpcode())) {
110 int GDS = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
111 AMDGPU::OpName::gds);
112 if (MI.getOperand(GDS).getImm())
113 return true;
114 }
115 return false;
116 }
117 }
118
isPermlane(const MachineInstr & MI)119 static bool isPermlane(const MachineInstr &MI) {
120 unsigned Opcode = MI.getOpcode();
121 return Opcode == AMDGPU::V_PERMLANE16_B32 ||
122 Opcode == AMDGPU::V_PERMLANEX16_B32;
123 }
124
getHWReg(const SIInstrInfo * TII,const MachineInstr & RegInstr)125 static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
126 const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
127 AMDGPU::OpName::simm16);
128 return RegOp->getImm() & AMDGPU::Hwreg::ID_MASK_;
129 }
130
131 ScheduleHazardRecognizer::HazardType
getHazardType(SUnit * SU,int Stalls)132 GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
133 MachineInstr *MI = SU->getInstr();
134 if (MI->isBundle())
135 return NoHazard;
136
137 if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0)
138 return NoopHazard;
139
140 // FIXME: Should flat be considered vmem?
141 if ((SIInstrInfo::isVMEM(*MI) ||
142 SIInstrInfo::isFLAT(*MI))
143 && checkVMEMHazards(MI) > 0)
144 return NoopHazard;
145
146 if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0)
147 return NoopHazard;
148
149 if (checkFPAtomicToDenormModeHazard(MI) > 0)
150 return NoopHazard;
151
152 if (ST.hasNoDataDepHazard())
153 return NoHazard;
154
155 if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0)
156 return NoopHazard;
157
158 if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0)
159 return NoopHazard;
160
161 if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0)
162 return NoopHazard;
163
164 if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0)
165 return NoopHazard;
166
167 if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0)
168 return NoopHazard;
169
170 if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0)
171 return NoopHazard;
172
173 if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
174 return NoopHazard;
175
176 if (ST.hasReadM0MovRelInterpHazard() &&
177 (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode())) &&
178 checkReadM0Hazards(MI) > 0)
179 return NoopHazard;
180
181 if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI) &&
182 checkReadM0Hazards(MI) > 0)
183 return NoopHazard;
184
185 if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0)
186 return NoopHazard;
187
188 if ((MI->mayLoad() || MI->mayStore()) && checkMAILdStHazards(MI) > 0)
189 return NoopHazard;
190
191 if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0)
192 return NoopHazard;
193
194 if (checkAnyInstHazards(MI) > 0)
195 return NoopHazard;
196
197 return NoHazard;
198 }
199
insertNoopInBundle(MachineInstr * MI,const SIInstrInfo & TII)200 static void insertNoopInBundle(MachineInstr *MI, const SIInstrInfo &TII) {
201 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP))
202 .addImm(0);
203 }
204
processBundle()205 void GCNHazardRecognizer::processBundle() {
206 MachineBasicBlock::instr_iterator MI = std::next(CurrCycleInstr->getIterator());
207 MachineBasicBlock::instr_iterator E = CurrCycleInstr->getParent()->instr_end();
208 // Check bundled MachineInstr's for hazards.
209 for (; MI != E && MI->isInsideBundle(); ++MI) {
210 CurrCycleInstr = &*MI;
211 unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr);
212
213 if (IsHazardRecognizerMode)
214 fixHazards(CurrCycleInstr);
215
216 for (unsigned i = 0; i < WaitStates; ++i)
217 insertNoopInBundle(CurrCycleInstr, TII);
218
219 // It’s unnecessary to track more than MaxLookAhead instructions. Since we
220 // include the bundled MI directly after, only add a maximum of
221 // (MaxLookAhead - 1) noops to EmittedInstrs.
222 for (unsigned i = 0, e = std::min(WaitStates, MaxLookAhead - 1); i < e; ++i)
223 EmittedInstrs.push_front(nullptr);
224
225 EmittedInstrs.push_front(CurrCycleInstr);
226 EmittedInstrs.resize(MaxLookAhead);
227 }
228 CurrCycleInstr = nullptr;
229 }
230
PreEmitNoops(SUnit * SU)231 unsigned GCNHazardRecognizer::PreEmitNoops(SUnit *SU) {
232 IsHazardRecognizerMode = false;
233 return PreEmitNoopsCommon(SU->getInstr());
234 }
235
PreEmitNoops(MachineInstr * MI)236 unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
237 IsHazardRecognizerMode = true;
238 CurrCycleInstr = MI;
239 unsigned W = PreEmitNoopsCommon(MI);
240 fixHazards(MI);
241 CurrCycleInstr = nullptr;
242 return W;
243 }
244
PreEmitNoopsCommon(MachineInstr * MI)245 unsigned GCNHazardRecognizer::PreEmitNoopsCommon(MachineInstr *MI) {
246 if (MI->isBundle())
247 return 0;
248
249 int WaitStates = std::max(0, checkAnyInstHazards(MI));
250
251 if (SIInstrInfo::isSMRD(*MI))
252 return std::max(WaitStates, checkSMRDHazards(MI));
253
254 if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI))
255 WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
256
257 if (ST.hasNSAtoVMEMBug())
258 WaitStates = std::max(WaitStates, checkNSAtoVMEMHazard(MI));
259
260 WaitStates = std::max(WaitStates, checkFPAtomicToDenormModeHazard(MI));
261
262 if (ST.hasNoDataDepHazard())
263 return WaitStates;
264
265 if (SIInstrInfo::isVALU(*MI))
266 WaitStates = std::max(WaitStates, checkVALUHazards(MI));
267
268 if (SIInstrInfo::isDPP(*MI))
269 WaitStates = std::max(WaitStates, checkDPPHazards(MI));
270
271 if (isDivFMas(MI->getOpcode()))
272 WaitStates = std::max(WaitStates, checkDivFMasHazards(MI));
273
274 if (isRWLane(MI->getOpcode()))
275 WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));
276
277 if (MI->isInlineAsm())
278 return std::max(WaitStates, checkInlineAsmHazards(MI));
279
280 if (isSGetReg(MI->getOpcode()))
281 return std::max(WaitStates, checkGetRegHazards(MI));
282
283 if (isSSetReg(MI->getOpcode()))
284 return std::max(WaitStates, checkSetRegHazards(MI));
285
286 if (isRFE(MI->getOpcode()))
287 return std::max(WaitStates, checkRFEHazards(MI));
288
289 if (ST.hasReadM0MovRelInterpHazard() && (TII.isVINTRP(*MI) ||
290 isSMovRel(MI->getOpcode())))
291 return std::max(WaitStates, checkReadM0Hazards(MI));
292
293 if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI))
294 return std::max(WaitStates, checkReadM0Hazards(MI));
295
296 if (SIInstrInfo::isMAI(*MI))
297 return std::max(WaitStates, checkMAIHazards(MI));
298
299 if (MI->mayLoad() || MI->mayStore())
300 return std::max(WaitStates, checkMAILdStHazards(MI));
301
302 return WaitStates;
303 }
304
EmitNoop()305 void GCNHazardRecognizer::EmitNoop() {
306 EmittedInstrs.push_front(nullptr);
307 }
308
AdvanceCycle()309 void GCNHazardRecognizer::AdvanceCycle() {
310 // When the scheduler detects a stall, it will call AdvanceCycle() without
311 // emitting any instructions.
312 if (!CurrCycleInstr)
313 return;
314
315 // Do not track non-instructions which do not affect the wait states.
316 // If included, these instructions can lead to buffer overflow such that
317 // detectable hazards are missed.
318 if (CurrCycleInstr->isImplicitDef() || CurrCycleInstr->isDebugInstr() ||
319 CurrCycleInstr->isKill())
320 return;
321
322 if (CurrCycleInstr->isBundle()) {
323 processBundle();
324 return;
325 }
326
327 unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
328
329 // Keep track of emitted instructions
330 EmittedInstrs.push_front(CurrCycleInstr);
331
332 // Add a nullptr for each additional wait state after the first. Make sure
333 // not to add more than getMaxLookAhead() items to the list, since we
334 // truncate the list to that size right after this loop.
335 for (unsigned i = 1, e = std::min(NumWaitStates, getMaxLookAhead());
336 i < e; ++i) {
337 EmittedInstrs.push_front(nullptr);
338 }
339
340 // getMaxLookahead() is the largest number of wait states we will ever need
341 // to insert, so there is no point in keeping track of more than that many
342 // wait states.
343 EmittedInstrs.resize(getMaxLookAhead());
344
345 CurrCycleInstr = nullptr;
346 }
347
RecedeCycle()348 void GCNHazardRecognizer::RecedeCycle() {
349 llvm_unreachable("hazard recognizer does not support bottom-up scheduling.");
350 }
351
352 //===----------------------------------------------------------------------===//
353 // Helper Functions
354 //===----------------------------------------------------------------------===//
355
356 typedef function_ref<bool(MachineInstr *, int WaitStates)> IsExpiredFn;
357
358 // Returns a minimum wait states since \p I walking all predecessors.
359 // Only scans until \p IsExpired does not return true.
360 // Can only be run in a hazard recognizer mode.
getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,MachineBasicBlock * MBB,MachineBasicBlock::reverse_instr_iterator I,int WaitStates,IsExpiredFn IsExpired,DenseSet<const MachineBasicBlock * > & Visited)361 static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
362 MachineBasicBlock *MBB,
363 MachineBasicBlock::reverse_instr_iterator I,
364 int WaitStates,
365 IsExpiredFn IsExpired,
366 DenseSet<const MachineBasicBlock *> &Visited) {
367 for (auto E = MBB->instr_rend(); I != E; ++I) {
368 // Don't add WaitStates for parent BUNDLE instructions.
369 if (I->isBundle())
370 continue;
371
372 if (IsHazard(&*I))
373 return WaitStates;
374
375 if (I->isInlineAsm() || I->isImplicitDef() || I->isDebugInstr())
376 continue;
377
378 WaitStates += SIInstrInfo::getNumWaitStates(*I);
379
380 if (IsExpired(&*I, WaitStates))
381 return std::numeric_limits<int>::max();
382 }
383
384 int MinWaitStates = WaitStates;
385 bool Found = false;
386 for (MachineBasicBlock *Pred : MBB->predecessors()) {
387 if (!Visited.insert(Pred).second)
388 continue;
389
390 int W = getWaitStatesSince(IsHazard, Pred, Pred->instr_rbegin(),
391 WaitStates, IsExpired, Visited);
392
393 if (W == std::numeric_limits<int>::max())
394 continue;
395
396 MinWaitStates = Found ? std::min(MinWaitStates, W) : W;
397 if (IsExpired(nullptr, MinWaitStates))
398 return MinWaitStates;
399
400 Found = true;
401 }
402
403 if (Found)
404 return MinWaitStates;
405
406 return std::numeric_limits<int>::max();
407 }
408
getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,MachineInstr * MI,IsExpiredFn IsExpired)409 static int getWaitStatesSince(GCNHazardRecognizer::IsHazardFn IsHazard,
410 MachineInstr *MI,
411 IsExpiredFn IsExpired) {
412 DenseSet<const MachineBasicBlock *> Visited;
413 return getWaitStatesSince(IsHazard, MI->getParent(),
414 std::next(MI->getReverseIterator()),
415 0, IsExpired, Visited);
416 }
417
getWaitStatesSince(IsHazardFn IsHazard,int Limit)418 int GCNHazardRecognizer::getWaitStatesSince(IsHazardFn IsHazard, int Limit) {
419 if (IsHazardRecognizerMode) {
420 auto IsExpiredFn = [Limit] (MachineInstr *, int WaitStates) {
421 return WaitStates >= Limit;
422 };
423 return ::getWaitStatesSince(IsHazard, CurrCycleInstr, IsExpiredFn);
424 }
425
426 int WaitStates = 0;
427 for (MachineInstr *MI : EmittedInstrs) {
428 if (MI) {
429 if (IsHazard(MI))
430 return WaitStates;
431
432 if (MI->isInlineAsm())
433 continue;
434 }
435 ++WaitStates;
436
437 if (WaitStates >= Limit)
438 break;
439 }
440 return std::numeric_limits<int>::max();
441 }
442
getWaitStatesSinceDef(unsigned Reg,IsHazardFn IsHazardDef,int Limit)443 int GCNHazardRecognizer::getWaitStatesSinceDef(unsigned Reg,
444 IsHazardFn IsHazardDef,
445 int Limit) {
446 const SIRegisterInfo *TRI = ST.getRegisterInfo();
447
448 auto IsHazardFn = [IsHazardDef, TRI, Reg] (MachineInstr *MI) {
449 return IsHazardDef(MI) && MI->modifiesRegister(Reg, TRI);
450 };
451
452 return getWaitStatesSince(IsHazardFn, Limit);
453 }
454
getWaitStatesSinceSetReg(IsHazardFn IsHazard,int Limit)455 int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
456 int Limit) {
457 auto IsHazardFn = [IsHazard] (MachineInstr *MI) {
458 return isSSetReg(MI->getOpcode()) && IsHazard(MI);
459 };
460
461 return getWaitStatesSince(IsHazardFn, Limit);
462 }
463
464 //===----------------------------------------------------------------------===//
465 // No-op Hazard Detection
466 //===----------------------------------------------------------------------===//
467
addRegUnits(const SIRegisterInfo & TRI,BitVector & BV,unsigned Reg)468 static void addRegUnits(const SIRegisterInfo &TRI,
469 BitVector &BV, unsigned Reg) {
470 for (MCRegUnitIterator RUI(Reg, &TRI); RUI.isValid(); ++RUI)
471 BV.set(*RUI);
472 }
473
addRegsToSet(const SIRegisterInfo & TRI,iterator_range<MachineInstr::const_mop_iterator> Ops,BitVector & Set)474 static void addRegsToSet(const SIRegisterInfo &TRI,
475 iterator_range<MachineInstr::const_mop_iterator> Ops,
476 BitVector &Set) {
477 for (const MachineOperand &Op : Ops) {
478 if (Op.isReg())
479 addRegUnits(TRI, Set, Op.getReg());
480 }
481 }
482
addClauseInst(const MachineInstr & MI)483 void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
484 // XXX: Do we need to worry about implicit operands
485 addRegsToSet(TRI, MI.defs(), ClauseDefs);
486 addRegsToSet(TRI, MI.uses(), ClauseUses);
487 }
488
checkSoftClauseHazards(MachineInstr * MEM)489 int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
490 // SMEM soft clause are only present on VI+, and only matter if xnack is
491 // enabled.
492 if (!ST.isXNACKEnabled())
493 return 0;
494
495 bool IsSMRD = TII.isSMRD(*MEM);
496
497 resetClause();
498
499 // A soft-clause is any group of consecutive SMEM instructions. The
500 // instructions in this group may return out of order and/or may be
501 // replayed (i.e. the same instruction issued more than once).
502 //
503 // In order to handle these situations correctly we need to make sure that
504 // when a clause has more than one instruction, no instruction in the clause
505 // writes to a register that is read by another instruction in the clause
506 // (including itself). If we encounter this situaion, we need to break the
507 // clause by inserting a non SMEM instruction.
508
509 for (MachineInstr *MI : EmittedInstrs) {
510 // When we hit a non-SMEM instruction then we have passed the start of the
511 // clause and we can stop.
512 if (!MI)
513 break;
514
515 if (IsSMRD != SIInstrInfo::isSMRD(*MI))
516 break;
517
518 addClauseInst(*MI);
519 }
520
521 if (ClauseDefs.none())
522 return 0;
523
524 // We need to make sure not to put loads and stores in the same clause if they
525 // use the same address. For now, just start a new clause whenever we see a
526 // store.
527 if (MEM->mayStore())
528 return 1;
529
530 addClauseInst(*MEM);
531
532 // If the set of defs and uses intersect then we cannot add this instruction
533 // to the clause, so we have a hazard.
534 return ClauseDefs.anyCommon(ClauseUses) ? 1 : 0;
535 }
536
checkSMRDHazards(MachineInstr * SMRD)537 int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
538 int WaitStatesNeeded = 0;
539
540 WaitStatesNeeded = checkSoftClauseHazards(SMRD);
541
542 // This SMRD hazard only affects SI.
543 if (!ST.hasSMRDReadVALUDefHazard())
544 return WaitStatesNeeded;
545
546 // A read of an SGPR by SMRD instruction requires 4 wait states when the
547 // SGPR was written by a VALU instruction.
548 int SmrdSgprWaitStates = 4;
549 auto IsHazardDefFn = [this] (MachineInstr *MI) { return TII.isVALU(*MI); };
550 auto IsBufferHazardDefFn = [this] (MachineInstr *MI) { return TII.isSALU(*MI); };
551
552 bool IsBufferSMRD = TII.isBufferSMRD(*SMRD);
553
554 for (const MachineOperand &Use : SMRD->uses()) {
555 if (!Use.isReg())
556 continue;
557 int WaitStatesNeededForUse =
558 SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
559 SmrdSgprWaitStates);
560 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
561
562 // This fixes what appears to be undocumented hardware behavior in SI where
563 // s_mov writing a descriptor and s_buffer_load_dword reading the descriptor
564 // needs some number of nops in between. We don't know how many we need, but
565 // let's use 4. This wasn't discovered before probably because the only
566 // case when this happens is when we expand a 64-bit pointer into a full
567 // descriptor and use s_buffer_load_dword instead of s_load_dword, which was
568 // probably never encountered in the closed-source land.
569 if (IsBufferSMRD) {
570 int WaitStatesNeededForUse =
571 SmrdSgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
572 IsBufferHazardDefFn,
573 SmrdSgprWaitStates);
574 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
575 }
576 }
577
578 return WaitStatesNeeded;
579 }
580
checkVMEMHazards(MachineInstr * VMEM)581 int GCNHazardRecognizer::checkVMEMHazards(MachineInstr* VMEM) {
582 if (!ST.hasVMEMReadSGPRVALUDefHazard())
583 return 0;
584
585 int WaitStatesNeeded = checkSoftClauseHazards(VMEM);
586
587 // A read of an SGPR by a VMEM instruction requires 5 wait states when the
588 // SGPR was written by a VALU Instruction.
589 const int VmemSgprWaitStates = 5;
590 auto IsHazardDefFn = [this] (MachineInstr *MI) { return TII.isVALU(*MI); };
591 for (const MachineOperand &Use : VMEM->uses()) {
592 if (!Use.isReg() || TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
593 continue;
594
595 int WaitStatesNeededForUse =
596 VmemSgprWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardDefFn,
597 VmemSgprWaitStates);
598 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
599 }
600 return WaitStatesNeeded;
601 }
602
checkDPPHazards(MachineInstr * DPP)603 int GCNHazardRecognizer::checkDPPHazards(MachineInstr *DPP) {
604 const SIRegisterInfo *TRI = ST.getRegisterInfo();
605 const SIInstrInfo *TII = ST.getInstrInfo();
606
607 // Check for DPP VGPR read after VALU VGPR write and EXEC write.
608 int DppVgprWaitStates = 2;
609 int DppExecWaitStates = 5;
610 int WaitStatesNeeded = 0;
611 auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); };
612
613 for (const MachineOperand &Use : DPP->uses()) {
614 if (!Use.isReg() || !TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
615 continue;
616 int WaitStatesNeededForUse =
617 DppVgprWaitStates - getWaitStatesSinceDef(Use.getReg(),
618 [](MachineInstr *) { return true; },
619 DppVgprWaitStates);
620 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
621 }
622
623 WaitStatesNeeded = std::max(
624 WaitStatesNeeded,
625 DppExecWaitStates - getWaitStatesSinceDef(AMDGPU::EXEC, IsHazardDefFn,
626 DppExecWaitStates));
627
628 return WaitStatesNeeded;
629 }
630
checkDivFMasHazards(MachineInstr * DivFMas)631 int GCNHazardRecognizer::checkDivFMasHazards(MachineInstr *DivFMas) {
632 const SIInstrInfo *TII = ST.getInstrInfo();
633
634 // v_div_fmas requires 4 wait states after a write to vcc from a VALU
635 // instruction.
636 const int DivFMasWaitStates = 4;
637 auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); };
638 int WaitStatesNeeded = getWaitStatesSinceDef(AMDGPU::VCC, IsHazardDefFn,
639 DivFMasWaitStates);
640
641 return DivFMasWaitStates - WaitStatesNeeded;
642 }
643
checkGetRegHazards(MachineInstr * GetRegInstr)644 int GCNHazardRecognizer::checkGetRegHazards(MachineInstr *GetRegInstr) {
645 const SIInstrInfo *TII = ST.getInstrInfo();
646 unsigned GetRegHWReg = getHWReg(TII, *GetRegInstr);
647
648 const int GetRegWaitStates = 2;
649 auto IsHazardFn = [TII, GetRegHWReg] (MachineInstr *MI) {
650 return GetRegHWReg == getHWReg(TII, *MI);
651 };
652 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, GetRegWaitStates);
653
654 return GetRegWaitStates - WaitStatesNeeded;
655 }
656
checkSetRegHazards(MachineInstr * SetRegInstr)657 int GCNHazardRecognizer::checkSetRegHazards(MachineInstr *SetRegInstr) {
658 const SIInstrInfo *TII = ST.getInstrInfo();
659 unsigned HWReg = getHWReg(TII, *SetRegInstr);
660
661 const int SetRegWaitStates = ST.getSetRegWaitStates();
662 auto IsHazardFn = [TII, HWReg] (MachineInstr *MI) {
663 return HWReg == getHWReg(TII, *MI);
664 };
665 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, SetRegWaitStates);
666 return SetRegWaitStates - WaitStatesNeeded;
667 }
668
createsVALUHazard(const MachineInstr & MI)669 int GCNHazardRecognizer::createsVALUHazard(const MachineInstr &MI) {
670 if (!MI.mayStore())
671 return -1;
672
673 const SIInstrInfo *TII = ST.getInstrInfo();
674 unsigned Opcode = MI.getOpcode();
675 const MCInstrDesc &Desc = MI.getDesc();
676
677 int VDataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
678 int VDataRCID = -1;
679 if (VDataIdx != -1)
680 VDataRCID = Desc.OpInfo[VDataIdx].RegClass;
681
682 if (TII->isMUBUF(MI) || TII->isMTBUF(MI)) {
683 // There is no hazard if the instruction does not use vector regs
684 // (like wbinvl1)
685 if (VDataIdx == -1)
686 return -1;
687 // For MUBUF/MTBUF instructions this hazard only exists if the
688 // instruction is not using a register in the soffset field.
689 const MachineOperand *SOffset =
690 TII->getNamedOperand(MI, AMDGPU::OpName::soffset);
691 // If we have no soffset operand, then assume this field has been
692 // hardcoded to zero.
693 if (AMDGPU::getRegBitWidth(VDataRCID) > 64 &&
694 (!SOffset || !SOffset->isReg()))
695 return VDataIdx;
696 }
697
698 // MIMG instructions create a hazard if they don't use a 256-bit T# and
699 // the store size is greater than 8 bytes and they have more than two bits
700 // of their dmask set.
701 // All our MIMG definitions use a 256-bit T#, so we can skip checking for them.
702 if (TII->isMIMG(MI)) {
703 int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
704 assert(SRsrcIdx != -1 &&
705 AMDGPU::getRegBitWidth(Desc.OpInfo[SRsrcIdx].RegClass) == 256);
706 (void)SRsrcIdx;
707 }
708
709 if (TII->isFLAT(MI)) {
710 int DataIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdata);
711 if (AMDGPU::getRegBitWidth(Desc.OpInfo[DataIdx].RegClass) > 64)
712 return DataIdx;
713 }
714
715 return -1;
716 }
717
checkVALUHazardsHelper(const MachineOperand & Def,const MachineRegisterInfo & MRI)718 int GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
719 const MachineRegisterInfo &MRI) {
720 // Helper to check for the hazard where VMEM instructions that store more than
721 // 8 bytes can have there store data over written by the next instruction.
722 const SIRegisterInfo *TRI = ST.getRegisterInfo();
723
724 const int VALUWaitStates = 1;
725 int WaitStatesNeeded = 0;
726
727 if (!TRI->isVGPR(MRI, Def.getReg()))
728 return WaitStatesNeeded;
729 unsigned Reg = Def.getReg();
730 auto IsHazardFn = [this, Reg, TRI] (MachineInstr *MI) {
731 int DataIdx = createsVALUHazard(*MI);
732 return DataIdx >= 0 &&
733 TRI->regsOverlap(MI->getOperand(DataIdx).getReg(), Reg);
734 };
735 int WaitStatesNeededForDef =
736 VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
737 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
738
739 return WaitStatesNeeded;
740 }
741
checkVALUHazards(MachineInstr * VALU)742 int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
743 // This checks for the hazard where VMEM instructions that store more than
744 // 8 bytes can have there store data over written by the next instruction.
745 if (!ST.has12DWordStoreHazard())
746 return 0;
747
748 const MachineRegisterInfo &MRI = MF.getRegInfo();
749 int WaitStatesNeeded = 0;
750
751 for (const MachineOperand &Def : VALU->defs()) {
752 WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Def, MRI));
753 }
754
755 return WaitStatesNeeded;
756 }
757
checkInlineAsmHazards(MachineInstr * IA)758 int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
759 // This checks for hazards associated with inline asm statements.
760 // Since inline asms can contain just about anything, we use this
761 // to call/leverage other check*Hazard routines. Note that
762 // this function doesn't attempt to address all possible inline asm
763 // hazards (good luck), but is a collection of what has been
764 // problematic thus far.
765
766 // see checkVALUHazards()
767 if (!ST.has12DWordStoreHazard())
768 return 0;
769
770 const MachineRegisterInfo &MRI = MF.getRegInfo();
771 int WaitStatesNeeded = 0;
772
773 for (unsigned I = InlineAsm::MIOp_FirstOperand, E = IA->getNumOperands();
774 I != E; ++I) {
775 const MachineOperand &Op = IA->getOperand(I);
776 if (Op.isReg() && Op.isDef()) {
777 WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
778 }
779 }
780
781 return WaitStatesNeeded;
782 }
783
checkRWLaneHazards(MachineInstr * RWLane)784 int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
785 const SIInstrInfo *TII = ST.getInstrInfo();
786 const SIRegisterInfo *TRI = ST.getRegisterInfo();
787 const MachineRegisterInfo &MRI = MF.getRegInfo();
788
789 const MachineOperand *LaneSelectOp =
790 TII->getNamedOperand(*RWLane, AMDGPU::OpName::src1);
791
792 if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
793 return 0;
794
795 unsigned LaneSelectReg = LaneSelectOp->getReg();
796 auto IsHazardFn = [TII] (MachineInstr *MI) {
797 return TII->isVALU(*MI);
798 };
799
800 const int RWLaneWaitStates = 4;
801 int WaitStatesSince = getWaitStatesSinceDef(LaneSelectReg, IsHazardFn,
802 RWLaneWaitStates);
803 return RWLaneWaitStates - WaitStatesSince;
804 }
805
checkRFEHazards(MachineInstr * RFE)806 int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
807 if (!ST.hasRFEHazards())
808 return 0;
809
810 const SIInstrInfo *TII = ST.getInstrInfo();
811
812 const int RFEWaitStates = 1;
813
814 auto IsHazardFn = [TII] (MachineInstr *MI) {
815 return getHWReg(TII, *MI) == AMDGPU::Hwreg::ID_TRAPSTS;
816 };
817 int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn, RFEWaitStates);
818 return RFEWaitStates - WaitStatesNeeded;
819 }
820
checkAnyInstHazards(MachineInstr * MI)821 int GCNHazardRecognizer::checkAnyInstHazards(MachineInstr *MI) {
822 if (MI->isDebugInstr())
823 return 0;
824
825 const SIRegisterInfo *TRI = ST.getRegisterInfo();
826 if (!ST.hasSMovFedHazard())
827 return 0;
828
829 // Check for any instruction reading an SGPR after a write from
830 // s_mov_fed_b32.
831 int MovFedWaitStates = 1;
832 int WaitStatesNeeded = 0;
833
834 for (const MachineOperand &Use : MI->uses()) {
835 if (!Use.isReg() || TRI->isVGPR(MF.getRegInfo(), Use.getReg()))
836 continue;
837 auto IsHazardFn = [] (MachineInstr *MI) {
838 return MI->getOpcode() == AMDGPU::S_MOV_FED_B32;
839 };
840 int WaitStatesNeededForUse =
841 MovFedWaitStates - getWaitStatesSinceDef(Use.getReg(), IsHazardFn,
842 MovFedWaitStates);
843 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
844 }
845
846 return WaitStatesNeeded;
847 }
848
checkReadM0Hazards(MachineInstr * MI)849 int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
850 const SIInstrInfo *TII = ST.getInstrInfo();
851 const int SMovRelWaitStates = 1;
852 auto IsHazardFn = [TII] (MachineInstr *MI) {
853 return TII->isSALU(*MI);
854 };
855 return SMovRelWaitStates - getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn,
856 SMovRelWaitStates);
857 }
858
fixHazards(MachineInstr * MI)859 void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
860 fixVMEMtoScalarWriteHazards(MI);
861 fixVcmpxPermlaneHazards(MI);
862 fixSMEMtoVectorWriteHazards(MI);
863 fixVcmpxExecWARHazard(MI);
864 fixLdsBranchVmemWARHazard(MI);
865 }
866
fixVcmpxPermlaneHazards(MachineInstr * MI)867 bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
868 if (!ST.hasVcmpxPermlaneHazard() || !isPermlane(*MI))
869 return false;
870
871 const SIInstrInfo *TII = ST.getInstrInfo();
872 auto IsHazardFn = [TII] (MachineInstr *MI) {
873 return TII->isVOPC(*MI);
874 };
875
876 auto IsExpiredFn = [] (MachineInstr *MI, int) {
877 if (!MI)
878 return false;
879 unsigned Opc = MI->getOpcode();
880 return SIInstrInfo::isVALU(*MI) &&
881 Opc != AMDGPU::V_NOP_e32 &&
882 Opc != AMDGPU::V_NOP_e64 &&
883 Opc != AMDGPU::V_NOP_sdwa;
884 };
885
886 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
887 std::numeric_limits<int>::max())
888 return false;
889
890 // V_NOP will be discarded by SQ.
891 // Use V_MOB_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
892 // which is always a VGPR and available.
893 auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
894 unsigned Reg = Src0->getReg();
895 bool IsUndef = Src0->isUndef();
896 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
897 TII->get(AMDGPU::V_MOV_B32_e32))
898 .addReg(Reg, RegState::Define | (IsUndef ? RegState::Dead : 0))
899 .addReg(Reg, IsUndef ? RegState::Undef : RegState::Kill);
900
901 return true;
902 }
903
fixVMEMtoScalarWriteHazards(MachineInstr * MI)904 bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
905 if (!ST.hasVMEMtoScalarWriteHazard())
906 return false;
907
908 if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI))
909 return false;
910
911 if (MI->getNumDefs() == 0)
912 return false;
913
914 const SIRegisterInfo *TRI = ST.getRegisterInfo();
915
916 auto IsHazardFn = [TRI, MI] (MachineInstr *I) {
917 if (!SIInstrInfo::isVMEM(*I) && !SIInstrInfo::isDS(*I) &&
918 !SIInstrInfo::isFLAT(*I))
919 return false;
920
921 for (const MachineOperand &Def : MI->defs()) {
922 MachineOperand *Op = I->findRegisterUseOperand(Def.getReg(), false, TRI);
923 if (!Op)
924 continue;
925 return true;
926 }
927 return false;
928 };
929
930 auto IsExpiredFn = [] (MachineInstr *MI, int) {
931 return MI && (SIInstrInfo::isVALU(*MI) ||
932 (MI->getOpcode() == AMDGPU::S_WAITCNT &&
933 !MI->getOperand(0).getImm()));
934 };
935
936 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
937 std::numeric_limits<int>::max())
938 return false;
939
940 const SIInstrInfo *TII = ST.getInstrInfo();
941 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32));
942 return true;
943 }
944
fixSMEMtoVectorWriteHazards(MachineInstr * MI)945 bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
946 if (!ST.hasSMEMtoVectorWriteHazard())
947 return false;
948
949 if (!SIInstrInfo::isVALU(*MI))
950 return false;
951
952 unsigned SDSTName;
953 switch (MI->getOpcode()) {
954 case AMDGPU::V_READLANE_B32:
955 case AMDGPU::V_READLANE_B32_gfx10:
956 case AMDGPU::V_READFIRSTLANE_B32:
957 SDSTName = AMDGPU::OpName::vdst;
958 break;
959 default:
960 SDSTName = AMDGPU::OpName::sdst;
961 break;
962 }
963
964 const SIInstrInfo *TII = ST.getInstrInfo();
965 const SIRegisterInfo *TRI = ST.getRegisterInfo();
966 const AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST.getCPU());
967 const MachineOperand *SDST = TII->getNamedOperand(*MI, SDSTName);
968 if (!SDST) {
969 for (const auto &MO : MI->implicit_operands()) {
970 if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg()))) {
971 SDST = &MO;
972 break;
973 }
974 }
975 }
976
977 if (!SDST)
978 return false;
979
980 const unsigned SDSTReg = SDST->getReg();
981 auto IsHazardFn = [SDSTReg, TRI] (MachineInstr *I) {
982 return SIInstrInfo::isSMRD(*I) && I->readsRegister(SDSTReg, TRI);
983 };
984
985 auto IsExpiredFn = [TII, IV] (MachineInstr *MI, int) {
986 if (MI) {
987 if (TII->isSALU(*MI)) {
988 switch (MI->getOpcode()) {
989 case AMDGPU::S_SETVSKIP:
990 case AMDGPU::S_VERSION:
991 case AMDGPU::S_WAITCNT_VSCNT:
992 case AMDGPU::S_WAITCNT_VMCNT:
993 case AMDGPU::S_WAITCNT_EXPCNT:
994 // These instructions cannot not mitigate the hazard.
995 return false;
996 case AMDGPU::S_WAITCNT_LGKMCNT:
997 // Reducing lgkmcnt count to 0 always mitigates the hazard.
998 return (MI->getOperand(1).getImm() == 0) &&
999 (MI->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1000 case AMDGPU::S_WAITCNT: {
1001 const int64_t Imm = MI->getOperand(0).getImm();
1002 AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
1003 return (Decoded.LgkmCnt == 0);
1004 }
1005 default:
1006 // SOPP instructions cannot mitigate the hazard.
1007 if (TII->isSOPP(*MI))
1008 return false;
1009 // At this point the SALU can be assumed to mitigate the hazard
1010 // because either:
1011 // (a) it is independent of the at risk SMEM (breaking chain),
1012 // or
1013 // (b) it is dependent on the SMEM, in which case an appropriate
1014 // s_waitcnt lgkmcnt _must_ exist between it and the at risk
1015 // SMEM instruction.
1016 return true;
1017 }
1018 }
1019 }
1020 return false;
1021 };
1022
1023 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1024 std::numeric_limits<int>::max())
1025 return false;
1026
1027 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1028 TII->get(AMDGPU::S_MOV_B32), AMDGPU::SGPR_NULL)
1029 .addImm(0);
1030 return true;
1031 }
1032
fixVcmpxExecWARHazard(MachineInstr * MI)1033 bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
1034 if (!ST.hasVcmpxExecWARHazard() || !SIInstrInfo::isVALU(*MI))
1035 return false;
1036
1037 const SIRegisterInfo *TRI = ST.getRegisterInfo();
1038 if (!MI->modifiesRegister(AMDGPU::EXEC, TRI))
1039 return false;
1040
1041 auto IsHazardFn = [TRI] (MachineInstr *I) {
1042 if (SIInstrInfo::isVALU(*I))
1043 return false;
1044 return I->readsRegister(AMDGPU::EXEC, TRI);
1045 };
1046
1047 const SIInstrInfo *TII = ST.getInstrInfo();
1048 auto IsExpiredFn = [TII, TRI] (MachineInstr *MI, int) {
1049 if (!MI)
1050 return false;
1051 if (SIInstrInfo::isVALU(*MI)) {
1052 if (TII->getNamedOperand(*MI, AMDGPU::OpName::sdst))
1053 return true;
1054 for (auto MO : MI->implicit_operands())
1055 if (MO.isDef() && TRI->isSGPRClass(TRI->getPhysRegClass(MO.getReg())))
1056 return true;
1057 }
1058 if (MI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
1059 (MI->getOperand(0).getImm() & 0xfffe) == 0xfffe)
1060 return true;
1061 return false;
1062 };
1063
1064 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1065 std::numeric_limits<int>::max())
1066 return false;
1067
1068 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1069 TII->get(AMDGPU::S_WAITCNT_DEPCTR))
1070 .addImm(0xfffe);
1071 return true;
1072 }
1073
fixLdsBranchVmemWARHazard(MachineInstr * MI)1074 bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
1075 if (!ST.hasLdsBranchVmemWARHazard())
1076 return false;
1077
1078 auto IsHazardInst = [] (const MachineInstr *MI) {
1079 if (SIInstrInfo::isDS(*MI))
1080 return 1;
1081 if (SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isSegmentSpecificFLAT(*MI))
1082 return 2;
1083 return 0;
1084 };
1085
1086 auto InstType = IsHazardInst(MI);
1087 if (!InstType)
1088 return false;
1089
1090 auto IsExpiredFn = [&IsHazardInst] (MachineInstr *I, int) {
1091 return I && (IsHazardInst(I) ||
1092 (I->getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1093 I->getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
1094 !I->getOperand(1).getImm()));
1095 };
1096
1097 auto IsHazardFn = [InstType, &IsHazardInst] (MachineInstr *I) {
1098 if (!I->isBranch())
1099 return false;
1100
1101 auto IsHazardFn = [InstType, IsHazardInst] (MachineInstr *I) {
1102 auto InstType2 = IsHazardInst(I);
1103 return InstType2 && InstType != InstType2;
1104 };
1105
1106 auto IsExpiredFn = [InstType, &IsHazardInst] (MachineInstr *I, int) {
1107 if (!I)
1108 return false;
1109
1110 auto InstType2 = IsHazardInst(I);
1111 if (InstType == InstType2)
1112 return true;
1113
1114 return I->getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1115 I->getOperand(0).getReg() == AMDGPU::SGPR_NULL &&
1116 !I->getOperand(1).getImm();
1117 };
1118
1119 return ::getWaitStatesSince(IsHazardFn, I, IsExpiredFn) !=
1120 std::numeric_limits<int>::max();
1121 };
1122
1123 if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
1124 std::numeric_limits<int>::max())
1125 return false;
1126
1127 const SIInstrInfo *TII = ST.getInstrInfo();
1128 BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
1129 TII->get(AMDGPU::S_WAITCNT_VSCNT))
1130 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1131 .addImm(0);
1132
1133 return true;
1134 }
1135
checkNSAtoVMEMHazard(MachineInstr * MI)1136 int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
1137 int NSAtoVMEMWaitStates = 1;
1138
1139 if (!ST.hasNSAtoVMEMBug())
1140 return 0;
1141
1142 if (!SIInstrInfo::isMUBUF(*MI) && !SIInstrInfo::isMTBUF(*MI))
1143 return 0;
1144
1145 const SIInstrInfo *TII = ST.getInstrInfo();
1146 const auto *Offset = TII->getNamedOperand(*MI, AMDGPU::OpName::offset);
1147 if (!Offset || (Offset->getImm() & 6) == 0)
1148 return 0;
1149
1150 auto IsHazardFn = [TII] (MachineInstr *I) {
1151 if (!SIInstrInfo::isMIMG(*I))
1152 return false;
1153 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(I->getOpcode());
1154 return Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA &&
1155 TII->getInstSizeInBytes(*I) >= 16;
1156 };
1157
1158 return NSAtoVMEMWaitStates - getWaitStatesSince(IsHazardFn, 1);
1159 }
1160
checkFPAtomicToDenormModeHazard(MachineInstr * MI)1161 int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
1162 int FPAtomicToDenormModeWaitStates = 3;
1163
1164 if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
1165 return 0;
1166
1167 auto IsHazardFn = [] (MachineInstr *I) {
1168 if (!SIInstrInfo::isVMEM(*I) && !SIInstrInfo::isFLAT(*I))
1169 return false;
1170 return SIInstrInfo::isFPAtomic(*I);
1171 };
1172
1173 auto IsExpiredFn = [] (MachineInstr *MI, int WaitStates) {
1174 if (WaitStates >= 3 || SIInstrInfo::isVALU(*MI))
1175 return true;
1176
1177 switch (MI->getOpcode()) {
1178 case AMDGPU::S_WAITCNT:
1179 case AMDGPU::S_WAITCNT_VSCNT:
1180 case AMDGPU::S_WAITCNT_VMCNT:
1181 case AMDGPU::S_WAITCNT_EXPCNT:
1182 case AMDGPU::S_WAITCNT_LGKMCNT:
1183 case AMDGPU::S_WAITCNT_IDLE:
1184 return true;
1185 default:
1186 break;
1187 }
1188
1189 return false;
1190 };
1191
1192
1193 return FPAtomicToDenormModeWaitStates -
1194 ::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn);
1195 }
1196
checkMAIHazards(MachineInstr * MI)1197 int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
1198 assert(SIInstrInfo::isMAI(*MI));
1199
1200 int WaitStatesNeeded = 0;
1201 unsigned Opc = MI->getOpcode();
1202
1203 auto IsVALUFn = [] (MachineInstr *MI) {
1204 return SIInstrInfo::isVALU(*MI);
1205 };
1206
1207 if (Opc != AMDGPU::V_ACCVGPR_READ_B32) { // MFMA or v_accvgpr_write
1208 const int LegacyVALUWritesVGPRWaitStates = 2;
1209 const int VALUWritesExecWaitStates = 4;
1210 const int MaxWaitStates = 4;
1211
1212 int WaitStatesNeededForUse = VALUWritesExecWaitStates -
1213 getWaitStatesSinceDef(AMDGPU::EXEC, IsVALUFn, MaxWaitStates);
1214 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1215
1216 if (WaitStatesNeeded < MaxWaitStates) {
1217 for (const MachineOperand &Use : MI->explicit_uses()) {
1218 const int MaxWaitStates = 2;
1219
1220 if (!Use.isReg() || !TRI.isVGPR(MF.getRegInfo(), Use.getReg()))
1221 continue;
1222
1223 int WaitStatesNeededForUse = LegacyVALUWritesVGPRWaitStates -
1224 getWaitStatesSinceDef(Use.getReg(), IsVALUFn, MaxWaitStates);
1225 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1226
1227 if (WaitStatesNeeded == MaxWaitStates)
1228 break;
1229 }
1230 }
1231 }
1232
1233 auto IsMFMAFn = [] (MachineInstr *MI) {
1234 return SIInstrInfo::isMAI(*MI) &&
1235 MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32 &&
1236 MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32;
1237 };
1238
1239 for (const MachineOperand &Op : MI->explicit_operands()) {
1240 if (!Op.isReg() || !TRI.isAGPR(MF.getRegInfo(), Op.getReg()))
1241 continue;
1242
1243 if (Op.isDef() && Opc != AMDGPU::V_ACCVGPR_WRITE_B32)
1244 continue;
1245
1246 const int MFMAWritesAGPROverlappedSrcABWaitStates = 4;
1247 const int MFMAWritesAGPROverlappedSrcCWaitStates = 2;
1248 const int MFMA4x4WritesAGPRAccVgprReadWaitStates = 4;
1249 const int MFMA16x16WritesAGPRAccVgprReadWaitStates = 10;
1250 const int MFMA32x32WritesAGPRAccVgprReadWaitStates = 18;
1251 const int MFMA4x4WritesAGPRAccVgprWriteWaitStates = 1;
1252 const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
1253 const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
1254 const int MaxWaitStates = 18;
1255 unsigned Reg = Op.getReg();
1256 unsigned HazardDefLatency = 0;
1257
1258 auto IsOverlappedMFMAFn = [Reg, &IsMFMAFn, &HazardDefLatency, this]
1259 (MachineInstr *MI) {
1260 if (!IsMFMAFn(MI))
1261 return false;
1262 unsigned DstReg = MI->getOperand(0).getReg();
1263 if (DstReg == Reg)
1264 return false;
1265 HazardDefLatency = std::max(HazardDefLatency,
1266 TSchedModel.computeInstrLatency(MI));
1267 return TRI.regsOverlap(DstReg, Reg);
1268 };
1269
1270 int WaitStatesSinceDef = getWaitStatesSinceDef(Reg, IsOverlappedMFMAFn,
1271 MaxWaitStates);
1272 int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
1273 int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
1274 int OpNo = MI->getOperandNo(&Op);
1275 if (OpNo == SrcCIdx) {
1276 NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
1277 } else if (Opc == AMDGPU::V_ACCVGPR_READ_B32) {
1278 switch (HazardDefLatency) {
1279 case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprReadWaitStates;
1280 break;
1281 case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprReadWaitStates;
1282 break;
1283 case 16: LLVM_FALLTHROUGH;
1284 default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprReadWaitStates;
1285 break;
1286 }
1287 } else if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32) {
1288 switch (HazardDefLatency) {
1289 case 2: NeedWaitStates = MFMA4x4WritesAGPRAccVgprWriteWaitStates;
1290 break;
1291 case 8: NeedWaitStates = MFMA16x16WritesAGPRAccVgprWriteWaitStates;
1292 break;
1293 case 16: LLVM_FALLTHROUGH;
1294 default: NeedWaitStates = MFMA32x32WritesAGPRAccVgprWriteWaitStates;
1295 break;
1296 }
1297 }
1298
1299 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSinceDef;
1300 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1301
1302 if (WaitStatesNeeded == MaxWaitStates)
1303 return WaitStatesNeeded; // Early exit.
1304
1305 auto IsAccVgprWriteFn = [Reg, this] (MachineInstr *MI) {
1306 if (MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32)
1307 return false;
1308 unsigned DstReg = MI->getOperand(0).getReg();
1309 return TRI.regsOverlap(Reg, DstReg);
1310 };
1311
1312 const int AccVGPRWriteMFMAReadSrcCWaitStates = 1;
1313 const int AccVGPRWriteMFMAReadSrcABWaitStates = 3;
1314 const int AccVGPRWriteAccVgprReadWaitStates = 3;
1315 NeedWaitStates = AccVGPRWriteMFMAReadSrcABWaitStates;
1316 if (OpNo == SrcCIdx)
1317 NeedWaitStates = AccVGPRWriteMFMAReadSrcCWaitStates;
1318 else if (Opc == AMDGPU::V_ACCVGPR_READ_B32)
1319 NeedWaitStates = AccVGPRWriteAccVgprReadWaitStates;
1320
1321 WaitStatesNeededForUse = NeedWaitStates -
1322 getWaitStatesSinceDef(Reg, IsAccVgprWriteFn, MaxWaitStates);
1323 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1324
1325 if (WaitStatesNeeded == MaxWaitStates)
1326 return WaitStatesNeeded; // Early exit.
1327 }
1328
1329 if (Opc == AMDGPU::V_ACCVGPR_WRITE_B32) {
1330 const int MFMA4x4ReadSrcCAccVgprWriteWaitStates = 0;
1331 const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
1332 const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
1333 const int MaxWaitStates = 13;
1334 unsigned DstReg = MI->getOperand(0).getReg();
1335 unsigned HazardDefLatency = 0;
1336
1337 auto IsSrcCMFMAFn = [DstReg, &IsMFMAFn, &HazardDefLatency, this]
1338 (MachineInstr *MI) {
1339 if (!IsMFMAFn(MI))
1340 return false;
1341 unsigned Reg = TII.getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
1342 HazardDefLatency = std::max(HazardDefLatency,
1343 TSchedModel.computeInstrLatency(MI));
1344 return TRI.regsOverlap(Reg, DstReg);
1345 };
1346
1347 int WaitStatesSince = getWaitStatesSince(IsSrcCMFMAFn, MaxWaitStates);
1348 int NeedWaitStates;
1349 switch (HazardDefLatency) {
1350 case 2: NeedWaitStates = MFMA4x4ReadSrcCAccVgprWriteWaitStates;
1351 break;
1352 case 8: NeedWaitStates = MFMA16x16ReadSrcCAccVgprWriteWaitStates;
1353 break;
1354 case 16: LLVM_FALLTHROUGH;
1355 default: NeedWaitStates = MFMA32x32ReadSrcCAccVgprWriteWaitStates;
1356 break;
1357 }
1358
1359 int WaitStatesNeededForUse = NeedWaitStates - WaitStatesSince;
1360 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1361 }
1362
1363 return WaitStatesNeeded;
1364 }
1365
checkMAILdStHazards(MachineInstr * MI)1366 int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
1367 if (!ST.hasMAIInsts())
1368 return 0;
1369
1370 int WaitStatesNeeded = 0;
1371
1372 auto IsAccVgprReadFn = [] (MachineInstr *MI) {
1373 return MI->getOpcode() == AMDGPU::V_ACCVGPR_READ_B32;
1374 };
1375
1376 for (const MachineOperand &Op : MI->explicit_uses()) {
1377 if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
1378 continue;
1379
1380 unsigned Reg = Op.getReg();
1381
1382 const int AccVgprReadLdStWaitStates = 2;
1383 const int VALUWriteAccVgprReadLdStDepVALUWaitStates = 1;
1384 const int MaxWaitStates = 2;
1385
1386 int WaitStatesNeededForUse = AccVgprReadLdStWaitStates -
1387 getWaitStatesSinceDef(Reg, IsAccVgprReadFn, MaxWaitStates);
1388 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1389
1390 if (WaitStatesNeeded == MaxWaitStates)
1391 return WaitStatesNeeded; // Early exit.
1392
1393 auto IsVALUAccVgprReadCheckFn = [Reg, this] (MachineInstr *MI) {
1394 if (MI->getOpcode() != AMDGPU::V_ACCVGPR_READ_B32)
1395 return false;
1396 auto IsVALUFn = [] (MachineInstr *MI) {
1397 return SIInstrInfo::isVALU(*MI) && !SIInstrInfo::isMAI(*MI);
1398 };
1399 return getWaitStatesSinceDef(Reg, IsVALUFn, 2 /*MaxWaitStates*/) <
1400 std::numeric_limits<int>::max();
1401 };
1402
1403 WaitStatesNeededForUse = VALUWriteAccVgprReadLdStDepVALUWaitStates -
1404 getWaitStatesSince(IsVALUAccVgprReadCheckFn, MaxWaitStates);
1405 WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForUse);
1406 }
1407
1408 return WaitStatesNeeded;
1409 }
1410