//===- SIInsertWaitcnts.cpp - Insert Wait Instructions --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// Insert wait instructions for memory reads and writes.
///
/// Memory reads and writes are issued asynchronously, so we need to insert
/// S_WAITCNT instructions when we want to access any of their results or
/// overwrite any register that's used asynchronously.
///
/// TODO: This pass currently keeps one timeline per hardware counter. A more
/// finely-grained approach that keeps one timeline per event type could
/// sometimes get away with generating weaker s_waitcnt instructions. For
/// example, when both SMEM and LDS are in flight and we need to wait for
/// the i-th-last LDS instruction, then an lgkmcnt(i) is actually sufficient,
/// but the pass will currently generate a conservative lgkmcnt(0) because
/// multiple event types are in flight.
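///
/// An illustrative sketch (not taken from any particular shader) of the
/// conservative behaviour described above:
///
///   s_load_dwordx2 s[0:1], s[4:5], 0x0   ; SMEM  -> lgkmcnt event
///   ds_read_b32    v1, v0                ; LDS   -> lgkmcnt event
///   s_waitcnt      lgkmcnt(0)            ; both event types are in flight,
///                                        ; so the pass waits for all of them
///   v_add_u32      v2, v1, v2            ; uses the ds_read result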
//
//===----------------------------------------------------------------------===//
25
26 #include "AMDGPU.h"
27 #include "GCNSubtarget.h"
28 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
29 #include "SIMachineFunctionInfo.h"
30 #include "Utils/AMDGPUBaseInfo.h"
31 #include "llvm/ADT/MapVector.h"
32 #include "llvm/ADT/PostOrderIterator.h"
33 #include "llvm/ADT/Sequence.h"
34 #include "llvm/CodeGen/MachineLoopInfo.h"
35 #include "llvm/CodeGen/MachinePostDominators.h"
36 #include "llvm/InitializePasses.h"
37 #include "llvm/Support/DebugCounter.h"
38 #include "llvm/Support/TargetParser.h"
39 using namespace llvm;
40
41 #define DEBUG_TYPE "si-insert-waitcnts"
42
43 DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE"-forceexp",
44 "Force emit s_waitcnt expcnt(0) instrs");
45 DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE"-forcelgkm",
46 "Force emit s_waitcnt lgkmcnt(0) instrs");
47 DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE"-forcevm",
48 "Force emit s_waitcnt vmcnt(0) instrs");
49
50 static cl::opt<bool> ForceEmitZeroFlag(
51 "amdgpu-waitcnt-forcezero",
52 cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
53 cl::init(false), cl::Hidden);
54
55 namespace {
// Class of object that encapsulates the latest instruction counter score
// associated with an operand. Used for determining whether an s_waitcnt
// instruction needs to be emitted.
59
60 #define CNT_MASK(t) (1u << (t))
61
62 enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, VS_CNT, NUM_INST_CNTS };
63 } // namespace
64
65 namespace llvm {
66 template <> struct enum_iteration_traits<InstCounterType> {
67 static constexpr bool is_iterable = true;
68 };
69 } // namespace llvm
70
71 namespace {
auto inst_counter_types() { return enum_seq(VM_CNT, NUM_INST_CNTS); }
73
74 using RegInterval = std::pair<int, int>;
75
76 struct HardwareLimits {
77 unsigned VmcntMax;
78 unsigned ExpcntMax;
79 unsigned LgkmcntMax;
80 unsigned VscntMax;
81 };
82
83 struct RegisterEncoding {
84 unsigned VGPR0;
85 unsigned VGPRL;
86 unsigned SGPR0;
87 unsigned SGPRL;
88 };
89
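// The individual hardware events tracked by the wait counters; each event
// maps to exactly one InstCounterType via WaitEventMaskForInst below.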
90 enum WaitEventType {
91 VMEM_ACCESS, // vector-memory read & write
92 VMEM_READ_ACCESS, // vector-memory read
93 VMEM_WRITE_ACCESS, // vector-memory write
94 LDS_ACCESS, // lds read & write
95 GDS_ACCESS, // gds read & write
96 SQ_MESSAGE, // send message
97 SMEM_ACCESS, // scalar-memory read & write
98 EXP_GPR_LOCK, // export holding on its data src
99 GDS_GPR_LOCK, // GDS holding on its data and addr src
100 EXP_POS_ACCESS, // write to export position
101 EXP_PARAM_ACCESS, // write to export parameter
102 VMW_GPR_LOCK, // vector-memory write holding on its data src
103 EXP_LDS_ACCESS, // read by ldsdir counting as export
104 NUM_WAIT_EVENTS,
105 };
106
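// For each counter (indexed by InstCounterType: VM_CNT, LGKM_CNT, EXP_CNT,
// VS_CNT), the mask of wait events that it covers.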
107 static const unsigned WaitEventMaskForInst[NUM_INST_CNTS] = {
108 (1 << VMEM_ACCESS) | (1 << VMEM_READ_ACCESS),
109 (1 << SMEM_ACCESS) | (1 << LDS_ACCESS) | (1 << GDS_ACCESS) |
110 (1 << SQ_MESSAGE),
111 (1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) |
112 (1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS) | (1 << EXP_LDS_ACCESS),
113 (1 << VMEM_WRITE_ACCESS)};
114
// The mapping is:
//  0                .. SQ_MAX_PGM_VGPRS-1               real VGPRs
//  SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1                  extra VGPR-like slots
//  NUM_ALL_VGPRS    .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
// We reserve a fixed number of VGPR slots in the scoring tables for
// special tokens like SCMEM_LDS (needed for buffer load to LDS).
enum RegisterMapping {
  SQ_MAX_PGM_VGPRS = 512, // Maximum programmable VGPRs across all targets.
  AGPR_OFFSET = 256,      // Maximum programmable ArchVGPRs across all targets.
  SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
  NUM_EXTRA_VGPRS = 1,    // A reserved slot for DS.
  EXTRA_VGPR_LDS = 0,     // An artificial register to track LDS writes.
  NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts.
};
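// Under this mapping, for example, VGPR3 is scored at index 3, AGPR3 at
// AGPR_OFFSET + 3, the artificial LDS slot at SQ_MAX_PGM_VGPRS +
// EXTRA_VGPR_LDS, and SGPR5 at NUM_ALL_VGPRS + 5 (see getRegInterval()).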
129
130 // Enumerate different types of result-returning VMEM operations. Although
131 // s_waitcnt orders them all with a single vmcnt counter, in the absence of
132 // s_waitcnt only instructions of the same VmemType are guaranteed to write
133 // their results in order -- so there is no need to insert an s_waitcnt between
134 // two instructions of the same type that write the same vgpr.
135 enum VmemType {
136 // BUF instructions and MIMG instructions without a sampler.
137 VMEM_NOSAMPLER,
138 // MIMG instructions with a sampler.
139 VMEM_SAMPLER,
140 // BVH instructions
141 VMEM_BVH
142 };
143
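// Return true if \p Inst only updates vmcnt (and, for stores, vscnt): VMEM,
// FLAT global, or FLAT scratch accesses. Generic FLAT accesses are excluded
// because they may also touch LDS and hence lgkmcnt.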
static bool updateVMCntOnly(const MachineInstr &Inst) {
145 return SIInstrInfo::isVMEM(Inst) || SIInstrInfo::isFLATGlobal(Inst) ||
146 SIInstrInfo::isFLATScratch(Inst);
147 }
148
VmemType getVmemType(const MachineInstr &Inst) {
150 assert(updateVMCntOnly(Inst));
151 if (!SIInstrInfo::isMIMG(Inst))
152 return VMEM_NOSAMPLER;
153 const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Inst.getOpcode());
154 const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
155 AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
156 return BaseInfo->BVH ? VMEM_BVH
157 : BaseInfo->Sampler ? VMEM_SAMPLER : VMEM_NOSAMPLER;
158 }
159
void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
161 switch (T) {
162 case VM_CNT:
163 Wait.VmCnt = std::min(Wait.VmCnt, Count);
164 break;
165 case EXP_CNT:
166 Wait.ExpCnt = std::min(Wait.ExpCnt, Count);
167 break;
168 case LGKM_CNT:
169 Wait.LgkmCnt = std::min(Wait.LgkmCnt, Count);
170 break;
171 case VS_CNT:
172 Wait.VsCnt = std::min(Wait.VsCnt, Count);
173 break;
174 default:
175 llvm_unreachable("bad InstCounterType");
176 }
177 }
178
// This object maintains the current score brackets of each wait counter, and
// a per-register scoreboard for each wait counter.
//
// We also maintain the latest score for every event type that can change the
// waitcnt in order to know if there are multiple types of events within
// the brackets. When multiple types of events happen in the bracket,
// the wait count may get decremented out of order, therefore we need to put
// in an "s_waitcnt 0" before use.
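//
// For example, if three VMEM loads are outstanding, waiting on the register
// written by the first one only requires vmcnt(2); but if both an SMEM load
// and a DS read are outstanding, lgkmcnt can decrement out of order, so a
// conservative lgkmcnt(0) is emitted instead (see counterOutOfOrder()).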
187 class WaitcntBrackets {
188 public:
  WaitcntBrackets(const GCNSubtarget *SubTarget, HardwareLimits Limits,
                  RegisterEncoding Encoding)
      : ST(SubTarget), Limits(Limits), Encoding(Encoding) {}
192
  unsigned getWaitCountMax(InstCounterType T) const {
194 switch (T) {
195 case VM_CNT:
196 return Limits.VmcntMax;
197 case LGKM_CNT:
198 return Limits.LgkmcntMax;
199 case EXP_CNT:
200 return Limits.ExpcntMax;
201 case VS_CNT:
202 return Limits.VscntMax;
203 default:
204 break;
205 }
206 return 0;
207 }
208
  unsigned getScoreLB(InstCounterType T) const {
210 assert(T < NUM_INST_CNTS);
211 return ScoreLBs[T];
212 }
213
  unsigned getScoreUB(InstCounterType T) const {
215 assert(T < NUM_INST_CNTS);
216 return ScoreUBs[T];
217 }
218
  unsigned getScoreRange(InstCounterType T) const {
220 return getScoreUB(T) - getScoreLB(T);
221 }
222
223 // Mapping from event to counter.
  InstCounterType eventCounter(WaitEventType E) const {
225 for (auto T : inst_counter_types()) {
226 if (WaitEventMaskForInst[T] & (1 << E))
227 return T;
228 }
229 llvm_unreachable("event type has no associated counter");
230 }
231
  unsigned getRegScore(int GprNo, InstCounterType T) const {
233 if (GprNo < NUM_ALL_VGPRS) {
234 return VgprScores[T][GprNo];
235 }
236 assert(T == LGKM_CNT);
237 return SgprScores[GprNo - NUM_ALL_VGPRS];
238 }
239
240 bool merge(const WaitcntBrackets &Other);
241
242 RegInterval getRegInterval(const MachineInstr *MI, const SIInstrInfo *TII,
243 const MachineRegisterInfo *MRI,
244 const SIRegisterInfo *TRI, unsigned OpNo) const;
245
246 bool counterOutOfOrder(InstCounterType T) const;
247 void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
248 void simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
249 void determineWait(InstCounterType T, int RegNo, AMDGPU::Waitcnt &Wait) const;
250 void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
251 void applyWaitcnt(InstCounterType T, unsigned Count);
252 void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
253 const MachineRegisterInfo *MRI, WaitEventType E,
254 MachineInstr &MI);
255
  unsigned hasPendingEvent() const { return PendingEvents; }
  unsigned hasPendingEvent(WaitEventType E) const {
    return PendingEvents & (1 << E);
  }
  unsigned hasPendingEvent(InstCounterType T) const {
261 unsigned HasPending = PendingEvents & WaitEventMaskForInst[T];
262 assert((HasPending != 0) == (getScoreRange(T) != 0));
263 return HasPending;
264 }
265
  bool hasMixedPendingEvents(InstCounterType T) const {
267 unsigned Events = hasPendingEvent(T);
268 // Return true if more than one bit is set in Events.
269 return Events & (Events - 1);
270 }
271
  bool hasPendingFlat() const {
273 return ((LastFlat[LGKM_CNT] > ScoreLBs[LGKM_CNT] &&
274 LastFlat[LGKM_CNT] <= ScoreUBs[LGKM_CNT]) ||
275 (LastFlat[VM_CNT] > ScoreLBs[VM_CNT] &&
276 LastFlat[VM_CNT] <= ScoreUBs[VM_CNT]));
277 }
278
  void setPendingFlat() {
280 LastFlat[VM_CNT] = ScoreUBs[VM_CNT];
281 LastFlat[LGKM_CNT] = ScoreUBs[LGKM_CNT];
282 }
283
284 // Return true if there might be pending writes to the specified vgpr by VMEM
285 // instructions with types different from V.
  bool hasOtherPendingVmemTypes(int GprNo, VmemType V) const {
287 assert(GprNo < NUM_ALL_VGPRS);
288 return VgprVmemTypes[GprNo] & ~(1 << V);
289 }
290
  void clearVgprVmemTypes(int GprNo) {
292 assert(GprNo < NUM_ALL_VGPRS);
293 VgprVmemTypes[GprNo] = 0;
294 }
295
296 void print(raw_ostream &);
  void dump() { print(dbgs()); }
298
299 private:
300 struct MergeInfo {
301 unsigned OldLB;
302 unsigned OtherLB;
303 unsigned MyShift;
304 unsigned OtherShift;
305 };
306 static bool mergeScore(const MergeInfo &M, unsigned &Score,
307 unsigned OtherScore);
308
  void setScoreLB(InstCounterType T, unsigned Val) {
310 assert(T < NUM_INST_CNTS);
311 ScoreLBs[T] = Val;
312 }
313
  void setScoreUB(InstCounterType T, unsigned Val) {
315 assert(T < NUM_INST_CNTS);
316 ScoreUBs[T] = Val;
317
318 if (T != EXP_CNT)
319 return;
320
321 if (getScoreRange(EXP_CNT) > getWaitCountMax(EXP_CNT))
322 ScoreLBs[EXP_CNT] = ScoreUBs[EXP_CNT] - getWaitCountMax(EXP_CNT);
323 }
324
  void setRegScore(int GprNo, InstCounterType T, unsigned Val) {
326 if (GprNo < NUM_ALL_VGPRS) {
327 VgprUB = std::max(VgprUB, GprNo);
328 VgprScores[T][GprNo] = Val;
329 } else {
330 assert(T == LGKM_CNT);
331 SgprUB = std::max(SgprUB, GprNo - NUM_ALL_VGPRS);
332 SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
333 }
334 }
335
336 void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII,
337 const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI,
338 unsigned OpNo, unsigned Val);
339
340 const GCNSubtarget *ST = nullptr;
341 HardwareLimits Limits = {};
342 RegisterEncoding Encoding = {};
343 unsigned ScoreLBs[NUM_INST_CNTS] = {0};
344 unsigned ScoreUBs[NUM_INST_CNTS] = {0};
345 unsigned PendingEvents = 0;
346 // Remember the last flat memory operation.
347 unsigned LastFlat[NUM_INST_CNTS] = {0};
348 // wait_cnt scores for every vgpr.
349 // Keep track of the VgprUB and SgprUB to make merge at join efficient.
350 int VgprUB = -1;
351 int SgprUB = -1;
352 unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}};
353 // Wait cnt scores for every sgpr, only lgkmcnt is relevant.
354 unsigned SgprScores[SQ_MAX_PGM_SGPRS] = {0};
355 // Bitmask of the VmemTypes of VMEM instructions that might have a pending
356 // write to each vgpr.
357 unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0};
358 };
359
360 class SIInsertWaitcnts : public MachineFunctionPass {
361 private:
362 const GCNSubtarget *ST = nullptr;
363 const SIInstrInfo *TII = nullptr;
364 const SIRegisterInfo *TRI = nullptr;
365 const MachineRegisterInfo *MRI = nullptr;
366 AMDGPU::IsaVersion IV;
367
368 DenseSet<MachineInstr *> TrackedWaitcntSet;
369 DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
370 DenseMap<MachineBasicBlock *, bool> PreheadersToFlush;
371 MachineLoopInfo *MLI;
372 MachinePostDominatorTree *PDT;
373
374 struct BlockInfo {
375 MachineBasicBlock *MBB;
376 std::unique_ptr<WaitcntBrackets> Incoming;
377 bool Dirty = true;
378
    explicit BlockInfo(MachineBasicBlock *MBB) : MBB(MBB) {}
380 };
381
382 MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;
383
384 // ForceEmitZeroWaitcnts: force all waitcnts insts to be s_waitcnt 0
385 // because of amdgpu-waitcnt-forcezero flag
386 bool ForceEmitZeroWaitcnts;
387 bool ForceEmitWaitcnt[NUM_INST_CNTS];
388
389 public:
390 static char ID;
391
  SIInsertWaitcnts() : MachineFunctionPass(ID) {
393 (void)ForceExpCounter;
394 (void)ForceLgkmCounter;
395 (void)ForceVMCounter;
396 }
397
398 bool shouldFlushVmCnt(MachineLoop *ML, WaitcntBrackets &Brackets);
399 bool isPreheaderToFlush(MachineBasicBlock &MBB,
400 WaitcntBrackets &ScoreBrackets);
401 bool runOnMachineFunction(MachineFunction &MF) override;
402
  StringRef getPassName() const override {
404 return "SI insert wait instructions";
405 }
406
  void getAnalysisUsage(AnalysisUsage &AU) const override {
408 AU.setPreservesCFG();
409 AU.addRequired<MachineLoopInfo>();
410 AU.addRequired<MachinePostDominatorTree>();
411 MachineFunctionPass::getAnalysisUsage(AU);
412 }
413
  bool isForceEmitWaitcnt() const {
415 for (auto T : inst_counter_types())
416 if (ForceEmitWaitcnt[T])
417 return true;
418 return false;
419 }
420
  AMDGPU::Waitcnt allZeroWaitcnt() const {
422 return AMDGPU::Waitcnt::allZero(ST->hasVscnt());
423 }
424
  void setForceEmitWaitcnt() {
426 // For non-debug builds, ForceEmitWaitcnt has been initialized to false;
427 // For debug builds, get the debug counter info and adjust if need be
428 #ifndef NDEBUG
429 if (DebugCounter::isCounterSet(ForceExpCounter) &&
430 DebugCounter::shouldExecute(ForceExpCounter)) {
431 ForceEmitWaitcnt[EXP_CNT] = true;
432 } else {
433 ForceEmitWaitcnt[EXP_CNT] = false;
434 }
435
436 if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
437 DebugCounter::shouldExecute(ForceLgkmCounter)) {
438 ForceEmitWaitcnt[LGKM_CNT] = true;
439 } else {
440 ForceEmitWaitcnt[LGKM_CNT] = false;
441 }
442
443 if (DebugCounter::isCounterSet(ForceVMCounter) &&
444 DebugCounter::shouldExecute(ForceVMCounter)) {
445 ForceEmitWaitcnt[VM_CNT] = true;
446 } else {
447 ForceEmitWaitcnt[VM_CNT] = false;
448 }
449 #endif // NDEBUG
450 }
451
452 // Return the appropriate VMEM_*_ACCESS type for Inst, which must be a VMEM or
453 // FLAT instruction.
  WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const {
455 assert(SIInstrInfo::isVMEM(Inst) || SIInstrInfo::isFLAT(Inst));
456 if (!ST->hasVscnt())
457 return VMEM_ACCESS;
458 if (Inst.mayStore() && !SIInstrInfo::isAtomicRet(Inst))
459 return VMEM_WRITE_ACCESS;
460 return VMEM_READ_ACCESS;
461 }
462
463 bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const;
464 bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
465 bool generateWaitcntInstBefore(MachineInstr &MI,
466 WaitcntBrackets &ScoreBrackets,
467 MachineInstr *OldWaitcntInstr,
468 bool FlushVmCnt);
469 bool generateWaitcntBlockEnd(MachineBasicBlock &Block,
470 WaitcntBrackets &ScoreBrackets,
471 MachineInstr *OldWaitcntInstr);
472 bool generateWaitcnt(AMDGPU::Waitcnt Wait,
473 MachineBasicBlock::instr_iterator It,
474 MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
475 MachineInstr *OldWaitcntInstr);
476 void updateEventWaitcntAfter(MachineInstr &Inst,
477 WaitcntBrackets *ScoreBrackets);
478 bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
479 WaitcntBrackets &ScoreBrackets);
480 bool applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
481 MachineInstr &OldWaitcntInstr,
482 AMDGPU::Waitcnt &Wait,
483 MachineBasicBlock::instr_iterator It) const;
484 };
485
486 } // end anonymous namespace
487
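// Compute the half-open range [first, second) of scoreboard slots covered by
// operand \p OpNo of \p MI under the RegisterMapping scheme above. Operands
// that are not tracked (e.g. non-allocatable or TTMP registers) yield {-1, -1}.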
RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
489 const SIInstrInfo *TII,
490 const MachineRegisterInfo *MRI,
491 const SIRegisterInfo *TRI,
492 unsigned OpNo) const {
493 const MachineOperand &Op = MI->getOperand(OpNo);
494 if (!TRI->isInAllocatableClass(Op.getReg()))
495 return {-1, -1};
496
497 // A use via a PW operand does not need a waitcnt.
498 // A partial write is not a WAW.
499 assert(!Op.getSubReg() || !Op.isUndef());
500
501 RegInterval Result;
502
503 unsigned Reg = TRI->getEncodingValue(AMDGPU::getMCReg(Op.getReg(), *ST));
504
505 if (TRI->isVectorRegister(*MRI, Op.getReg())) {
506 assert(Reg >= Encoding.VGPR0 && Reg <= Encoding.VGPRL);
507 Result.first = Reg - Encoding.VGPR0;
508 if (TRI->isAGPR(*MRI, Op.getReg()))
509 Result.first += AGPR_OFFSET;
510 assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
511 } else if (TRI->isSGPRReg(*MRI, Op.getReg())) {
512 assert(Reg >= Encoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS);
513 Result.first = Reg - Encoding.SGPR0 + NUM_ALL_VGPRS;
514 assert(Result.first >= NUM_ALL_VGPRS &&
515 Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS);
516 }
517 // TODO: Handle TTMP
518 // else if (TRI->isTTMP(*MRI, Reg.getReg())) ...
519 else
520 return {-1, -1};
521
522 const TargetRegisterClass *RC = TII->getOpRegClass(*MI, OpNo);
523 unsigned Size = TRI->getRegSizeInBits(*RC);
524 Result.second = Result.first + ((Size + 16) / 32);
525
526 return Result;
527 }
528
void WaitcntBrackets::setExpScore(const MachineInstr *MI,
530 const SIInstrInfo *TII,
531 const SIRegisterInfo *TRI,
532 const MachineRegisterInfo *MRI, unsigned OpNo,
533 unsigned Val) {
534 RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo);
535 assert(TRI->isVectorRegister(*MRI, MI->getOperand(OpNo).getReg()));
536 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
537 setRegScore(RegNo, EXP_CNT, Val);
538 }
539 }
540
// MUBUF and FLAT LDS DMA operations need a wait on vmcnt before the written
// LDS can be accessed. A load from LDS to VMEM does not need a wait.
static bool mayWriteLDSThroughDMA(const MachineInstr &MI) {
544 return SIInstrInfo::isVALU(MI) &&
545 (SIInstrInfo::isMUBUF(MI) || SIInstrInfo::isFLAT(MI)) &&
546 MI.getOpcode() != AMDGPU::BUFFER_STORE_LDS_DWORD;
547 }
548
void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
550 const SIRegisterInfo *TRI,
551 const MachineRegisterInfo *MRI,
552 WaitEventType E, MachineInstr &Inst) {
553 InstCounterType T = eventCounter(E);
554 unsigned CurrScore = getScoreUB(T) + 1;
555 if (CurrScore == 0)
556 report_fatal_error("InsertWaitcnt score wraparound");
  // PendingEvents and ScoreUB need to be updated regardless of whether this
  // event changes the score of a register or not. Examples include vm_cnt for
  // a buffer store or lgkm_cnt for a send-message.
560 PendingEvents |= 1 << E;
561 setScoreUB(T, CurrScore);
562
563 if (T == EXP_CNT) {
564 // Put score on the source vgprs. If this is a store, just use those
565 // specific register(s).
566 if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
567 int AddrOpIdx =
568 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr);
569 // All GDS operations must protect their address register (same as
570 // export.)
571 if (AddrOpIdx != -1) {
572 setExpScore(&Inst, TII, TRI, MRI, AddrOpIdx, CurrScore);
573 }
574
575 if (Inst.mayStore()) {
576 if (AMDGPU::hasNamedOperand(Inst.getOpcode(), AMDGPU::OpName::data0)) {
577 setExpScore(
578 &Inst, TII, TRI, MRI,
579 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0),
580 CurrScore);
581 }
582 if (AMDGPU::hasNamedOperand(Inst.getOpcode(), AMDGPU::OpName::data1)) {
583 setExpScore(&Inst, TII, TRI, MRI,
584 AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
585 AMDGPU::OpName::data1),
586 CurrScore);
587 }
588 } else if (SIInstrInfo::isAtomicRet(Inst) &&
589 Inst.getOpcode() != AMDGPU::DS_GWS_INIT &&
590 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_V &&
591 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_BR &&
592 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_P &&
593 Inst.getOpcode() != AMDGPU::DS_GWS_BARRIER &&
594 Inst.getOpcode() != AMDGPU::DS_APPEND &&
595 Inst.getOpcode() != AMDGPU::DS_CONSUME &&
596 Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
597 for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
598 const MachineOperand &Op = Inst.getOperand(I);
599 if (Op.isReg() && !Op.isDef() &&
600 TRI->isVectorRegister(*MRI, Op.getReg())) {
601 setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
602 }
603 }
604 }
605 } else if (TII->isFLAT(Inst)) {
606 if (Inst.mayStore()) {
607 setExpScore(
608 &Inst, TII, TRI, MRI,
609 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
610 CurrScore);
611 } else if (SIInstrInfo::isAtomicRet(Inst)) {
612 setExpScore(
613 &Inst, TII, TRI, MRI,
614 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
615 CurrScore);
616 }
617 } else if (TII->isMIMG(Inst)) {
618 if (Inst.mayStore()) {
619 setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
620 } else if (SIInstrInfo::isAtomicRet(Inst)) {
621 setExpScore(
622 &Inst, TII, TRI, MRI,
623 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
624 CurrScore);
625 }
626 } else if (TII->isMTBUF(Inst)) {
627 if (Inst.mayStore()) {
628 setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
629 }
630 } else if (TII->isMUBUF(Inst)) {
631 if (Inst.mayStore()) {
632 setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
633 } else if (SIInstrInfo::isAtomicRet(Inst)) {
634 setExpScore(
635 &Inst, TII, TRI, MRI,
636 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
637 CurrScore);
638 }
639 } else if (TII->isLDSDIR(Inst)) {
640 // LDSDIR instructions attach the score to the destination.
641 setExpScore(
642 &Inst, TII, TRI, MRI,
643 AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::vdst),
644 CurrScore);
645 } else {
646 if (TII->isEXP(Inst)) {
647 // For export the destination registers are really temps that
648 // can be used as the actual source after export patching, so
649 // we need to treat them like sources and set the EXP_CNT
650 // score.
651 for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
652 MachineOperand &DefMO = Inst.getOperand(I);
653 if (DefMO.isReg() && DefMO.isDef() &&
654 TRI->isVGPR(*MRI, DefMO.getReg())) {
655 setRegScore(
656 TRI->getEncodingValue(AMDGPU::getMCReg(DefMO.getReg(), *ST)),
657 EXP_CNT, CurrScore);
658 }
659 }
660 }
661 for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
662 MachineOperand &MO = Inst.getOperand(I);
663 if (MO.isReg() && !MO.isDef() &&
664 TRI->isVectorRegister(*MRI, MO.getReg())) {
665 setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
666 }
667 }
668 }
669 #if 0 // TODO: check if this is handled by MUBUF code above.
670 } else if (Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORD ||
671 Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX2 ||
672 Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) {
673 MachineOperand *MO = TII->getNamedOperand(Inst, AMDGPU::OpName::data);
674 unsigned OpNo;//TODO: find the OpNo for this operand;
675 RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, OpNo);
676 for (int RegNo = Interval.first; RegNo < Interval.second;
677 ++RegNo) {
678 setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore);
679 }
680 #endif
681 } else {
682 // Match the score to the destination registers.
683 for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
684 auto &Op = Inst.getOperand(I);
685 if (!Op.isReg() || !Op.isDef())
686 continue;
687 RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, I);
688 if (T == VM_CNT) {
689 if (Interval.first >= NUM_ALL_VGPRS)
690 continue;
691 if (updateVMCntOnly(Inst)) {
692 VmemType V = getVmemType(Inst);
693 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo)
694 VgprVmemTypes[RegNo] |= 1 << V;
695 }
696 }
697 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
698 setRegScore(RegNo, T, CurrScore);
699 }
700 }
701 if (Inst.mayStore() && (TII->isDS(Inst) || mayWriteLDSThroughDMA(Inst))) {
702 setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
703 }
704 }
705 }
706
void WaitcntBrackets::print(raw_ostream &OS) {
708 OS << '\n';
709 for (auto T : inst_counter_types()) {
710 unsigned SR = getScoreRange(T);
711
712 switch (T) {
713 case VM_CNT:
714 OS << " VM_CNT(" << SR << "): ";
715 break;
716 case LGKM_CNT:
717 OS << " LGKM_CNT(" << SR << "): ";
718 break;
719 case EXP_CNT:
720 OS << " EXP_CNT(" << SR << "): ";
721 break;
722 case VS_CNT:
723 OS << " VS_CNT(" << SR << "): ";
724 break;
725 default:
726 OS << " UNKNOWN(" << SR << "): ";
727 break;
728 }
729
730 if (SR != 0) {
731 // Print vgpr scores.
732 unsigned LB = getScoreLB(T);
733
734 for (int J = 0; J <= VgprUB; J++) {
735 unsigned RegScore = getRegScore(J, T);
736 if (RegScore <= LB)
737 continue;
738 unsigned RelScore = RegScore - LB - 1;
739 if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) {
740 OS << RelScore << ":v" << J << " ";
741 } else {
742 OS << RelScore << ":ds ";
743 }
744 }
745 // Also need to print sgpr scores for lgkm_cnt.
746 if (T == LGKM_CNT) {
747 for (int J = 0; J <= SgprUB; J++) {
748 unsigned RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
749 if (RegScore <= LB)
750 continue;
751 unsigned RelScore = RegScore - LB - 1;
752 OS << RelScore << ":s" << J << " ";
753 }
754 }
755 }
756 OS << '\n';
757 }
758 OS << '\n';
759 }
760
761 /// Simplify the waitcnt, in the sense of removing redundant counts, and return
762 /// whether a waitcnt instruction is needed at all.
void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
764 simplifyWaitcnt(VM_CNT, Wait.VmCnt);
765 simplifyWaitcnt(EXP_CNT, Wait.ExpCnt);
766 simplifyWaitcnt(LGKM_CNT, Wait.LgkmCnt);
767 simplifyWaitcnt(VS_CNT, Wait.VsCnt);
768 }
769
void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
771 unsigned &Count) const {
772 // The number of outstanding events for this type, T, can be calculated
773 // as (UB - LB). If the current Count is greater than or equal to the number
774 // of outstanding events, then the wait for this counter is redundant.
775 if (Count >= getScoreRange(T))
776 Count = ~0u;
777 }
778
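// Compute the wait (if any) needed on counter \p T before the value in slot
// \p RegNo can be safely used, and fold it into \p Wait.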
void WaitcntBrackets::determineWait(InstCounterType T, int RegNo,
780 AMDGPU::Waitcnt &Wait) const {
781 unsigned ScoreToWait = getRegScore(RegNo, T);
782
783 // If the score of src_operand falls within the bracket, we need an
784 // s_waitcnt instruction.
785 const unsigned LB = getScoreLB(T);
786 const unsigned UB = getScoreUB(T);
787 if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
788 if ((T == VM_CNT || T == LGKM_CNT) &&
789 hasPendingFlat() &&
790 !ST->hasFlatLgkmVMemCountInOrder()) {
791 // If there is a pending FLAT operation, and this is a VMem or LGKM
792 // waitcnt and the target can report early completion, then we need
793 // to force a waitcnt 0.
794 addWait(Wait, T, 0);
795 } else if (counterOutOfOrder(T)) {
      // The counter can get decremented out-of-order when there are multiple
      // types of events in the bracket. Also emit an s_waitcnt with a
      // conservative value of 0 for the counter.
799 addWait(Wait, T, 0);
800 } else {
801 // If a counter has been maxed out avoid overflow by waiting for
802 // MAX(CounterType) - 1 instead.
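      // For example, with UB == 10 and a register score of 7, three
      // operations were issued after the producer, so an in-order counter
      // only needs to drop to UB - ScoreToWait == 3 before the value is
      // available.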
803 unsigned NeededWait = std::min(UB - ScoreToWait, getWaitCountMax(T) - 1);
804 addWait(Wait, T, NeededWait);
805 }
806 }
807 }
808
void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
810 applyWaitcnt(VM_CNT, Wait.VmCnt);
811 applyWaitcnt(EXP_CNT, Wait.ExpCnt);
812 applyWaitcnt(LGKM_CNT, Wait.LgkmCnt);
813 applyWaitcnt(VS_CNT, Wait.VsCnt);
814 }
815
void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
817 const unsigned UB = getScoreUB(T);
818 if (Count >= UB)
819 return;
820 if (Count != 0) {
821 if (counterOutOfOrder(T))
822 return;
823 setScoreLB(T, std::max(getScoreLB(T), UB - Count));
824 } else {
825 setScoreLB(T, UB);
826 PendingEvents &= ~WaitEventMaskForInst[T];
827 }
828 }
829
// Where there are multiple types of events in the bracket of a counter,
// the decrement may go out of order.
bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
  // Scalar memory reads can always go out of order.
834 if (T == LGKM_CNT && hasPendingEvent(SMEM_ACCESS))
835 return true;
836 return hasMixedPendingEvents(T);
837 }
838
839 INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
840 false)
841 INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
842 INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
843 INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
844 false)
845
846 char SIInsertWaitcnts::ID = 0;
847
848 char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID;
849
FunctionPass *llvm::createSIInsertWaitcntsPass() {
851 return new SIInsertWaitcnts();
852 }
853
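// Set the immediate operand \p OpName of \p MI to \p NewEnc, returning true
// if this changed the operand's value.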
static bool updateOperandIfDifferent(MachineInstr &MI, uint16_t OpName,
855 unsigned NewEnc) {
856 int OpIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OpName);
857 assert(OpIdx >= 0);
858
859 MachineOperand &MO = MI.getOperand(OpIdx);
860
861 if (NewEnc == MO.getImm())
862 return false;
863
864 MO.setImm(NewEnc);
865 return true;
866 }
867
/// Combine consecutive waitcnt instructions that precede \p It and follow
/// \p OldWaitcntInstr and apply any extra waits from waitcnts that were added
/// by previous passes. Currently this pass conservatively assumes that these
/// preexisting waitcnts are required for correctness.
bool SIInsertWaitcnts::applyPreexistingWaitcnt(
873 WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
874 AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
875 bool Modified = false;
876 MachineInstr *WaitcntInstr = nullptr;
877 MachineInstr *WaitcntVsCntInstr = nullptr;
878
879 for (auto &II :
880 make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
881 if (II.isMetaInstruction())
882 continue;
883
884 if (II.getOpcode() == AMDGPU::S_WAITCNT) {
885 // Conservatively update required wait if this waitcnt was added in an
886 // earlier pass. In this case it will not exist in the tracked waitcnt
887 // set.
888 if (!TrackedWaitcntSet.count(&II)) {
889 unsigned IEnc = II.getOperand(0).getImm();
890 AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
891 Wait = Wait.combined(OldWait);
892 }
893
894 // Merge consecutive waitcnt of the same type by erasing multiples.
895 if (!WaitcntInstr) {
896 WaitcntInstr = &II;
897 } else {
898 II.eraseFromParent();
899 Modified = true;
900 }
901
902 } else {
903 assert(II.getOpcode() == AMDGPU::S_WAITCNT_VSCNT);
904 assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
905 if (!TrackedWaitcntSet.count(&II)) {
906 unsigned OldVSCnt =
907 TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
908 Wait.VsCnt = std::min(Wait.VsCnt, OldVSCnt);
909 }
910
911 if (!WaitcntVsCntInstr) {
912 WaitcntVsCntInstr = &II;
913 } else {
914 II.eraseFromParent();
915 Modified = true;
916 }
917 }
918 }
919
  // Update the encoding of the merged waitcnt with the required wait.
921 if (WaitcntInstr) {
922 if (Wait.hasWaitExceptVsCnt()) {
923 Modified |=
924 updateOperandIfDifferent(*WaitcntInstr, AMDGPU::OpName::simm16,
925 AMDGPU::encodeWaitcnt(IV, Wait));
926 ScoreBrackets.applyWaitcnt(Wait);
927 Wait.VmCnt = ~0u;
928 Wait.LgkmCnt = ~0u;
929 Wait.ExpCnt = ~0u;
930
931 LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
932 ? dbgs() << "applyPreexistingWaitcnt\n"
933 << "New Instr at block end: " << *WaitcntInstr
934 << '\n'
935 : dbgs() << "applyPreexistingWaitcnt\n"
936 << "Old Instr: " << *It
937 << "New Instr: " << *WaitcntInstr << '\n');
938
939 } else {
940 WaitcntInstr->eraseFromParent();
941 Modified = true;
942 }
943 }
944
945 if (WaitcntVsCntInstr) {
946 if (Wait.hasWaitVsCnt()) {
947 assert(ST->hasVscnt());
948 Modified |= updateOperandIfDifferent(*WaitcntVsCntInstr,
949 AMDGPU::OpName::simm16, Wait.VsCnt);
950 ScoreBrackets.applyWaitcnt(Wait);
951 Wait.VsCnt = ~0u;
952
953 LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
954 ? dbgs() << "applyPreexistingWaitcnt\n"
955 << "New Instr at block end: "
956 << *WaitcntVsCntInstr << '\n'
957 : dbgs() << "applyPreexistingWaitcnt\n"
958 << "Old Instr: " << *It
959 << "New Instr: " << *WaitcntVsCntInstr << '\n');
960 } else {
961 WaitcntVsCntInstr->eraseFromParent();
962 Modified = true;
963 }
964 }
965
966 return Modified;
967 }
968
static bool readsVCCZ(const MachineInstr &MI) {
970 unsigned Opc = MI.getOpcode();
971 return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
972 !MI.getOperand(1).isUndef();
973 }
974
975 /// \returns true if the callee inserts an s_waitcnt 0 on function entry.
static bool callWaitsOnFunctionEntry(const MachineInstr &MI) {
977 // Currently all conventions wait, but this may not always be the case.
978 //
  // TODO: If IPRA is enabled, and the callee is isSafeForNoCSROpt, it may make
  // sense to omit the wait and do it in the caller.
981 return true;
982 }
983
/// \returns true if the callee is expected to wait for any outstanding memory
/// operations before returning.
static bool callWaitsOnFunctionReturn(const MachineInstr &MI) {
987 return true;
988 }
989
/// Generate the s_waitcnt instructions to be placed before \p MI.
/// Results of instructions of a given type are returned in order,
/// but instructions of different types can complete out of order.
/// We rely on this in-order completion
/// and simply assign a score to the memory access instructions.
/// We keep track of the active "score bracket" to determine
/// if an access of a memory read requires an s_waitcnt
/// and if so what the value of each counter is.
/// The "score bracket" is bound by the lower bound and upper bound
/// scores (*_score_LB and *_score_ub respectively).
/// If \p FlushVmCnt is true, we want to generate an s_waitcnt to flush the
/// vmcnt counter here.
bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
1003 WaitcntBrackets &ScoreBrackets,
1004 MachineInstr *OldWaitcntInstr,
1005 bool FlushVmCnt) {
1006 setForceEmitWaitcnt();
1007
1008 if (MI.isMetaInstruction())
1009 return false;
1010
1011 AMDGPU::Waitcnt Wait;
1012
1013 // FIXME: This should have already been handled by the memory legalizer.
1014 // Removing this currently doesn't affect any lit tests, but we need to
1015 // verify that nothing was relying on this. The number of buffer invalidates
1016 // being handled here should not be expanded.
1017 if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
1018 MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
1019 MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL ||
1020 MI.getOpcode() == AMDGPU::BUFFER_GL0_INV ||
1021 MI.getOpcode() == AMDGPU::BUFFER_GL1_INV) {
1022 Wait.VmCnt = 0;
1023 }
1024
1025 // All waits must be resolved at call return.
1026 // NOTE: this could be improved with knowledge of all call sites or
1027 // with knowledge of the called routines.
1028 if (MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
1029 MI.getOpcode() == AMDGPU::SI_RETURN ||
1030 MI.getOpcode() == AMDGPU::S_SETPC_B64_return ||
1031 (MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
1032 Wait = Wait.combined(allZeroWaitcnt());
1033 }
1034 // Resolve vm waits before gs-done.
1035 else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
1036 MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
1037 ST->hasLegacyGeometry() &&
1038 ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_PreGFX11_) ==
1039 AMDGPU::SendMsg::ID_GS_DONE_PreGFX11)) {
1040 Wait.VmCnt = 0;
1041 }
1042 #if 0 // TODO: the following blocks of logic when we have fence.
1043 else if (MI.getOpcode() == SC_FENCE) {
1044 const unsigned int group_size =
1045 context->shader_info->GetMaxThreadGroupSize();
1046 // group_size == 0 means thread group size is unknown at compile time
1047 const bool group_is_multi_wave =
1048 (group_size == 0 || group_size > target_info->GetWaveFrontSize());
1049 const bool fence_is_global = !((SCInstInternalMisc*)Inst)->IsGroupFence();
1050
1051 for (unsigned int i = 0; i < Inst->NumSrcOperands(); i++) {
1052 SCRegType src_type = Inst->GetSrcType(i);
1053 switch (src_type) {
1054 case SCMEM_LDS:
1055 if (group_is_multi_wave ||
1056 context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) {
1057 EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
1058 ScoreBrackets->getScoreUB(LGKM_CNT));
1059 // LDS may have to wait for VM_CNT after buffer load to LDS
1060 if (target_info->HasBufferLoadToLDS()) {
1061 EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
1062 ScoreBrackets->getScoreUB(VM_CNT));
1063 }
1064 }
1065 break;
1066
1067 case SCMEM_GDS:
1068 if (group_is_multi_wave || fence_is_global) {
1069 EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
1070 ScoreBrackets->getScoreUB(EXP_CNT));
1071 EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
1072 ScoreBrackets->getScoreUB(LGKM_CNT));
1073 }
1074 break;
1075
1076 case SCMEM_UAV:
1077 case SCMEM_TFBUF:
1078 case SCMEM_RING:
1079 case SCMEM_SCATTER:
1080 if (group_is_multi_wave || fence_is_global) {
1081 EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
1082 ScoreBrackets->getScoreUB(EXP_CNT));
1083 EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
1084 ScoreBrackets->getScoreUB(VM_CNT));
1085 }
1086 break;
1087
1088 case SCMEM_SCRATCH:
1089 default:
1090 break;
1091 }
1092 }
1093 }
1094 #endif
1095
1096 // Export & GDS instructions do not read the EXEC mask until after the export
1097 // is granted (which can occur well after the instruction is issued).
1098 // The shader program must flush all EXP operations on the export-count
1099 // before overwriting the EXEC mask.
1100 else {
1101 if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
1102 // Export and GDS are tracked individually, either may trigger a waitcnt
1103 // for EXEC.
1104 if (ScoreBrackets.hasPendingEvent(EXP_GPR_LOCK) ||
1105 ScoreBrackets.hasPendingEvent(EXP_PARAM_ACCESS) ||
1106 ScoreBrackets.hasPendingEvent(EXP_POS_ACCESS) ||
1107 ScoreBrackets.hasPendingEvent(GDS_GPR_LOCK)) {
1108 Wait.ExpCnt = 0;
1109 }
1110 }
1111
1112 if (MI.isCall() && callWaitsOnFunctionEntry(MI)) {
1113 // The function is going to insert a wait on everything in its prolog.
      // We still need to be careful if the call target is a load (e.g. a GOT
1115 // load). We also need to check WAW dependency with saved PC.
1116 Wait = AMDGPU::Waitcnt();
1117
1118 int CallAddrOpIdx =
1119 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
1120
1121 if (MI.getOperand(CallAddrOpIdx).isReg()) {
1122 RegInterval CallAddrOpInterval =
1123 ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, CallAddrOpIdx);
1124
1125 for (int RegNo = CallAddrOpInterval.first;
1126 RegNo < CallAddrOpInterval.second; ++RegNo)
1127 ScoreBrackets.determineWait(LGKM_CNT, RegNo, Wait);
1128
1129 int RtnAddrOpIdx =
1130 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst);
1131 if (RtnAddrOpIdx != -1) {
1132 RegInterval RtnAddrOpInterval =
1133 ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, RtnAddrOpIdx);
1134
1135 for (int RegNo = RtnAddrOpInterval.first;
1136 RegNo < RtnAddrOpInterval.second; ++RegNo)
1137 ScoreBrackets.determineWait(LGKM_CNT, RegNo, Wait);
1138 }
1139 }
1140 } else {
1141 // FIXME: Should not be relying on memoperands.
1142 // Look at the source operands of every instruction to see if
1143 // any of them results from a previous memory operation that affects
1144 // its current usage. If so, an s_waitcnt instruction needs to be
1145 // emitted.
1146 // If the source operand was defined by a load, add the s_waitcnt
1147 // instruction.
1148 //
1149 // Two cases are handled for destination operands:
1150 // 1) If the destination operand was defined by a load, add the s_waitcnt
1151 // instruction to guarantee the right WAW order.
      // 2) If a destination operand was used by a recent export/store
      //    instruction, add an s_waitcnt on exp_cnt to guarantee the WAR order.
1154 for (const MachineMemOperand *Memop : MI.memoperands()) {
1155 const Value *Ptr = Memop->getValue();
1156 if (Memop->isStore() && SLoadAddresses.count(Ptr)) {
1157 addWait(Wait, LGKM_CNT, 0);
1158 if (PDT->dominates(MI.getParent(), SLoadAddresses.find(Ptr)->second))
1159 SLoadAddresses.erase(Ptr);
1160 }
1161 unsigned AS = Memop->getAddrSpace();
1162 if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::FLAT_ADDRESS)
1163 continue;
1164 // No need to wait before load from VMEM to LDS.
1165 if (mayWriteLDSThroughDMA(MI))
1166 continue;
1167 unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
1168 // VM_CNT is only relevant to vgpr or LDS.
1169 ScoreBrackets.determineWait(VM_CNT, RegNo, Wait);
1170 if (Memop->isStore()) {
1171 ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait);
1172 }
1173 }
1174
1175 // Loop over use and def operands.
1176 for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
1177 MachineOperand &Op = MI.getOperand(I);
1178 if (!Op.isReg())
1179 continue;
1180
1181 // If the instruction does not read tied source, skip the operand.
1182 if (Op.isTied() && Op.isUse() && TII->doesNotReadTiedSource(MI))
1183 continue;
1184
1185 RegInterval Interval =
1186 ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I);
1187
1188 const bool IsVGPR = TRI->isVectorRegister(*MRI, Op.getReg());
1189 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
1190 if (IsVGPR) {
1191 // RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
1192 // previous write and this write are the same type of VMEM
1193 // instruction, in which case they're guaranteed to write their
1194 // results in order anyway.
1195 if (Op.isUse() || !updateVMCntOnly(MI) ||
1196 ScoreBrackets.hasOtherPendingVmemTypes(RegNo,
1197 getVmemType(MI))) {
1198 ScoreBrackets.determineWait(VM_CNT, RegNo, Wait);
1199 ScoreBrackets.clearVgprVmemTypes(RegNo);
1200 }
1201 if (Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) {
1202 ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait);
1203 }
1204 }
1205 ScoreBrackets.determineWait(LGKM_CNT, RegNo, Wait);
1206 }
1207 }
1208 }
1209 }
1210
1211 // The subtarget may have an implicit S_WAITCNT 0 before barriers. If it does
1212 // not, we need to ensure the subtarget is capable of backing off barrier
1213 // instructions in case there are any outstanding memory operations that may
1214 // cause an exception. Otherwise, insert an explicit S_WAITCNT 0 here.
1215 if (MI.getOpcode() == AMDGPU::S_BARRIER &&
1216 !ST->hasAutoWaitcntBeforeBarrier() && !ST->supportsBackOffBarrier()) {
1217 Wait = Wait.combined(allZeroWaitcnt());
1218 }
1219
1220 // TODO: Remove this work-around, enable the assert for Bug 457939
1221 // after fixing the scheduler. Also, the Shader Compiler code is
1222 // independent of target.
1223 if (readsVCCZ(MI) && ST->hasReadVCCZBug()) {
1224 if (ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
1225 Wait.LgkmCnt = 0;
1226 }
1227 }
1228
1229 // Verify that the wait is actually needed.
1230 ScoreBrackets.simplifyWaitcnt(Wait);
1231
1232 if (ForceEmitZeroWaitcnts)
1233 Wait = allZeroWaitcnt();
1234
1235 if (ForceEmitWaitcnt[VM_CNT])
1236 Wait.VmCnt = 0;
1237 if (ForceEmitWaitcnt[EXP_CNT])
1238 Wait.ExpCnt = 0;
1239 if (ForceEmitWaitcnt[LGKM_CNT])
1240 Wait.LgkmCnt = 0;
1241 if (ForceEmitWaitcnt[VS_CNT])
1242 Wait.VsCnt = 0;
1243
1244 if (FlushVmCnt) {
1245 if (ScoreBrackets.hasPendingEvent(VM_CNT))
1246 Wait.VmCnt = 0;
1247 }
1248
1249 return generateWaitcnt(Wait, MI.getIterator(), *MI.getParent(), ScoreBrackets,
1250 OldWaitcntInstr);
1251 }
1252
1253 // Add a waitcnt to flush the vmcnt counter at the end of the given block if
1254 // needed.
bool SIInsertWaitcnts::generateWaitcntBlockEnd(MachineBasicBlock &Block,
1256 WaitcntBrackets &ScoreBrackets,
1257 MachineInstr *OldWaitcntInstr) {
1258 AMDGPU::Waitcnt Wait;
1259
1260 if (!ScoreBrackets.hasPendingEvent(VM_CNT))
1261 return false;
1262
1263 Wait.VmCnt = 0;
1264
1265 return generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets,
1266 OldWaitcntInstr);
1267 }
1268
bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
1270 MachineBasicBlock::instr_iterator It,
1271 MachineBasicBlock &Block,
1272 WaitcntBrackets &ScoreBrackets,
1273 MachineInstr *OldWaitcntInstr) {
1274 bool Modified = false;
1275 const DebugLoc &DL = Block.findDebugLoc(It);
1276
1277 if (OldWaitcntInstr)
1278 // Try to merge the required wait with preexisting waitcnt instructions.
1279 // Also erase redundant waitcnt.
1280 Modified =
1281 applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It);
1282 else
1283 ScoreBrackets.applyWaitcnt(Wait);
1284
1285 // ExpCnt can be merged into VINTERP.
1286 if (Wait.ExpCnt != ~0u && It != Block.instr_end() &&
1287 SIInstrInfo::isVINTERP(*It)) {
1288 MachineOperand *WaitExp =
1289 TII->getNamedOperand(*It, AMDGPU::OpName::waitexp);
1290 if (Wait.ExpCnt < WaitExp->getImm()) {
1291 WaitExp->setImm(Wait.ExpCnt);
1292 Modified = true;
1293 }
1294 Wait.ExpCnt = ~0u;
1295
1296 LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
1297 << "Update Instr: " << *It);
1298 }
1299
1300 // Build new waitcnt instructions unless no wait is needed or the old waitcnt
1301 // instruction was modified to handle the required wait.
1302 if (Wait.hasWaitExceptVsCnt()) {
1303 unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
1304 auto SWaitInst =
1305 BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
1306 TrackedWaitcntSet.insert(SWaitInst);
1307 Modified = true;
1308
1309 LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
1310 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1311 dbgs() << "New Instr: " << *SWaitInst << '\n');
1312 }
1313
1314 if (Wait.hasWaitVsCnt()) {
1315 assert(ST->hasVscnt());
1316
1317 auto SWaitInst = BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
1318 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1319 .addImm(Wait.VsCnt);
1320 TrackedWaitcntSet.insert(SWaitInst);
1321 Modified = true;
1322
1323 LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
1324 if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
1325 dbgs() << "New Instr: " << *SWaitInst << '\n');
1326 }
1327 return Modified;
1328 }
1329
1330 // This is a flat memory operation. Check to see if it has memory tokens other
1331 // than LDS. Other address spaces supported by flat memory operations involve
1332 // global memory.
bool SIInsertWaitcnts::mayAccessVMEMThroughFlat(const MachineInstr &MI) const {
1334 assert(TII->isFLAT(MI));
1335
1336 // All flat instructions use the VMEM counter.
1337 assert(TII->usesVM_CNT(MI));
1338
1339 // If there are no memory operands then conservatively assume the flat
1340 // operation may access VMEM.
1341 if (MI.memoperands_empty())
1342 return true;
1343
1344 // See if any memory operand specifies an address space that involves VMEM.
  // Flat operations only support FLAT, LOCAL (LDS), or address spaces
  // involving VMEM such as GLOBAL, CONSTANT, PRIVATE (SCRATCH), etc. The REGION
1347 // (GDS) address space is not supported by flat operations. Therefore, simply
1348 // return true unless only the LDS address space is found.
1349 for (const MachineMemOperand *Memop : MI.memoperands()) {
1350 unsigned AS = Memop->getAddrSpace();
1351 assert(AS != AMDGPUAS::REGION_ADDRESS);
1352 if (AS != AMDGPUAS::LOCAL_ADDRESS)
1353 return true;
1354 }
1355
1356 return false;
1357 }
1358
1359 // This is a flat memory operation. Check to see if it has memory tokens for
1360 // either LDS or FLAT.
bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
1362 assert(TII->isFLAT(MI));
1363
  // Flat instructions such as SCRATCH and GLOBAL do not use the lgkm counter.
1365 if (!TII->usesLGKM_CNT(MI))
1366 return false;
1367
1368 // If in tgsplit mode then there can be no use of LDS.
1369 if (ST->isTgSplitEnabled())
1370 return false;
1371
1372 // If there are no memory operands then conservatively assume the flat
1373 // operation may access LDS.
1374 if (MI.memoperands_empty())
1375 return true;
1376
1377 // See if any memory operand specifies an address space that involves LDS.
1378 for (const MachineMemOperand *Memop : MI.memoperands()) {
1379 unsigned AS = Memop->getAddrSpace();
1380 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
1381 return true;
1382 }
1383
1384 return false;
1385 }
1386
void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
1388 WaitcntBrackets *ScoreBrackets) {
1389 // Now look at the instruction opcode. If it is a memory access
1390 // instruction, update the upper-bound of the appropriate counter's
1391 // bracket and the destination operand scores.
1392 // TODO: Use the (TSFlags & SIInstrFlags::LGKM_CNT) property everywhere.
1393 if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
1394 if (TII->isAlwaysGDS(Inst.getOpcode()) ||
1395 TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
1396 ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
1397 ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
1398 } else {
1399 ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
1400 }
1401 } else if (TII->isFLAT(Inst)) {
1402 assert(Inst.mayLoadOrStore());
1403
1404 int FlatASCount = 0;
1405
1406 if (mayAccessVMEMThroughFlat(Inst)) {
1407 ++FlatASCount;
1408 ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst),
1409 Inst);
1410 }
1411
1412 if (mayAccessLDSThroughFlat(Inst)) {
1413 ++FlatASCount;
1414 ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
1415 }
1416
1417 // A Flat memory operation must access at least one address space.
1418 assert(FlatASCount);
1419
    // This is a flat memory operation that accesses both VMEM and LDS, so note
    // it - it will require that both the VM and LGKM counters be flushed to
    // zero if it is pending when a VM or LGKM dependency occurs.
1423 if (FlatASCount > 1)
1424 ScoreBrackets->setPendingFlat();
1425 } else if (SIInstrInfo::isVMEM(Inst) &&
1426 !llvm::AMDGPU::getMUBUFIsBufferInv(Inst.getOpcode())) {
1427 ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst),
1428 Inst);
1429
1430 if (ST->vmemWriteNeedsExpWaitcnt() &&
1431 (Inst.mayStore() || SIInstrInfo::isAtomicRet(Inst))) {
1432 ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
1433 }
1434 } else if (TII->isSMRD(Inst)) {
1435 ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
1436 } else if (Inst.isCall()) {
1437 if (callWaitsOnFunctionReturn(Inst)) {
1438 // Act as a wait on everything
1439 ScoreBrackets->applyWaitcnt(allZeroWaitcnt());
1440 } else {
      // May need to wait for anything.
1442 ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt());
1443 }
1444 } else if (SIInstrInfo::isLDSDIR(Inst)) {
1445 ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_LDS_ACCESS, Inst);
1446 } else if (TII->isVINTERP(Inst)) {
1447 int64_t Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::waitexp)->getImm();
1448 ScoreBrackets->applyWaitcnt(EXP_CNT, Imm);
1449 } else if (SIInstrInfo::isEXP(Inst)) {
1450 unsigned Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
1451 if (Imm >= AMDGPU::Exp::ET_PARAM0 && Imm <= AMDGPU::Exp::ET_PARAM31)
1452 ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst);
1453 else if (Imm >= AMDGPU::Exp::ET_POS0 && Imm <= AMDGPU::Exp::ET_POS_LAST)
1454 ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
1455 else
1456 ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
1457 } else {
1458 switch (Inst.getOpcode()) {
1459 case AMDGPU::S_SENDMSG:
1460 case AMDGPU::S_SENDMSG_RTN_B32:
1461 case AMDGPU::S_SENDMSG_RTN_B64:
1462 case AMDGPU::S_SENDMSGHALT:
1463 ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst);
1464 break;
1465 case AMDGPU::S_MEMTIME:
1466 case AMDGPU::S_MEMREALTIME:
1467 ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
1468 break;
1469 }
1470 }
1471 }
1472
bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score,
1474 unsigned OtherScore) {
1475 unsigned MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;
1476 unsigned OtherShifted =
1477 OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift;
1478 Score = std::max(MyShifted, OtherShifted);
1479 return OtherShifted > MyShifted;
1480 }
1481
/// Merge the pending events and associated score brackets of \p Other into
/// this bracket's status.
///
/// Returns whether the merge resulted in a change that requires tighter waits
/// (i.e. the merged brackets strictly dominate the original brackets).
bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
1488 bool StrictDom = false;
1489
1490 VgprUB = std::max(VgprUB, Other.VgprUB);
1491 SgprUB = std::max(SgprUB, Other.SgprUB);
1492
1493 for (auto T : inst_counter_types()) {
1494 // Merge event flags for this counter
1495 const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T];
1496 const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T];
1497 if (OtherEvents & ~OldEvents)
1498 StrictDom = true;
1499 PendingEvents |= OtherEvents;
1500
1501 // Merge scores for this counter
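// The merged upper bound keeps the longer of the two pending windows, measured
// from this bracket's lower bound; mergeScore() then rebases every individual
// score onto that common range.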
1502 const unsigned MyPending = ScoreUBs[T] - ScoreLBs[T];
1503 const unsigned OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];
1504 const unsigned NewUB = ScoreLBs[T] + std::max(MyPending, OtherPending);
1505 if (NewUB < ScoreLBs[T])
1506 report_fatal_error("waitcnt score overflow");
1507
1508 MergeInfo M;
1509 M.OldLB = ScoreLBs[T];
1510 M.OtherLB = Other.ScoreLBs[T];
1511 M.MyShift = NewUB - ScoreUBs[T];
1512 M.OtherShift = NewUB - Other.ScoreUBs[T];
1513
1514 ScoreUBs[T] = NewUB;
1515
1516 StrictDom |= mergeScore(M, LastFlat[T], Other.LastFlat[T]);
1517
1518 for (int J = 0; J <= VgprUB; J++)
1519 StrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]);
1520
1521 if (T == LGKM_CNT) {
1522 for (int J = 0; J <= SgprUB; J++)
1523 StrictDom |= mergeScore(M, SgprScores[J], Other.SgprScores[J]);
1524 }
1525 }
1526
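// Merge the per-VGPR VMEM access-type masks; a type the register has not seen
// before counts as a strict change.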
1527 for (int J = 0; J <= VgprUB; J++) {
1528 unsigned char NewVmemTypes = VgprVmemTypes[J] | Other.VgprVmemTypes[J];
1529 StrictDom |= NewVmemTypes != VgprVmemTypes[J];
1530 VgprVmemTypes[J] = NewVmemTypes;
1531 }
1532
1533 return StrictDom;
1534 }
1535
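// Wait instructions that this pass tracks and may later combine with newly
// required waits: a plain S_WAITCNT, or an S_WAITCNT_VSCNT whose source is the
// null register.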
1536 static bool isWaitInstr(MachineInstr &Inst) {
1537 return Inst.getOpcode() == AMDGPU::S_WAITCNT ||
1538 (Inst.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
1539 Inst.getOperand(0).isReg() &&
1540 Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
1541 }
1542
1543 // Generate s_waitcnt instructions where needed.
1544 bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
1545 MachineBasicBlock &Block,
1546 WaitcntBrackets &ScoreBrackets) {
1547 bool Modified = false;
1548
1549 LLVM_DEBUG({
1550 dbgs() << "*** Block" << Block.getNumber() << " ***";
1551 ScoreBrackets.dump();
1552 });
1553
1554 // Track the correctness of vccz through this basic block. There are two
1555 // reasons why it might be incorrect; see ST->hasReadVCCZBug() and
1556 // ST->partialVCCWritesUpdateVCCZ().
1557 bool VCCZCorrect = true;
1558 if (ST->hasReadVCCZBug()) {
1559 // vccz could be incorrect at a basic block boundary if a predecessor wrote
1560 // to vcc and then issued an smem load.
1561 VCCZCorrect = false;
1562 } else if (!ST->partialVCCWritesUpdateVCCZ()) {
1563 // vccz could be incorrect at a basic block boundary if a predecessor wrote
1564 // to vcc_lo or vcc_hi.
1565 VCCZCorrect = false;
1566 }
1567
1568 // Walk over the instructions.
1569 MachineInstr *OldWaitcntInstr = nullptr;
1570
1571 for (MachineBasicBlock::instr_iterator Iter = Block.instr_begin(),
1572 E = Block.instr_end();
1573 Iter != E;) {
1574 MachineInstr &Inst = *Iter;
1575
1576 // Track pre-existing waitcnts that were added in earlier iterations or by
1577 // the memory legalizer.
1578 if (isWaitInstr(Inst)) {
1579 if (!OldWaitcntInstr)
1580 OldWaitcntInstr = &Inst;
1581 ++Iter;
1582 continue;
1583 }
1584
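// When Inst is the first terminator of a preheader we have decided to flush,
// ask for vmcnt to be brought to zero as part of the wait generated here.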
1585 bool FlushVmCnt = Block.getFirstTerminator() == Inst &&
1586 isPreheaderToFlush(Block, ScoreBrackets);
1587
1588 // Generate an s_waitcnt instruction to be placed before Inst, if needed.
1589 Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr,
1590 FlushVmCnt);
1591 OldWaitcntInstr = nullptr;
1592
1593 // Restore vccz if it's not known to be correct already.
1594 bool RestoreVCCZ = !VCCZCorrect && readsVCCZ(Inst);
1595
1596 // Don't examine operands unless we need to track vccz correctness.
1597 if (ST->hasReadVCCZBug() || !ST->partialVCCWritesUpdateVCCZ()) {
1598 if (Inst.definesRegister(AMDGPU::VCC_LO) ||
1599 Inst.definesRegister(AMDGPU::VCC_HI)) {
1600 // Up to gfx9, writes to vcc_lo and vcc_hi don't update vccz.
1601 if (!ST->partialVCCWritesUpdateVCCZ())
1602 VCCZCorrect = false;
1603 } else if (Inst.definesRegister(AMDGPU::VCC)) {
1604 // There is a hardware bug on CI/SI where an SMRD instruction may corrupt
1605 // the vccz bit, so when we detect that an instruction may read from a
1606 // corrupt vccz bit, we need to:
1607 // 1. Insert s_waitcnt lgkmcnt(0) to wait for all outstanding SMRD
1608 // operations to complete.
1609 // 2. Restore the correct value of vccz by writing the current value
1610 // of vcc back to vcc.
1611 if (ST->hasReadVCCZBug() &&
1612 ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
1613 // Writes to vcc while there's an outstanding smem read may get
1614 // clobbered as soon as any read completes.
1615 VCCZCorrect = false;
1616 } else {
1617 // Writes to vcc will fix any incorrect value in vccz.
1618 VCCZCorrect = true;
1619 }
1620 }
1621 }
1622
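// Remember where non-invariant scalar loads read from so that later vector
// stores to the same address can be made to wait on them (WAR handling).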
1623 if (TII->isSMRD(Inst)) {
1624 for (const MachineMemOperand *Memop : Inst.memoperands()) {
1625 // No need to handle invariant loads when avoiding WAR conflicts, as
1626 // there cannot be a vector store to the same memory location.
1627 if (!Memop->isInvariant()) {
1628 const Value *Ptr = Memop->getValue();
1629 SLoadAddresses.insert(std::pair(Ptr, Inst.getParent()));
1630 }
1631 }
1632 if (ST->hasReadVCCZBug()) {
1633 // This smem read could complete and clobber vccz at any time.
1634 VCCZCorrect = false;
1635 }
1636 }
1637
1638 updateEventWaitcntAfter(Inst, &ScoreBrackets);
1639
1640 #if 0 // TODO: implement resource type check controlled by options with ub = LB.
1641 // If this instruction generates a S_SETVSKIP because it is an
1642 // indexed resource, and we are on Tahiti, then it will also force
1643 // an S_WAITCNT vmcnt(0)
1644 if (RequireCheckResourceType(Inst, context)) {
1645 // Force the score to as if an S_WAITCNT vmcnt(0) is emitted.
1646 ScoreBrackets->setScoreLB(VM_CNT,
1647 ScoreBrackets->getScoreUB(VM_CNT));
1648 }
1649 #endif
1650
1651 LLVM_DEBUG({
1652 Inst.print(dbgs());
1653 ScoreBrackets.dump();
1654 });
1655
1656 // TODO: Remove this work-around after fixing the scheduler and enable the
1657 // assert above.
1658 if (RestoreVCCZ) {
1659 // Restore the vccz bit. Any time a value is written to vcc, the vcc
1660 // bit is updated, so we can restore the bit by reading the value of
1661 // vcc and then writing it back to the register.
1662 BuildMI(Block, Inst, Inst.getDebugLoc(),
1663 TII->get(ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
1664 TRI->getVCC())
1665 .addReg(TRI->getVCC());
1666 VCCZCorrect = true;
1667 Modified = true;
1668 }
1669
1670 ++Iter;
1671 }
1672
1673 if (Block.getFirstTerminator() == Block.end() &&
1674 isPreheaderToFlush(Block, ScoreBrackets))
1675 Modified |= generateWaitcntBlockEnd(Block, ScoreBrackets, OldWaitcntInstr);
1676
1677 return Modified;
1678 }
1679
1680 // Return true if the given machine basic block is a preheader of a loop in
1681 // which we want to flush the vmcnt counter, and false otherwise.
1682 bool SIInsertWaitcnts::isPreheaderToFlush(MachineBasicBlock &MBB,
1683 WaitcntBrackets &ScoreBrackets) {
1684 if (PreheadersToFlush.count(&MBB))
1685 return PreheadersToFlush[&MBB];
1686
1687 auto UpdateCache = [&](bool Val) {
1688 PreheadersToFlush[&MBB] = Val;
1689 return Val;
1690 };
1691
1692 MachineBasicBlock *Succ = MBB.getSingleSuccessor();
1693 if (!Succ)
1694 return UpdateCache(false);
1695
1696 MachineLoop *Loop = MLI->getLoopFor(Succ);
1697 if (!Loop)
1698 return UpdateCache(false);
1699
1700 if (Loop->getLoopPreheader() == &MBB && shouldFlushVmCnt(Loop, ScoreBrackets))
1701 return UpdateCache(true);
1702
1703 return UpdateCache(false);
1704 }
1705
1706 // Return true if it is better to flush the vmcnt counter in the preheader of
1707 // the given loop. We currently decide to flush in two situations:
1708 // 1. The loop contains vmem store(s), no vmem load and at least one use of a
1709 // vgpr containing a value that is loaded outside of the loop. (Only on
1710 // targets with no vscnt counter).
1711 // 2. The loop contains vmem load(s), but the loaded values are not used in the
1712 // loop, and at least one use of a vgpr containing a value that is loaded
1713 // outside of the loop.
1714 bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
1715 WaitcntBrackets &Brackets) {
1716 bool HasVMemLoad = false;
1717 bool HasVMemStore = false;
1718 bool UsesVgprLoadedOutside = false;
1719 DenseSet<Register> VgprUse;
1720 DenseSet<Register> VgprDef;
1721
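// Scan every instruction in the loop, classifying VMEM loads and stores and
// recording, per VGPR, whether it is read before any VMEM load inside the loop
// defines it (i.e. whether its value comes from outside the loop).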
1722 for (MachineBasicBlock *MBB : ML->blocks()) {
1723 for (MachineInstr &MI : *MBB) {
1724 if (SIInstrInfo::isVMEM(MI)) {
1725 if (MI.mayLoad())
1726 HasVMemLoad = true;
1727 if (MI.mayStore())
1728 HasVMemStore = true;
1729 }
1730 for (unsigned I = 0; I < MI.getNumOperands(); I++) {
1731 MachineOperand &Op = MI.getOperand(I);
1732 if (!Op.isReg() || !TRI->isVectorRegister(*MRI, Op.getReg()))
1733 continue;
1734 RegInterval Interval = Brackets.getRegInterval(&MI, TII, MRI, TRI, I);
1735 // Vgpr use
1736 if (Op.isUse()) {
1737 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
1738 // If we find a register that is loaded inside the loop, 1. and 2.
1739 // are invalidated and we can exit.
1740 if (VgprDef.contains(RegNo))
1741 return false;
1742 VgprUse.insert(RegNo);
1743 // If at least one of Op's registers is in the score brackets, the
1744 // value is likely loaded outside of the loop.
1745 if (Brackets.getRegScore(RegNo, VM_CNT) > Brackets.getScoreLB(VM_CNT)) {
1746 UsesVgprLoadedOutside = true;
1747 break;
1748 }
1749 }
1750 }
1751 // VMem load vgpr def
1752 else if (SIInstrInfo::isVMEM(MI) && MI.mayLoad() && Op.isDef())
1753 for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
1754 // If we find a register that is loaded inside the loop, 1. and 2.
1755 // are invalidated and we can exit.
1756 if (VgprUse.contains(RegNo))
1757 return false;
1758 VgprDef.insert(RegNo);
1759 }
1760 }
1761 }
1762 }
1763 if (!ST->hasVscnt() && HasVMemStore && !HasVMemLoad && UsesVgprLoadedOutside)
1764 return true;
1765 return HasVMemLoad && UsesVgprLoadedOutside;
1766 }
1767
1768 bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
1769 ST = &MF.getSubtarget<GCNSubtarget>();
1770 TII = ST->getInstrInfo();
1771 TRI = &TII->getRegisterInfo();
1772 MRI = &MF.getRegInfo();
1773 IV = AMDGPU::getIsaVersion(ST->getCPU());
1774 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
1775 MLI = &getAnalysis<MachineLoopInfo>();
1776 PDT = &getAnalysis<MachinePostDominatorTree>();
1777
1778 ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
1779 for (auto T : inst_counter_types())
1780 ForceEmitWaitcnt[T] = false;
1781
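// Look up the maximum encodable value of each counter for this subtarget.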
1782 HardwareLimits Limits = {};
1783 Limits.VmcntMax = AMDGPU::getVmcntBitMask(IV);
1784 Limits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
1785 Limits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV);
1786 Limits.VscntMax = ST->hasVscnt() ? 63 : 0;
1787
1788 unsigned NumVGPRsMax = ST->getAddressableNumVGPRs();
1789 unsigned NumSGPRsMax = ST->getAddressableNumSGPRs();
1790 assert(NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
1791 assert(NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
1792
1793 RegisterEncoding Encoding = {};
1794 Encoding.VGPR0 = TRI->getEncodingValue(AMDGPU::VGPR0);
1795 Encoding.VGPRL = Encoding.VGPR0 + NumVGPRsMax - 1;
1796 Encoding.SGPR0 = TRI->getEncodingValue(AMDGPU::SGPR0);
1797 Encoding.SGPRL = Encoding.SGPR0 + NumSGPRsMax - 1;
1798
1799 TrackedWaitcntSet.clear();
1800 BlockInfos.clear();
1801 bool Modified = false;
1802
1803 if (!MFI->isEntryFunction()) {
1804 // Wait for any outstanding memory operations that the input registers may
1805 // depend on. We can't track them and it's better to do the wait after the
1806 // costly call sequence.
1807
1808 // TODO: Could insert earlier and schedule more liberally with operations
1809 // that only use caller preserved registers.
1810 MachineBasicBlock &EntryBB = MF.front();
1811 MachineBasicBlock::iterator I = EntryBB.begin();
1812 for (MachineBasicBlock::iterator E = EntryBB.end();
1813 I != E && (I->isPHI() || I->isMetaInstruction()); ++I)
1814 ;
1815 BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);
1816 if (ST->hasVscnt())
1817 BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT_VSCNT))
1818 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
1819 .addImm(0);
1820
1821 Modified = true;
1822 }
1823
1824 // Keep iterating over the blocks in reverse post order, inserting and
1825 // updating s_waitcnt where needed, until a fixed point is reached.
1826 for (auto *MBB : ReversePostOrderTraversal<MachineFunction *>(&MF))
1827 BlockInfos.insert({MBB, BlockInfo(MBB)});
1828
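// A single scratch WaitcntBrackets object is reused across block visits; it is
// seeded from each block's incoming state (or reset) and may be handed off to a
// successor once the block has been processed.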
1829 std::unique_ptr<WaitcntBrackets> Brackets;
1830 bool Repeat;
1831 do {
1832 Repeat = false;
1833
1834 for (auto BII = BlockInfos.begin(), BIE = BlockInfos.end(); BII != BIE;
1835 ++BII) {
1836 BlockInfo &BI = BII->second;
1837 if (!BI.Dirty)
1838 continue;
1839
1840 if (BI.Incoming) {
1841 if (!Brackets)
1842 Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming);
1843 else
1844 *Brackets = *BI.Incoming;
1845 } else {
1846 if (!Brackets)
1847 Brackets = std::make_unique<WaitcntBrackets>(ST, Limits, Encoding);
1848 else
1849 *Brackets = WaitcntBrackets(ST, Limits, Encoding);
1850 }
1851
1852 Modified |= insertWaitcntInBlock(MF, *BI.MBB, *Brackets);
1853 BI.Dirty = false;
1854
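// Propagate the outgoing state to successors: move the scratch brackets into
// the first successor that has no incoming state yet and copy them into any
// others; a changed successor at or before the current position in the visit
// order forces another round.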
1855 if (Brackets->hasPendingEvent()) {
1856 BlockInfo *MoveBracketsToSucc = nullptr;
1857 for (MachineBasicBlock *Succ : BI.MBB->successors()) {
1858 auto SuccBII = BlockInfos.find(Succ);
1859 BlockInfo &SuccBI = SuccBII->second;
1860 if (!SuccBI.Incoming) {
1861 SuccBI.Dirty = true;
1862 if (SuccBII <= BII)
1863 Repeat = true;
1864 if (!MoveBracketsToSucc) {
1865 MoveBracketsToSucc = &SuccBI;
1866 } else {
1867 SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets);
1868 }
1869 } else if (SuccBI.Incoming->merge(*Brackets)) {
1870 SuccBI.Dirty = true;
1871 if (SuccBII <= BII)
1872 Repeat = true;
1873 }
1874 }
1875 if (MoveBracketsToSucc)
1876 MoveBracketsToSucc->Incoming = std::move(Brackets);
1877 }
1878 }
1879 } while (Repeat);
1880
1881 if (ST->hasScalarStores()) {
1882 SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
1883 bool HaveScalarStores = false;
1884
1885 for (MachineBasicBlock &MBB : MF) {
1886 for (MachineInstr &MI : MBB) {
1887 if (!HaveScalarStores && TII->isScalarStore(MI))
1888 HaveScalarStores = true;
1889
1890 if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
1891 MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
1892 EndPgmBlocks.push_back(&MBB);
1893 }
1894 }
1895
1896 if (HaveScalarStores) {
1897 // If scalar writes are used, the cache must be flushed or else the next
1898 // wave to reuse the same scratch memory can be clobbered.
1899 //
1900 // Insert s_dcache_wb at wave termination points if there were any scalar
1901 // stores, and only if the cache hasn't already been flushed. This could
1902 // be improved by looking across blocks for flushes in postdominating
1903 // blocks from the stores but an explicitly requested flush is probably
1904 // very rare.
1905 for (MachineBasicBlock *MBB : EndPgmBlocks) {
1906 bool SeenDCacheWB = false;
1907
1908 for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
1909 I != E; ++I) {
1910 if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
1911 SeenDCacheWB = true;
1912 else if (TII->isScalarStore(*I))
1913 SeenDCacheWB = false;
1914
1915 // FIXME: It would be better to insert this before a waitcnt if any.
1916 if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
1917 I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
1918 !SeenDCacheWB) {
1919 Modified = true;
1920 BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
1921 }
1922 }
1923 }
1924 }
1925 }
1926
1927 return Modified;
1928 }
1929