1 //===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode -----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// This pass adds instructions to enable whole quad mode (strict or non-strict)
11 /// for pixel shaders, and strict whole wavefront mode for all programs.
12 ///
13 /// The "strict" prefix indicates that inactive lanes do not take part in
14 /// control flow: an inactive lane enabled by strict WQM/WWM will always be
15 /// enabled irrespective of control flow decisions. Conversely, in non-strict
16 /// WQM inactive lanes may be disabled by control flow decisions.
17 ///
18 /// Whole quad mode is required for derivative computations, but it interferes
19 /// with shader side effects (stores and atomics). This pass ensures that WQM
20 /// is enabled when necessary, but disabled around stores and atomics.
21 ///
22 /// When necessary, this pass creates a function prolog
23 ///
24 /// S_MOV_B64 LiveMask, EXEC
25 /// S_WQM_B64 EXEC, EXEC
26 ///
27 /// to enter WQM at the top of the function and surrounds blocks of Exact
28 /// instructions by
29 ///
30 /// S_AND_SAVEEXEC_B64 Tmp, LiveMask
31 /// ...
32 /// S_MOV_B64 EXEC, Tmp
33 ///
34 /// We also compute when a sequence of instructions requires strict whole
35 /// wavefront mode (StrictWWM) and insert instructions to save and restore it:
36 ///
37 /// S_OR_SAVEEXEC_B64 Tmp, -1
38 /// ...
39 /// S_MOV_B64 EXEC, Tmp
40 ///
41 /// When a sequence of instructions requires strict whole quad mode (StrictWQM)
42 /// we use a similar save and restore mechanism and force whole quad mode for
43 /// those instructions:
44 ///
45 /// S_MOV_B64 Tmp, EXEC
46 /// S_WQM_B64 EXEC, EXEC
47 /// ...
48 /// S_MOV_B64 EXEC, Tmp
49 ///
50 /// In order to avoid excessive switching during sequences of Exact
51 /// instructions, the pass first analyzes which instructions must be run in WQM
52 /// (i.e. which instructions produce values that lead to derivative
53 /// computations).
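///
/// For illustration (not code produced by this pass): in a pixel shader
///
///   %coord = V_INTERP_...         ; needs WQM so helper lanes hold the
///                                 ; neighbouring values used for derivatives
///   %v = IMAGE_SAMPLE %coord ...  ; seeds the WQM requirement for its inputs
///   BUFFER_STORE %v ...           ; needs Exact so helper lanes do not store
///
/// the analysis marks the interpolation (and anything else feeding the
/// sample) as WQM and the store as Exact, and EXEC transitions are inserted
/// between them.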
54 ///
55 /// Basic blocks are always exited in WQM as long as some successor needs WQM.
56 ///
57 /// There is room for improvement given better control flow analysis:
58 ///
59 /// (1) at the top level (outside of control flow statements, and as long as
60 /// kill hasn't been used), one SGPR can be saved by recovering WQM from
61 /// the LiveMask (this is implemented for the entry block).
62 ///
63 /// (2) when entire regions (e.g. if-else blocks or entire loops) only
64 /// consist of exact and don't-care instructions, the switch only has to
65 /// be done at the entry and exit points rather than potentially in each
66 /// block of the region.
67 ///
68 //===----------------------------------------------------------------------===//
69
70 #include "AMDGPU.h"
71 #include "GCNSubtarget.h"
72 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
73 #include "llvm/ADT/MapVector.h"
74 #include "llvm/ADT/PostOrderIterator.h"
75 #include "llvm/CodeGen/LiveIntervals.h"
76 #include "llvm/CodeGen/MachineBasicBlock.h"
77 #include "llvm/CodeGen/MachineDominators.h"
78 #include "llvm/CodeGen/MachineFunctionPass.h"
79 #include "llvm/CodeGen/MachineInstr.h"
80 #include "llvm/CodeGen/MachinePostDominators.h"
81 #include "llvm/IR/CallingConv.h"
82 #include "llvm/InitializePasses.h"
83 #include "llvm/Support/raw_ostream.h"
84
85 using namespace llvm;
86
87 #define DEBUG_TYPE "si-wqm"
88
89 namespace {
90
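// Execution-state flags used throughout the analysis. They are bitmask
// values and are freely combined, e.g. (StateWQM | StateExact) means that
// either state is acceptable at a given point.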
91 enum {
92 StateWQM = 0x1,
93 StateStrictWWM = 0x2,
94 StateStrictWQM = 0x4,
95 StateExact = 0x8,
96 StateStrict = StateStrictWWM | StateStrictWQM,
97 };
98
99 struct PrintState {
100 public:
101 int State;
102
103 explicit PrintState(int State) : State(State) {}
104 };
105
106 #ifndef NDEBUG
107 static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {
108
109 static const std::pair<char, const char *> Mapping[] = {
110 std::pair(StateWQM, "WQM"), std::pair(StateStrictWWM, "StrictWWM"),
111 std::pair(StateStrictWQM, "StrictWQM"), std::pair(StateExact, "Exact")};
112 char State = PS.State;
113 for (auto M : Mapping) {
114 if (State & M.first) {
115 OS << M.second;
116 State &= ~M.first;
117
118 if (State)
119 OS << '|';
120 }
121 }
122 assert(State == 0);
123 return OS;
124 }
125 #endif
126
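// Per-instruction analysis results. Needs holds the states this instruction
// must execute in (never Exact; Exact requirements are expressed via
// Disabled and at block level), Disabled holds states that must not be used
// for it, and OutNeeds holds states required after it executes. BlockInfo
// carries the same information per basic block, plus the entry state chosen
// by processBlock() and whether the block needs the later lowering phase.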
127 struct InstrInfo {
128 char Needs = 0;
129 char Disabled = 0;
130 char OutNeeds = 0;
131 };
132
133 struct BlockInfo {
134 char Needs = 0;
135 char InNeeds = 0;
136 char OutNeeds = 0;
137 char InitialState = 0;
138 bool NeedsLowering = false;
139 };
140
141 struct WorkItem {
142 MachineBasicBlock *MBB = nullptr;
143 MachineInstr *MI = nullptr;
144
145 WorkItem() = default;
146 WorkItem(MachineBasicBlock *MBB) : MBB(MBB) {}
147 WorkItem(MachineInstr *MI) : MI(MI) {}
148 };
149
150 class SIWholeQuadMode : public MachineFunctionPass {
151 private:
152 const SIInstrInfo *TII;
153 const SIRegisterInfo *TRI;
154 const GCNSubtarget *ST;
155 MachineRegisterInfo *MRI;
156 LiveIntervals *LIS;
157 MachineDominatorTree *MDT;
158 MachinePostDominatorTree *PDT;
159
160 unsigned AndOpc;
161 unsigned AndN2Opc;
162 unsigned XorOpc;
163 unsigned AndSaveExecOpc;
164 unsigned OrSaveExecOpc;
165 unsigned WQMOpc;
166 Register Exec;
167 Register LiveMaskReg;
168
169 DenseMap<const MachineInstr *, InstrInfo> Instructions;
170 MapVector<MachineBasicBlock *, BlockInfo> Blocks;
171
172 // Tracks state (WQM/StrictWWM/StrictWQM/Exact) after a given instruction
173 DenseMap<const MachineInstr *, char> StateTransition;
174
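// Instructions collected by scanInstructions() for later lowering:
// live-mask queries (SI_PS_LIVE / SI_LIVE_MASK), strict-mode wrappers
// (STRICT_WWM / STRICT_WQM) lowered to moves, WQM / SOFT_WQM and undef
// V_SET_INACTIVE lowered to copies, and kill / demote terminators.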
175 SmallVector<MachineInstr *, 2> LiveMaskQueries;
176 SmallVector<MachineInstr *, 4> LowerToMovInstrs;
177 SmallVector<MachineInstr *, 4> LowerToCopyInstrs;
178 SmallVector<MachineInstr *, 4> KillInstrs;
179
180 void printInfo();
181
182 void markInstruction(MachineInstr &MI, char Flag,
183 std::vector<WorkItem> &Worklist);
184 void markDefs(const MachineInstr &UseMI, LiveRange &LR, Register Reg,
185 unsigned SubReg, char Flag, std::vector<WorkItem> &Worklist);
186 void markOperand(const MachineInstr &MI, const MachineOperand &Op, char Flag,
187 std::vector<WorkItem> &Worklist);
188 void markInstructionUses(const MachineInstr &MI, char Flag,
189 std::vector<WorkItem> &Worklist);
190 char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
191 void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
192 void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
193 char analyzeFunction(MachineFunction &MF);
194
195 MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB,
196 MachineBasicBlock::iterator Before);
197 MachineBasicBlock::iterator
198 prepareInsertion(MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
199 MachineBasicBlock::iterator Last, bool PreferLast,
200 bool SaveSCC);
201 void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
202 Register SaveWQM);
203 void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
204 Register SavedWQM);
205 void toStrictMode(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
206 Register SaveOrig, char StrictStateNeeded);
207 void fromStrictMode(MachineBasicBlock &MBB,
208 MachineBasicBlock::iterator Before, Register SavedOrig,
209 char NonStrictState, char CurrentStrictState);
210
211 MachineBasicBlock *splitBlock(MachineBasicBlock *BB, MachineInstr *TermMI);
212
213 MachineInstr *lowerKillI1(MachineBasicBlock &MBB, MachineInstr &MI,
214 bool IsWQM);
215 MachineInstr *lowerKillF32(MachineBasicBlock &MBB, MachineInstr &MI);
216 void lowerPseudoStrictMode(MachineBasicBlock &MBB, MachineInstr *Entry,
217 MachineInstr *Exit);
218
219 void lowerBlock(MachineBasicBlock &MBB);
220 void processBlock(MachineBasicBlock &MBB, bool IsEntry);
221
222 void lowerLiveMaskQueries();
223 void lowerCopyInstrs();
224 void lowerKillInstrs(bool IsWQM);
225
226 public:
227 static char ID;
228
229 SIWholeQuadMode() :
230 MachineFunctionPass(ID) { }
231
232 bool runOnMachineFunction(MachineFunction &MF) override;
233
234 StringRef getPassName() const override { return "SI Whole Quad Mode"; }
235
236 void getAnalysisUsage(AnalysisUsage &AU) const override {
237 AU.addRequired<LiveIntervals>();
238 AU.addPreserved<SlotIndexes>();
239 AU.addPreserved<LiveIntervals>();
240 AU.addRequired<MachineDominatorTree>();
241 AU.addPreserved<MachineDominatorTree>();
242 AU.addRequired<MachinePostDominatorTree>();
243 AU.addPreserved<MachinePostDominatorTree>();
244 MachineFunctionPass::getAnalysisUsage(AU);
245 }
246
247 MachineFunctionProperties getClearedProperties() const override {
248 return MachineFunctionProperties().set(
249 MachineFunctionProperties::Property::IsSSA);
250 }
251 };
252
253 } // end anonymous namespace
254
255 char SIWholeQuadMode::ID = 0;
256
257 INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
258 false)
259 INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
260 INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
261 INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
262 INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
263 false)
264
265 char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID;
266
267 FunctionPass *llvm::createSIWholeQuadModePass() {
268 return new SIWholeQuadMode;
269 }
270
271 #ifndef NDEBUG
272 LLVM_DUMP_METHOD void SIWholeQuadMode::printInfo() {
273 for (const auto &BII : Blocks) {
274 dbgs() << "\n"
275 << printMBBReference(*BII.first) << ":\n"
276 << " InNeeds = " << PrintState(BII.second.InNeeds)
277 << ", Needs = " << PrintState(BII.second.Needs)
278 << ", OutNeeds = " << PrintState(BII.second.OutNeeds) << "\n\n";
279
280 for (const MachineInstr &MI : *BII.first) {
281 auto III = Instructions.find(&MI);
282 if (III == Instructions.end())
283 continue;
284
285 dbgs() << " " << MI << " Needs = " << PrintState(III->second.Needs)
286 << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n';
287 }
288 }
289 }
290 #endif
291
292 void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
293 std::vector<WorkItem> &Worklist) {
294 InstrInfo &II = Instructions[&MI];
295
296 assert(!(Flag & StateExact) && Flag != 0);
297
298 // Remove any disabled states from the flag. The user that required it gets
299 // an undefined value in the helper lanes. For example, this can happen if
300 // the result of an atomic is used by an instruction that requires WQM, where
301 // ignoring the request for WQM is correct as per the relevant specs.
302 Flag &= ~II.Disabled;
303
304 // Ignore if the flag is already encompassed by the existing needs, or we
305 // just disabled everything.
306 if ((II.Needs & Flag) == Flag)
307 return;
308
309 LLVM_DEBUG(dbgs() << "markInstruction " << PrintState(Flag) << ": " << MI);
310 II.Needs |= Flag;
311 Worklist.push_back(&MI);
312 }
313
314 /// Mark all relevant definitions of register \p Reg in usage \p UseMI.
315 void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR,
316 Register Reg, unsigned SubReg, char Flag,
317 std::vector<WorkItem> &Worklist) {
318 LLVM_DEBUG(dbgs() << "markDefs " << PrintState(Flag) << ": " << UseMI);
319
320 LiveQueryResult UseLRQ = LR.Query(LIS->getInstructionIndex(UseMI));
321 const VNInfo *Value = UseLRQ.valueIn();
322 if (!Value)
323 return;
324
325 // Note: this code assumes that lane masks on AMDGPU completely
326 // cover registers.
327 const LaneBitmask UseLanes =
328 SubReg ? TRI->getSubRegIndexLaneMask(SubReg)
329 : (Reg.isVirtual() ? MRI->getMaxLaneMaskForVReg(Reg)
330 : LaneBitmask::getNone());
331
332 // Perform a depth-first iteration of the LiveRange graph marking defs.
333 // Stop processing of a given branch when all use lanes have been defined.
334 // The first definition stops processing for a physical register.
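// For illustration: for a use of %reg.sub0_sub1 whose two halves are written
// by separate instructions, the walk marks both defining instructions and
// stops once DefinedLanes covers all UseLanes; phi-defs push a PhiEntry so
// that remaining predecessors can be resumed after each subgraph completes.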
335 struct PhiEntry {
336 const VNInfo *Phi;
337 unsigned PredIdx;
338 LaneBitmask DefinedLanes;
339
340 PhiEntry(const VNInfo *Phi, unsigned PredIdx, LaneBitmask DefinedLanes)
341 : Phi(Phi), PredIdx(PredIdx), DefinedLanes(DefinedLanes) {}
342 };
343 using VisitKey = std::pair<const VNInfo *, LaneBitmask>;
344 SmallVector<PhiEntry, 2> PhiStack;
345 SmallSet<VisitKey, 4> Visited;
346 LaneBitmask DefinedLanes;
347 unsigned NextPredIdx = 0; // Only used for processing phi nodes
348 do {
349 const VNInfo *NextValue = nullptr;
350 const VisitKey Key(Value, DefinedLanes);
351
352 if (Visited.insert(Key).second) {
353 // On the first visit to a phi, start processing at the first predecessor
354 NextPredIdx = 0;
355 }
356
357 if (Value->isPHIDef()) {
358 // Each predecessor node in the phi must be processed as a subgraph
359 const MachineBasicBlock *MBB = LIS->getMBBFromIndex(Value->def);
360 assert(MBB && "Phi-def has no defining MBB");
361
362 // Find next predecessor to process
363 unsigned Idx = NextPredIdx;
364 auto PI = MBB->pred_begin() + Idx;
365 auto PE = MBB->pred_end();
366 for (; PI != PE && !NextValue; ++PI, ++Idx) {
367 if (const VNInfo *VN = LR.getVNInfoBefore(LIS->getMBBEndIdx(*PI))) {
368 if (!Visited.count(VisitKey(VN, DefinedLanes)))
369 NextValue = VN;
370 }
371 }
372
373 // If there are more predecessors to process, add this phi to the stack
374 if (PI != PE)
375 PhiStack.emplace_back(Value, Idx, DefinedLanes);
376 } else {
377 MachineInstr *MI = LIS->getInstructionFromIndex(Value->def);
378 assert(MI && "Def has no defining instruction");
379
380 if (Reg.isVirtual()) {
381 // Iterate over all operands to find relevant definitions
382 bool HasDef = false;
383 for (const MachineOperand &Op : MI->operands()) {
384 if (!(Op.isReg() && Op.isDef() && Op.getReg() == Reg))
385 continue;
386
387 // Compute lanes defined and overlap with use
388 LaneBitmask OpLanes =
389 Op.isUndef() ? LaneBitmask::getAll()
390 : TRI->getSubRegIndexLaneMask(Op.getSubReg());
391 LaneBitmask Overlap = (UseLanes & OpLanes);
392
393 // Record whether this instruction defined any lanes of the use
394 HasDef |= Overlap.any();
395
396 // Mark any lanes defined
397 DefinedLanes |= OpLanes;
398 }
399
400 // Check if all lanes of use have been defined
401 if ((DefinedLanes & UseLanes) != UseLanes) {
402 // Definition not complete; need to process input value
403 LiveQueryResult LRQ = LR.Query(LIS->getInstructionIndex(*MI));
404 if (const VNInfo *VN = LRQ.valueIn()) {
405 if (!Visited.count(VisitKey(VN, DefinedLanes)))
406 NextValue = VN;
407 }
408 }
409
410 // Only mark the instruction if it defines some part of the use
411 if (HasDef)
412 markInstruction(*MI, Flag, Worklist);
413 } else {
414 // For physical registers simply mark the defining instruction
415 markInstruction(*MI, Flag, Worklist);
416 }
417 }
418
419 if (!NextValue && !PhiStack.empty()) {
420 // Reached the end of a chain; resume processing the last pending phi
421 PhiEntry &Entry = PhiStack.back();
422 NextValue = Entry.Phi;
423 NextPredIdx = Entry.PredIdx;
424 DefinedLanes = Entry.DefinedLanes;
425 PhiStack.pop_back();
426 }
427
428 Value = NextValue;
429 } while (Value);
430 }
431
432 void SIWholeQuadMode::markOperand(const MachineInstr &MI,
433 const MachineOperand &Op, char Flag,
434 std::vector<WorkItem> &Worklist) {
435 assert(Op.isReg());
436 Register Reg = Op.getReg();
437
438 // Ignore some hardware registers
439 switch (Reg) {
440 case AMDGPU::EXEC:
441 case AMDGPU::EXEC_LO:
442 return;
443 default:
444 break;
445 }
446
447 LLVM_DEBUG(dbgs() << "markOperand " << PrintState(Flag) << ": " << Op
448 << " for " << MI);
449 if (Reg.isVirtual()) {
450 LiveRange &LR = LIS->getInterval(Reg);
451 markDefs(MI, LR, Reg, Op.getSubReg(), Flag, Worklist);
452 } else {
453 // Handle physical registers that we need to track; this is mostly relevant
454 // for VCC, which can appear as the (implicit) input of a uniform branch,
455 // e.g. when a loop counter is stored in a VGPR.
456 for (MCRegUnitIterator RegUnit(Reg.asMCReg(), TRI); RegUnit.isValid();
457 ++RegUnit) {
458 LiveRange &LR = LIS->getRegUnit(*RegUnit);
459 const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
460 if (!Value)
461 continue;
462
463 markDefs(MI, LR, *RegUnit, AMDGPU::NoSubRegister, Flag, Worklist);
464 }
465 }
466 }
467
468 /// Mark all instructions defining the uses in \p MI with \p Flag.
469 void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
470 std::vector<WorkItem> &Worklist) {
471 LLVM_DEBUG(dbgs() << "markInstructionUses " << PrintState(Flag) << ": "
472 << MI);
473
474 for (const MachineOperand &Use : MI.uses()) {
475 if (!Use.isReg() || !Use.isUse())
476 continue;
477 markOperand(MI, Use, Flag, Worklist);
478 }
479 }
480
481 // Scan instructions to determine which ones require an Exact execmask and
482 // which ones seed WQM requirements.
483 char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
484 std::vector<WorkItem> &Worklist) {
485 char GlobalFlags = 0;
486 bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs");
487 SmallVector<MachineInstr *, 4> SetInactiveInstrs;
488 SmallVector<MachineInstr *, 4> SoftWQMInstrs;
489 bool HasImplicitDerivatives =
490 MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS;
491
492 // We need to visit the basic blocks in reverse post-order so that we visit
493 // defs before uses, in particular so that we don't accidentally mark an
494 // instruction as needing e.g. WQM before visiting it and realizing it needs
495 // WQM disabled.
496 ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
497 for (MachineBasicBlock *MBB : RPOT) {
498 BlockInfo &BBI = Blocks[MBB];
499
500 for (MachineInstr &MI : *MBB) {
501 InstrInfo &III = Instructions[&MI];
502 unsigned Opcode = MI.getOpcode();
503 char Flags = 0;
504
505 if (TII->isWQM(Opcode)) {
506 // If LOD is not supported WQM is not needed.
507 if (!ST->hasExtendedImageInsts())
508 continue;
509 // Only generate implicit WQM if implicit derivatives are required.
510 // This avoids inserting unintended WQM if a shader type without
511 // implicit derivatives uses an image sampling instruction.
512 if (!HasImplicitDerivatives)
513 continue;
514 // Sampling instructions don't need to produce results for all pixels
515 // in a quad, they just require all inputs of a quad to have been
516 // computed for derivatives.
517 markInstructionUses(MI, StateWQM, Worklist);
518 GlobalFlags |= StateWQM;
519 continue;
520 } else if (Opcode == AMDGPU::WQM) {
521 // The WQM intrinsic requires its output to have all the helper lanes
522 // correct, so we need it to be in WQM.
523 Flags = StateWQM;
524 LowerToCopyInstrs.push_back(&MI);
525 } else if (Opcode == AMDGPU::SOFT_WQM) {
526 LowerToCopyInstrs.push_back(&MI);
527 SoftWQMInstrs.push_back(&MI);
528 continue;
529 } else if (Opcode == AMDGPU::STRICT_WWM) {
530 // The STRICT_WWM intrinsic doesn't make the same guarantee, and in
531 // addition it needs to be executed in WQM or Exact so that its copy
532 // doesn't clobber inactive lanes.
533 markInstructionUses(MI, StateStrictWWM, Worklist);
534 GlobalFlags |= StateStrictWWM;
535 LowerToMovInstrs.push_back(&MI);
536 continue;
537 } else if (Opcode == AMDGPU::STRICT_WQM ||
538 TII->isDualSourceBlendEXP(MI)) {
539 // STRICT_WQM is similar to STRICT_WWM, but instead of enabling all
540 // threads of the wave like STRICT_WWM, STRICT_WQM enables all threads in
541 // quads that have at least one active thread.
542 markInstructionUses(MI, StateStrictWQM, Worklist);
543 GlobalFlags |= StateStrictWQM;
544
545 if (Opcode == AMDGPU::STRICT_WQM) {
546 LowerToMovInstrs.push_back(&MI);
547 } else {
548 // A dual source blend export acts as an implicit strict-wqm: its sources
549 // need to be shuffled in strict WQM, but the export itself needs to
550 // run in exact mode.
551 BBI.Needs |= StateExact;
552 if (!(BBI.InNeeds & StateExact)) {
553 BBI.InNeeds |= StateExact;
554 Worklist.push_back(MBB);
555 }
556 GlobalFlags |= StateExact;
557 III.Disabled = StateWQM | StateStrict;
558 }
559 continue;
560 } else if (Opcode == AMDGPU::LDS_PARAM_LOAD ||
561 Opcode == AMDGPU::LDS_DIRECT_LOAD) {
562 // Mark these STRICT_WQM, but only for the instruction, not its operands.
563 // This avoids unnecessarily marking M0 as requiring WQM.
564 InstrInfo &II = Instructions[&MI];
565 II.Needs |= StateStrictWQM;
566 GlobalFlags |= StateStrictWQM;
567 continue;
568 } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 ||
569 Opcode == AMDGPU::V_SET_INACTIVE_B64) {
570 III.Disabled = StateStrict;
571 MachineOperand &Inactive = MI.getOperand(2);
572 if (Inactive.isReg()) {
573 if (Inactive.isUndef()) {
574 LowerToCopyInstrs.push_back(&MI);
575 } else {
576 markOperand(MI, Inactive, StateStrictWWM, Worklist);
577 }
578 }
579 SetInactiveInstrs.push_back(&MI);
580 continue;
581 } else if (TII->isDisableWQM(MI)) {
582 BBI.Needs |= StateExact;
583 if (!(BBI.InNeeds & StateExact)) {
584 BBI.InNeeds |= StateExact;
585 Worklist.push_back(MBB);
586 }
587 GlobalFlags |= StateExact;
588 III.Disabled = StateWQM | StateStrict;
589 continue;
590 } else {
591 if (Opcode == AMDGPU::SI_PS_LIVE || Opcode == AMDGPU::SI_LIVE_MASK) {
592 LiveMaskQueries.push_back(&MI);
593 } else if (Opcode == AMDGPU::SI_KILL_I1_TERMINATOR ||
594 Opcode == AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR ||
595 Opcode == AMDGPU::SI_DEMOTE_I1) {
596 KillInstrs.push_back(&MI);
597 BBI.NeedsLowering = true;
598 } else if (WQMOutputs) {
599 // The function is in machine SSA form, which means that physical
600 // VGPRs correspond to shader inputs and outputs. Inputs are
601 // only used, outputs are only defined.
602 // FIXME: is this still valid?
603 for (const MachineOperand &MO : MI.defs()) {
604 if (!MO.isReg())
605 continue;
606
607 Register Reg = MO.getReg();
608
609 if (!Reg.isVirtual() &&
610 TRI->hasVectorRegisters(TRI->getPhysRegBaseClass(Reg))) {
611 Flags = StateWQM;
612 break;
613 }
614 }
615 }
616
617 if (!Flags)
618 continue;
619 }
620
621 markInstruction(MI, Flags, Worklist);
622 GlobalFlags |= Flags;
623 }
624 }
625
626 // Make sure that any SET_INACTIVE instructions are computed in WQM if WQM is
627 // ever used anywhere in the function. This implements the corresponding
628 // semantics of @llvm.amdgcn.set.inactive.
629 // Similarly for SOFT_WQM instructions, implementing @llvm.amdgcn.softwqm.
630 if (GlobalFlags & StateWQM) {
631 for (MachineInstr *MI : SetInactiveInstrs)
632 markInstruction(*MI, StateWQM, Worklist);
633 for (MachineInstr *MI : SoftWQMInstrs)
634 markInstruction(*MI, StateWQM, Worklist);
635 }
636
637 return GlobalFlags;
638 }
639
640 void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
641 std::vector<WorkItem>& Worklist) {
642 MachineBasicBlock *MBB = MI.getParent();
643 InstrInfo II = Instructions[&MI]; // take a copy to prevent dangling references
644 BlockInfo &BI = Blocks[MBB];
645
646 // Control flow-type instructions and stores to temporary memory that are
647 // followed by WQM computations must themselves be in WQM.
648 if ((II.OutNeeds & StateWQM) && !(II.Disabled & StateWQM) &&
649 (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) {
650 Instructions[&MI].Needs = StateWQM;
651 II.Needs = StateWQM;
652 }
653
654 // Propagate to block level
655 if (II.Needs & StateWQM) {
656 BI.Needs |= StateWQM;
657 if (!(BI.InNeeds & StateWQM)) {
658 BI.InNeeds |= StateWQM;
659 Worklist.push_back(MBB);
660 }
661 }
662
663 // Propagate backwards within block
664 if (MachineInstr *PrevMI = MI.getPrevNode()) {
665 char InNeeds = (II.Needs & ~StateStrict) | II.OutNeeds;
666 if (!PrevMI->isPHI()) {
667 InstrInfo &PrevII = Instructions[PrevMI];
668 if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
669 PrevII.OutNeeds |= InNeeds;
670 Worklist.push_back(PrevMI);
671 }
672 }
673 }
674
675 // Propagate WQM flag to instruction inputs
676 assert(!(II.Needs & StateExact));
677
678 if (II.Needs != 0)
679 markInstructionUses(MI, II.Needs, Worklist);
680
681 // Ensure we process a block containing StrictWWM/StrictWQM, even if it does
682 // not require any WQM transitions.
683 if (II.Needs & StateStrictWWM)
684 BI.Needs |= StateStrictWWM;
685 if (II.Needs & StateStrictWQM)
686 BI.Needs |= StateStrictWQM;
687 }
688
689 void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
690 std::vector<WorkItem>& Worklist) {
691 BlockInfo BI = Blocks[&MBB]; // Make a copy to prevent dangling references.
692
693 // Propagate through instructions
694 if (!MBB.empty()) {
695 MachineInstr *LastMI = &*MBB.rbegin();
696 InstrInfo &LastII = Instructions[LastMI];
697 if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) {
698 LastII.OutNeeds |= BI.OutNeeds;
699 Worklist.push_back(LastMI);
700 }
701 }
702
703 // Predecessor blocks must provide for our WQM/Exact needs.
704 for (MachineBasicBlock *Pred : MBB.predecessors()) {
705 BlockInfo &PredBI = Blocks[Pred];
706 if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds)
707 continue;
708
709 PredBI.OutNeeds |= BI.InNeeds;
710 PredBI.InNeeds |= BI.InNeeds;
711 Worklist.push_back(Pred);
712 }
713
714 // All successors must be prepared to accept the same set of WQM/Exact data.
715 for (MachineBasicBlock *Succ : MBB.successors()) {
716 BlockInfo &SuccBI = Blocks[Succ];
717 if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds)
718 continue;
719
720 SuccBI.InNeeds |= BI.OutNeeds;
721 Worklist.push_back(Succ);
722 }
723 }
724
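// Seed the requirements with scanInstructions(), then drive the combined
// instruction/block worklist to a fixed point. The returned flags describe
// which states are needed anywhere in the function.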
725 char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
726 std::vector<WorkItem> Worklist;
727 char GlobalFlags = scanInstructions(MF, Worklist);
728
729 while (!Worklist.empty()) {
730 WorkItem WI = Worklist.back();
731 Worklist.pop_back();
732
733 if (WI.MI)
734 propagateInstruction(*WI.MI, Worklist);
735 else
736 propagateBlock(*WI.MBB, Worklist);
737 }
738
739 return GlobalFlags;
740 }
741
742 MachineBasicBlock::iterator
743 SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB,
744 MachineBasicBlock::iterator Before) {
745 Register SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
746
747 MachineInstr *Save =
748 BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), SaveReg)
749 .addReg(AMDGPU::SCC);
750 MachineInstr *Restore =
751 BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::SCC)
752 .addReg(SaveReg);
753
754 LIS->InsertMachineInstrInMaps(*Save);
755 LIS->InsertMachineInstrInMaps(*Restore);
756 LIS->createAndComputeVirtRegInterval(SaveReg);
757
758 return Restore;
759 }
760
761 MachineBasicBlock *SIWholeQuadMode::splitBlock(MachineBasicBlock *BB,
762 MachineInstr *TermMI) {
763 LLVM_DEBUG(dbgs() << "Split block " << printMBBReference(*BB) << " @ "
764 << *TermMI << "\n");
765
766 MachineBasicBlock *SplitBB =
767 BB->splitAt(*TermMI, /*UpdateLiveIns*/ true, LIS);
768
769 // Convert last instruction in block to a terminator.
770 // Note: this only covers the expected patterns
771 unsigned NewOpcode = 0;
772 switch (TermMI->getOpcode()) {
773 case AMDGPU::S_AND_B32:
774 NewOpcode = AMDGPU::S_AND_B32_term;
775 break;
776 case AMDGPU::S_AND_B64:
777 NewOpcode = AMDGPU::S_AND_B64_term;
778 break;
779 case AMDGPU::S_MOV_B32:
780 NewOpcode = AMDGPU::S_MOV_B32_term;
781 break;
782 case AMDGPU::S_MOV_B64:
783 NewOpcode = AMDGPU::S_MOV_B64_term;
784 break;
785 default:
786 break;
787 }
788 if (NewOpcode)
789 TermMI->setDesc(TII->get(NewOpcode));
790
791 if (SplitBB != BB) {
792 // Update dominator trees
793 using DomTreeT = DomTreeBase<MachineBasicBlock>;
794 SmallVector<DomTreeT::UpdateType, 16> DTUpdates;
795 for (MachineBasicBlock *Succ : SplitBB->successors()) {
796 DTUpdates.push_back({DomTreeT::Insert, SplitBB, Succ});
797 DTUpdates.push_back({DomTreeT::Delete, BB, Succ});
798 }
799 DTUpdates.push_back({DomTreeT::Insert, BB, SplitBB});
800 if (MDT)
801 MDT->getBase().applyUpdates(DTUpdates);
802 if (PDT)
803 PDT->getBase().applyUpdates(DTUpdates);
804
805 // Link blocks
806 MachineInstr *MI =
807 BuildMI(*BB, BB->end(), DebugLoc(), TII->get(AMDGPU::S_BRANCH))
808 .addMBB(SplitBB);
809 LIS->InsertMachineInstrInMaps(*MI);
810 }
811
812 return SplitBB;
813 }
814
815 MachineInstr *SIWholeQuadMode::lowerKillF32(MachineBasicBlock &MBB,
816 MachineInstr &MI) {
817 const DebugLoc &DL = MI.getDebugLoc();
818 unsigned Opcode = 0;
819
820 assert(MI.getOperand(0).isReg());
821
822 // Comparison is for live lanes; however here we compute the inverse
823 // (killed lanes). This is because VCMP will always generate 0 bits
824 // for inactive lanes so a mask of live lanes would not be correct
825 // inside control flow.
826 // Invert the comparison by swapping the operands and adjusting
827 // the comparison codes.
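// For example, a kill whose live condition is SETOLT(Op0, Op1) ("keep lanes
// where Op0 < Op1") is lowered to V_CMP_NGT_F32 Op1, Op0, whose result is
// exactly the mask of lanes to kill.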
828
829 switch (MI.getOperand(2).getImm()) {
830 case ISD::SETUEQ:
831 Opcode = AMDGPU::V_CMP_LG_F32_e64;
832 break;
833 case ISD::SETUGT:
834 Opcode = AMDGPU::V_CMP_GE_F32_e64;
835 break;
836 case ISD::SETUGE:
837 Opcode = AMDGPU::V_CMP_GT_F32_e64;
838 break;
839 case ISD::SETULT:
840 Opcode = AMDGPU::V_CMP_LE_F32_e64;
841 break;
842 case ISD::SETULE:
843 Opcode = AMDGPU::V_CMP_LT_F32_e64;
844 break;
845 case ISD::SETUNE:
846 Opcode = AMDGPU::V_CMP_EQ_F32_e64;
847 break;
848 case ISD::SETO:
849 Opcode = AMDGPU::V_CMP_O_F32_e64;
850 break;
851 case ISD::SETUO:
852 Opcode = AMDGPU::V_CMP_U_F32_e64;
853 break;
854 case ISD::SETOEQ:
855 case ISD::SETEQ:
856 Opcode = AMDGPU::V_CMP_NEQ_F32_e64;
857 break;
858 case ISD::SETOGT:
859 case ISD::SETGT:
860 Opcode = AMDGPU::V_CMP_NLT_F32_e64;
861 break;
862 case ISD::SETOGE:
863 case ISD::SETGE:
864 Opcode = AMDGPU::V_CMP_NLE_F32_e64;
865 break;
866 case ISD::SETOLT:
867 case ISD::SETLT:
868 Opcode = AMDGPU::V_CMP_NGT_F32_e64;
869 break;
870 case ISD::SETOLE:
871 case ISD::SETLE:
872 Opcode = AMDGPU::V_CMP_NGE_F32_e64;
873 break;
874 case ISD::SETONE:
875 case ISD::SETNE:
876 Opcode = AMDGPU::V_CMP_NLG_F32_e64;
877 break;
878 default:
879 llvm_unreachable("invalid ISD:SET cond code");
880 }
881
882 // Pick opcode based on comparison type.
883 MachineInstr *VcmpMI;
884 const MachineOperand &Op0 = MI.getOperand(0);
885 const MachineOperand &Op1 = MI.getOperand(1);
886
887 // VCC represents lanes killed.
888 Register VCC = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;
889
890 if (TRI->isVGPR(*MRI, Op0.getReg())) {
891 Opcode = AMDGPU::getVOPe32(Opcode);
892 VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode)).add(Op1).add(Op0);
893 } else {
894 VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode))
895 .addReg(VCC, RegState::Define)
896 .addImm(0) // src0 modifiers
897 .add(Op1)
898 .addImm(0) // src1 modifiers
899 .add(Op0)
900 .addImm(0); // omod
901 }
902
903 MachineInstr *MaskUpdateMI =
904 BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
905 .addReg(LiveMaskReg)
906 .addReg(VCC);
907
908 // The state of SCC represents whether any lanes are live in the mask;
909 // if SCC is 0 then no lanes will be alive anymore.
910 MachineInstr *EarlyTermMI =
911 BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0));
912
913 MachineInstr *ExecMaskMI =
914 BuildMI(MBB, MI, DL, TII->get(AndN2Opc), Exec).addReg(Exec).addReg(VCC);
915
916 assert(MBB.succ_size() == 1);
917 MachineInstr *NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
918 .addMBB(*MBB.succ_begin());
919
920 // Update live intervals
921 LIS->ReplaceMachineInstrInMaps(MI, *VcmpMI);
922 MBB.remove(&MI);
923
924 LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
925 LIS->InsertMachineInstrInMaps(*ExecMaskMI);
926 LIS->InsertMachineInstrInMaps(*EarlyTermMI);
927 LIS->InsertMachineInstrInMaps(*NewTerm);
928
929 return NewTerm;
930 }
931
932 MachineInstr *SIWholeQuadMode::lowerKillI1(MachineBasicBlock &MBB,
933 MachineInstr &MI, bool IsWQM) {
934 const DebugLoc &DL = MI.getDebugLoc();
935 MachineInstr *MaskUpdateMI = nullptr;
936
937 const bool IsDemote = IsWQM && (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1);
938 const MachineOperand &Op = MI.getOperand(0);
939 int64_t KillVal = MI.getOperand(1).getImm();
940 MachineInstr *ComputeKilledMaskMI = nullptr;
941 Register CndReg = !Op.isImm() ? Op.getReg() : Register();
942 Register TmpReg;
943
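// A kill with an immediate condition (e.g. a kill or demote of a constant)
// is "static": it either kills every active lane or is a no-op. Otherwise
// the condition register decides dynamically which lanes die.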
944 // Is this a static or dynamic kill?
945 if (Op.isImm()) {
946 if (Op.getImm() == KillVal) {
947 // Static: all active lanes are killed
948 MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
949 .addReg(LiveMaskReg)
950 .addReg(Exec);
951 } else {
952 // Static: kill does nothing
953 MachineInstr *NewTerm = nullptr;
954 if (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1) {
955 LIS->RemoveMachineInstrFromMaps(MI);
956 } else {
957 assert(MBB.succ_size() == 1);
958 NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
959 .addMBB(*MBB.succ_begin());
960 LIS->ReplaceMachineInstrInMaps(MI, *NewTerm);
961 }
962 MBB.remove(&MI);
963 return NewTerm;
964 }
965 } else {
966 if (!KillVal) {
967 // Op represents live lanes after kill,
968 // so exec mask needs to be factored in.
969 TmpReg = MRI->createVirtualRegister(TRI->getBoolRC());
970 ComputeKilledMaskMI =
971 BuildMI(MBB, MI, DL, TII->get(XorOpc), TmpReg).add(Op).addReg(Exec);
972 MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
973 .addReg(LiveMaskReg)
974 .addReg(TmpReg);
975 } else {
976 // Op represents lanes to kill
977 MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
978 .addReg(LiveMaskReg)
979 .add(Op);
980 }
981 }
982
983 // The state of SCC represents whether any lanes are live in the mask;
984 // if SCC is 0 then no lanes will be alive anymore.
985 MachineInstr *EarlyTermMI =
986 BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0));
987
988 // If we got this far, some lanes are still live; update EXEC to
989 // deactivate lanes as appropriate.
990 MachineInstr *NewTerm;
991 MachineInstr *WQMMaskMI = nullptr;
992 Register LiveMaskWQM;
993 if (IsDemote) {
994 // Demote - deactivate quads with only helper lanes
995 LiveMaskWQM = MRI->createVirtualRegister(TRI->getBoolRC());
996 WQMMaskMI =
997 BuildMI(MBB, MI, DL, TII->get(WQMOpc), LiveMaskWQM).addReg(LiveMaskReg);
998 NewTerm = BuildMI(MBB, MI, DL, TII->get(AndOpc), Exec)
999 .addReg(Exec)
1000 .addReg(LiveMaskWQM);
1001 } else {
1002 // Kill - deactivate lanes no longer in live mask
1003 if (Op.isImm()) {
1004 unsigned MovOpc = ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
1005 NewTerm = BuildMI(MBB, &MI, DL, TII->get(MovOpc), Exec).addImm(0);
1006 } else if (!IsWQM) {
1007 NewTerm = BuildMI(MBB, &MI, DL, TII->get(AndOpc), Exec)
1008 .addReg(Exec)
1009 .addReg(LiveMaskReg);
1010 } else {
1011 unsigned Opcode = KillVal ? AndN2Opc : AndOpc;
1012 NewTerm =
1013 BuildMI(MBB, &MI, DL, TII->get(Opcode), Exec).addReg(Exec).add(Op);
1014 }
1015 }
1016
1017 // Update live intervals
1018 LIS->RemoveMachineInstrFromMaps(MI);
1019 MBB.remove(&MI);
1020 assert(EarlyTermMI);
1021 assert(MaskUpdateMI);
1022 assert(NewTerm);
1023 if (ComputeKilledMaskMI)
1024 LIS->InsertMachineInstrInMaps(*ComputeKilledMaskMI);
1025 LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
1026 LIS->InsertMachineInstrInMaps(*EarlyTermMI);
1027 if (WQMMaskMI)
1028 LIS->InsertMachineInstrInMaps(*WQMMaskMI);
1029 LIS->InsertMachineInstrInMaps(*NewTerm);
1030
1031 if (CndReg) {
1032 LIS->removeInterval(CndReg);
1033 LIS->createAndComputeVirtRegInterval(CndReg);
1034 }
1035 if (TmpReg)
1036 LIS->createAndComputeVirtRegInterval(TmpReg);
1037 if (LiveMaskWQM)
1038 LIS->createAndComputeVirtRegInterval(LiveMaskWQM);
1039
1040 return NewTerm;
1041 }
1042
1043 // Convert a strict mode transition to a pseudo transition.
1044 // This still pre-allocates registers to prevent clobbering,
1045 // but avoids any EXEC mask changes.
1046 void SIWholeQuadMode::lowerPseudoStrictMode(MachineBasicBlock &MBB,
1047 MachineInstr *Entry,
1048 MachineInstr *Exit) {
1049 assert(Entry->getOpcode() == AMDGPU::ENTER_STRICT_WQM);
1050 assert(Exit->getOpcode() == AMDGPU::EXIT_STRICT_WQM);
1051
1052 Register SaveOrig = Entry->getOperand(0).getReg();
1053
1054 MachineInstr *NewEntry =
1055 BuildMI(MBB, Entry, DebugLoc(), TII->get(AMDGPU::ENTER_PSEUDO_WM));
1056 MachineInstr *NewExit =
1057 BuildMI(MBB, Exit, DebugLoc(), TII->get(AMDGPU::EXIT_PSEUDO_WM));
1058
1059 LIS->ReplaceMachineInstrInMaps(*Exit, *NewExit);
1060 Exit->eraseFromParent();
1061
1062 LIS->ReplaceMachineInstrInMaps(*Entry, *NewEntry);
1063 Entry->eraseFromParent();
1064
1065 LIS->removeInterval(SaveOrig);
1066 }
1067
1068 // Replace (or supplement) instructions accessing the live mask.
1069 // This can only happen once all the live mask registers have been created
1070 // and the execution state (WQM/StrictWWM/Exact) of instructions is known.
1071 void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
1072 auto BII = Blocks.find(&MBB);
1073 if (BII == Blocks.end())
1074 return;
1075
1076 const BlockInfo &BI = BII->second;
1077 if (!BI.NeedsLowering)
1078 return;
1079
1080 LLVM_DEBUG(dbgs() << "\nLowering block " << printMBBReference(MBB) << ":\n");
1081
1082 SmallVector<MachineInstr *, 4> SplitPoints;
1083 char State = BI.InitialState;
1084 MachineInstr *StrictEntry = nullptr;
1085
1086 for (MachineInstr &MI : llvm::make_early_inc_range(
1087 llvm::make_range(MBB.getFirstNonPHI(), MBB.end()))) {
1088 char PreviousState = State;
1089
1090 if (StateTransition.count(&MI))
1091 State = StateTransition[&MI];
1092
1093 MachineInstr *SplitPoint = nullptr;
1094 switch (MI.getOpcode()) {
1095 case AMDGPU::SI_DEMOTE_I1:
1096 case AMDGPU::SI_KILL_I1_TERMINATOR:
1097 SplitPoint = lowerKillI1(MBB, MI, State == StateWQM);
1098 break;
1099 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
1100 SplitPoint = lowerKillF32(MBB, MI);
1101 break;
1102 case AMDGPU::ENTER_STRICT_WQM:
1103 StrictEntry = PreviousState == StateWQM ? &MI : nullptr;
1104 break;
1105 case AMDGPU::EXIT_STRICT_WQM:
1106 if (State == StateWQM && StrictEntry) {
1107 // Transition WQM -> StrictWQM -> WQM detected.
1108 lowerPseudoStrictMode(MBB, StrictEntry, &MI);
1109 }
1110 StrictEntry = nullptr;
1111 break;
1112 case AMDGPU::ENTER_STRICT_WWM:
1113 case AMDGPU::EXIT_STRICT_WWM:
1114 StrictEntry = nullptr;
1115 break;
1116 default:
1117 break;
1118 }
1119 if (SplitPoint)
1120 SplitPoints.push_back(SplitPoint);
1121 }
1122
1123 // Perform splitting after instruction scan to simplify iteration.
1124 if (!SplitPoints.empty()) {
1125 MachineBasicBlock *BB = &MBB;
1126 for (MachineInstr *MI : SplitPoints) {
1127 BB = splitBlock(BB, MI);
1128 }
1129 }
1130 }
1131
1132 // Return an iterator in the (inclusive) range [First, Last] at which
1133 // instructions can be safely inserted, keeping in mind that some of the
1134 // instructions we want to add necessarily clobber SCC.
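// The chosen point walks SCC's live segments from the preferred end of the
// range towards the other end until it reaches a gap where SCC is dead; if
// SCC is live across the whole range it is saved and restored around the
// insertion point via saveSCC().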
1135 MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion(
1136 MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
1137 MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC) {
1138 if (!SaveSCC)
1139 return PreferLast ? Last : First;
1140
1141 LiveRange &LR =
1142 LIS->getRegUnit(*MCRegUnitIterator(MCRegister::from(AMDGPU::SCC), TRI));
1143 auto MBBE = MBB.end();
1144 SlotIndex FirstIdx = First != MBBE ? LIS->getInstructionIndex(*First)
1145 : LIS->getMBBEndIdx(&MBB);
1146 SlotIndex LastIdx =
1147 Last != MBBE ? LIS->getInstructionIndex(*Last) : LIS->getMBBEndIdx(&MBB);
1148 SlotIndex Idx = PreferLast ? LastIdx : FirstIdx;
1149 const LiveRange::Segment *S;
1150
1151 for (;;) {
1152 S = LR.getSegmentContaining(Idx);
1153 if (!S)
1154 break;
1155
1156 if (PreferLast) {
1157 SlotIndex Next = S->start.getBaseIndex();
1158 if (Next < FirstIdx)
1159 break;
1160 Idx = Next;
1161 } else {
1162 MachineInstr *EndMI = LIS->getInstructionFromIndex(S->end.getBaseIndex());
1163 assert(EndMI && "Segment does not end on valid instruction");
1164 auto NextI = std::next(EndMI->getIterator());
1165 if (NextI == MBB.end())
1166 break;
1167 SlotIndex Next = LIS->getInstructionIndex(*NextI);
1168 if (Next > LastIdx)
1169 break;
1170 Idx = Next;
1171 }
1172 }
1173
1174 MachineBasicBlock::iterator MBBI;
1175
1176 if (MachineInstr *MI = LIS->getInstructionFromIndex(Idx))
1177 MBBI = MI;
1178 else {
1179 assert(Idx == LIS->getMBBEndIdx(&MBB));
1180 MBBI = MBB.end();
1181 }
1182
1183 // Move insertion point past any operations modifying EXEC.
1184 // This assumes that the value of SCC defined by any of these operations
1185 // does not need to be preserved.
1186 while (MBBI != Last) {
1187 bool IsExecDef = false;
1188 for (const MachineOperand &MO : MBBI->operands()) {
1189 if (MO.isReg() && MO.isDef()) {
1190 IsExecDef |=
1191 MO.getReg() == AMDGPU::EXEC_LO || MO.getReg() == AMDGPU::EXEC;
1192 }
1193 }
1194 if (!IsExecDef)
1195 break;
1196 MBBI++;
1197 S = nullptr;
1198 }
1199
1200 if (S)
1201 MBBI = saveSCC(MBB, MBBI);
1202
1203 return MBBI;
1204 }
1205
1206 void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
1207 MachineBasicBlock::iterator Before,
1208 Register SaveWQM) {
1209 MachineInstr *MI;
1210
1211 if (SaveWQM) {
1212 MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AndSaveExecOpc), SaveWQM)
1213 .addReg(LiveMaskReg);
1214 } else {
1215 MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AndOpc), Exec)
1216 .addReg(Exec)
1217 .addReg(LiveMaskReg);
1218 }
1219
1220 LIS->InsertMachineInstrInMaps(*MI);
1221 StateTransition[MI] = StateExact;
1222 }
1223
1224 void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
1225 MachineBasicBlock::iterator Before,
1226 Register SavedWQM) {
1227 MachineInstr *MI;
1228
1229 if (SavedWQM) {
1230 MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), Exec)
1231 .addReg(SavedWQM);
1232 } else {
1233 MI = BuildMI(MBB, Before, DebugLoc(), TII->get(WQMOpc), Exec).addReg(Exec);
1234 }
1235
1236 LIS->InsertMachineInstrInMaps(*MI);
1237 StateTransition[MI] = StateWQM;
1238 }
1239
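// Enter strict mode by emitting an ENTER_STRICT_WWM/WQM pseudo that defines
// SaveOrig with a copy of the current EXEC; later stages of the backend
// expand these pseudos into roughly the save/restore sequences shown in the
// file header comment.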
1240 void SIWholeQuadMode::toStrictMode(MachineBasicBlock &MBB,
1241 MachineBasicBlock::iterator Before,
1242 Register SaveOrig, char StrictStateNeeded) {
1243 MachineInstr *MI;
1244 assert(SaveOrig);
1245 assert(StrictStateNeeded == StateStrictWWM ||
1246 StrictStateNeeded == StateStrictWQM);
1247
1248 if (StrictStateNeeded == StateStrictWWM) {
1249 MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WWM),
1250 SaveOrig)
1251 .addImm(-1);
1252 } else {
1253 MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WQM),
1254 SaveOrig)
1255 .addImm(-1);
1256 }
1257 LIS->InsertMachineInstrInMaps(*MI);
1258 StateTransition[MI] = StrictStateNeeded;
1259
1260 // Mark the block as needing lowering so it can be checked for unnecessary transitions.
1261 auto BII = Blocks.find(&MBB);
1262 if (BII != Blocks.end())
1263 BII->second.NeedsLowering = true;
1264 }
1265
1266 void SIWholeQuadMode::fromStrictMode(MachineBasicBlock &MBB,
1267 MachineBasicBlock::iterator Before,
1268 Register SavedOrig, char NonStrictState,
1269 char CurrentStrictState) {
1270 MachineInstr *MI;
1271
1272 assert(SavedOrig);
1273 assert(CurrentStrictState == StateStrictWWM ||
1274 CurrentStrictState == StateStrictWQM);
1275
1276 if (CurrentStrictState == StateStrictWWM) {
1277 MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WWM),
1278 Exec)
1279 .addReg(SavedOrig);
1280 } else {
1281 MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WQM),
1282 Exec)
1283 .addReg(SavedOrig);
1284 }
1285 LIS->InsertMachineInstrInMaps(*MI);
1286 StateTransition[MI] = NonStrictState;
1287 }
1288
1289 void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) {
1290 auto BII = Blocks.find(&MBB);
1291 if (BII == Blocks.end())
1292 return;
1293
1294 BlockInfo &BI = BII->second;
1295
1296 // This is a non-entry block that is WQM throughout, so no need to do
1297 // anything.
1298 if (!IsEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact) {
1299 BI.InitialState = StateWQM;
1300 return;
1301 }
1302
1303 LLVM_DEBUG(dbgs() << "\nProcessing block " << printMBBReference(MBB)
1304 << ":\n");
1305
1306 Register SavedWQMReg;
1307 Register SavedNonStrictReg;
1308 bool WQMFromExec = IsEntry;
1309 char State = (IsEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM;
1310 char NonStrictState = 0;
1311 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
1312
1313 auto II = MBB.getFirstNonPHI(), IE = MBB.end();
1314 if (IsEntry) {
1315 // Skip the instruction that saves LiveMask
1316 if (II != IE && II->getOpcode() == AMDGPU::COPY)
1317 ++II;
1318 }
1319
1320 // This stores the first instruction where it's safe to switch from WQM to
1321 // Exact or vice versa.
1322 MachineBasicBlock::iterator FirstWQM = IE;
1323
1324 // This stores the first instruction where it's safe to switch from Strict
1325 // mode to Exact/WQM or to switch to Strict mode. It must always be the same
1326 // as, or after, FirstWQM since if it's safe to switch to/from Strict, it must
1327 // be safe to switch to/from WQM as well.
1328 MachineBasicBlock::iterator FirstStrict = IE;
1329
1330 // Record the initial state in the block information.
1331 BI.InitialState = State;
1332
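// Walk the block once. State tracks the current EXEC state; Needs is the
// set of states the next instruction tolerates. When they become disjoint,
// a transition is inserted somewhere in the window between FirstWQM /
// FirstStrict and the current instruction, as chosen by prepareInsertion().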
1333 for (;;) {
1334 MachineBasicBlock::iterator Next = II;
1335 char Needs = StateExact | StateWQM; // Strict mode is disabled by default.
1336 char OutNeeds = 0;
1337
1338 if (FirstWQM == IE)
1339 FirstWQM = II;
1340
1341 if (FirstStrict == IE)
1342 FirstStrict = II;
1343
1344 // First, figure out the allowed states (Needs) based on the propagated
1345 // flags.
1346 if (II != IE) {
1347 MachineInstr &MI = *II;
1348
1349 if (MI.isTerminator() || TII->mayReadEXEC(*MRI, MI)) {
1350 auto III = Instructions.find(&MI);
1351 if (III != Instructions.end()) {
1352 if (III->second.Needs & StateStrictWWM)
1353 Needs = StateStrictWWM;
1354 else if (III->second.Needs & StateStrictWQM)
1355 Needs = StateStrictWQM;
1356 else if (III->second.Needs & StateWQM)
1357 Needs = StateWQM;
1358 else
1359 Needs &= ~III->second.Disabled;
1360 OutNeeds = III->second.OutNeeds;
1361 }
1362 } else {
1363 // If the instruction doesn't actually need a correct EXEC, then we can
1364 // safely leave Strict mode enabled.
1365 Needs = StateExact | StateWQM | StateStrict;
1366 }
1367
1368 if (MI.isTerminator() && OutNeeds == StateExact)
1369 Needs = StateExact;
1370
1371 ++Next;
1372 } else {
1373 // End of basic block
1374 if (BI.OutNeeds & StateWQM)
1375 Needs = StateWQM;
1376 else if (BI.OutNeeds == StateExact)
1377 Needs = StateExact;
1378 else
1379 Needs = StateWQM | StateExact;
1380 }
1381
1382 // Now, transition if necessary.
1383 if (!(Needs & State)) {
1384 MachineBasicBlock::iterator First;
1385 if (State == StateStrictWWM || Needs == StateStrictWWM ||
1386 State == StateStrictWQM || Needs == StateStrictWQM) {
1387 // We must switch to or from Strict mode.
1388 First = FirstStrict;
1389 } else {
1390 // We only need to switch to/from WQM, so we can use FirstWQM.
1391 First = FirstWQM;
1392 }
1393
1394 // Whether we need to save SCC depends on start and end states.
1395 bool SaveSCC = false;
1396 switch (State) {
1397 case StateExact:
1398 case StateStrictWWM:
1399 case StateStrictWQM:
1400 // Exact/Strict -> Strict: save SCC
1401 // Exact/Strict -> WQM: save SCC if WQM mask is generated from exec
1402 // Exact/Strict -> Exact: no save
1403 SaveSCC = (Needs & StateStrict) || ((Needs & StateWQM) && WQMFromExec);
1404 break;
1405 case StateWQM:
1406 // WQM -> Exact/Strict: save SCC
1407 SaveSCC = !(Needs & StateWQM);
1408 break;
1409 default:
1410 llvm_unreachable("Unknown state");
1411 break;
1412 }
1413 MachineBasicBlock::iterator Before =
1414 prepareInsertion(MBB, First, II, Needs == StateWQM, SaveSCC);
1415
1416 if (State & StateStrict) {
1417 assert(State == StateStrictWWM || State == StateStrictWQM);
1418 assert(SavedNonStrictReg);
1419 fromStrictMode(MBB, Before, SavedNonStrictReg, NonStrictState, State);
1420
1421 LIS->createAndComputeVirtRegInterval(SavedNonStrictReg);
1422 SavedNonStrictReg = 0;
1423 State = NonStrictState;
1424 }
1425
1426 if (Needs & StateStrict) {
1427 NonStrictState = State;
1428 assert(Needs == StateStrictWWM || Needs == StateStrictWQM);
1429 assert(!SavedNonStrictReg);
1430 SavedNonStrictReg = MRI->createVirtualRegister(BoolRC);
1431
1432 toStrictMode(MBB, Before, SavedNonStrictReg, Needs);
1433 State = Needs;
1434
1435 } else {
1436 if (State == StateWQM && (Needs & StateExact) && !(Needs & StateWQM)) {
1437 if (!WQMFromExec && (OutNeeds & StateWQM)) {
1438 assert(!SavedWQMReg);
1439 SavedWQMReg = MRI->createVirtualRegister(BoolRC);
1440 }
1441
1442 toExact(MBB, Before, SavedWQMReg);
1443 State = StateExact;
1444 } else if (State == StateExact && (Needs & StateWQM) &&
1445 !(Needs & StateExact)) {
1446 assert(WQMFromExec == (SavedWQMReg == 0));
1447
1448 toWQM(MBB, Before, SavedWQMReg);
1449
1450 if (SavedWQMReg) {
1451 LIS->createAndComputeVirtRegInterval(SavedWQMReg);
1452 SavedWQMReg = 0;
1453 }
1454 State = StateWQM;
1455 } else {
1456 // We can get here if we transitioned from StrictWWM to a
1457 // non-StrictWWM state that already matches our needs; in that
1458 // case there is nothing more to do.
1459 assert(Needs & State);
1460 }
1461 }
1462 }
1463
1464 if (Needs != (StateExact | StateWQM | StateStrict)) {
1465 if (Needs != (StateExact | StateWQM))
1466 FirstWQM = IE;
1467 FirstStrict = IE;
1468 }
1469
1470 if (II == IE)
1471 break;
1472
1473 II = Next;
1474 }
1475 assert(!SavedWQMReg);
1476 assert(!SavedNonStrictReg);
1477 }
1478
1479 void SIWholeQuadMode::lowerLiveMaskQueries() {
1480 for (MachineInstr *MI : LiveMaskQueries) {
1481 const DebugLoc &DL = MI->getDebugLoc();
1482 Register Dest = MI->getOperand(0).getReg();
1483
1484 MachineInstr *Copy =
1485 BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
1486 .addReg(LiveMaskReg);
1487
1488 LIS->ReplaceMachineInstrInMaps(*MI, *Copy);
1489 MI->eraseFromParent();
1490 }
1491 }
1492
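// Rewrite the pseudos recorded during scanning now that EXEC transitions are
// in place: STRICT_WWM / STRICT_WQM become V_MOV (or plain SGPR copies with
// their EXEC dependency removed), while WQM, SOFT_WQM and undef
// V_SET_INACTIVE become COPYs.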
1493 void SIWholeQuadMode::lowerCopyInstrs() {
1494 for (MachineInstr *MI : LowerToMovInstrs) {
1495 assert(MI->getNumExplicitOperands() == 2);
1496
1497 const Register Reg = MI->getOperand(0).getReg();
1498
1499 const TargetRegisterClass *regClass =
1500 TRI->getRegClassForOperandReg(*MRI, MI->getOperand(0));
1501 if (TRI->isVGPRClass(regClass)) {
1502 const unsigned MovOp = TII->getMovOpcode(regClass);
1503 MI->setDesc(TII->get(MovOp));
1504
1505 // Check that it already implicitly depends on exec (like all VALU movs
1506 // should do).
1507 assert(any_of(MI->implicit_operands(), [](const MachineOperand &MO) {
1508 return MO.isUse() && MO.getReg() == AMDGPU::EXEC;
1509 }));
1510 } else {
1511 // Remove early-clobber and exec dependency from simple SGPR copies.
1512 // This allows some to be eliminated during/post RA.
1513 LLVM_DEBUG(dbgs() << "simplify SGPR copy: " << *MI);
1514 if (MI->getOperand(0).isEarlyClobber()) {
1515 LIS->removeInterval(Reg);
1516 MI->getOperand(0).setIsEarlyClobber(false);
1517 LIS->createAndComputeVirtRegInterval(Reg);
1518 }
1519 int Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC);
1520 while (Index >= 0) {
1521 MI->removeOperand(Index);
1522 Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC);
1523 }
1524 MI->setDesc(TII->get(AMDGPU::COPY));
1525 LLVM_DEBUG(dbgs() << " -> " << *MI);
1526 }
1527 }
1528 for (MachineInstr *MI : LowerToCopyInstrs) {
1529 if (MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32 ||
1530 MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B64) {
1531 assert(MI->getNumExplicitOperands() == 3);
1532 // The only reason we should be here is that V_SET_INACTIVE has an
1533 // undef input, so it is being replaced by a simple copy.
1534 // Remove the second (undef) source operand.
1535 assert(MI->getOperand(2).isUndef());
1536 MI->removeOperand(2);
1537 MI->untieRegOperand(1);
1538 } else {
1539 assert(MI->getNumExplicitOperands() == 2);
1540 }
1541
1542 MI->setDesc(TII->get(AMDGPU::COPY));
1543 }
1544 }
1545
1546 void SIWholeQuadMode::lowerKillInstrs(bool IsWQM) {
1547 for (MachineInstr *MI : KillInstrs) {
1548 MachineBasicBlock *MBB = MI->getParent();
1549 MachineInstr *SplitPoint = nullptr;
1550 switch (MI->getOpcode()) {
1551 case AMDGPU::SI_DEMOTE_I1:
1552 case AMDGPU::SI_KILL_I1_TERMINATOR:
1553 SplitPoint = lowerKillI1(*MBB, *MI, IsWQM);
1554 break;
1555 case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
1556 SplitPoint = lowerKillF32(*MBB, *MI);
1557 break;
1558 default:
1559 continue;
1560 }
1561 if (SplitPoint)
1562 splitBlock(MBB, SplitPoint);
1563 }
1564 }
1565
1566 bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
1567 LLVM_DEBUG(dbgs() << "SI Whole Quad Mode on " << MF.getName()
1568 << " ------------- \n");
1569 LLVM_DEBUG(MF.dump(););
1570
1571 Instructions.clear();
1572 Blocks.clear();
1573 LiveMaskQueries.clear();
1574 LowerToCopyInstrs.clear();
1575 LowerToMovInstrs.clear();
1576 KillInstrs.clear();
1577 StateTransition.clear();
1578
1579 ST = &MF.getSubtarget<GCNSubtarget>();
1580
1581 TII = ST->getInstrInfo();
1582 TRI = &TII->getRegisterInfo();
1583 MRI = &MF.getRegInfo();
1584 LIS = &getAnalysis<LiveIntervals>();
1585 MDT = &getAnalysis<MachineDominatorTree>();
1586 PDT = &getAnalysis<MachinePostDominatorTree>();
1587
1588 if (ST->isWave32()) {
1589 AndOpc = AMDGPU::S_AND_B32;
1590 AndN2Opc = AMDGPU::S_ANDN2_B32;
1591 XorOpc = AMDGPU::S_XOR_B32;
1592 AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
1593 OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B32;
1594 WQMOpc = AMDGPU::S_WQM_B32;
1595 Exec = AMDGPU::EXEC_LO;
1596 } else {
1597 AndOpc = AMDGPU::S_AND_B64;
1598 AndN2Opc = AMDGPU::S_ANDN2_B64;
1599 XorOpc = AMDGPU::S_XOR_B64;
1600 AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
1601 OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B64;
1602 WQMOpc = AMDGPU::S_WQM_B64;
1603 Exec = AMDGPU::EXEC;
1604 }
1605
1606 const char GlobalFlags = analyzeFunction(MF);
1607 const bool NeedsLiveMask = !(KillInstrs.empty() && LiveMaskQueries.empty());
1608
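// By default the live mask is simply EXEC; a separate copy is created below
// only when kills, live-mask queries or WQM require the original mask.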
1609 LiveMaskReg = Exec;
1610
1611 // The shader is simple and does not need any state changes or complex lowering.
1612 if (!(GlobalFlags & (StateWQM | StateStrict)) && LowerToCopyInstrs.empty() &&
1613 LowerToMovInstrs.empty() && KillInstrs.empty()) {
1614 lowerLiveMaskQueries();
1615 return !LiveMaskQueries.empty();
1616 }
1617
1618 MachineBasicBlock &Entry = MF.front();
1619 MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI();
1620
1621 // Store a copy of the original live mask when required
1622 if (NeedsLiveMask || (GlobalFlags & StateWQM)) {
1623 LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC());
1624 MachineInstr *MI =
1625 BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg)
1626 .addReg(Exec);
1627 LIS->InsertMachineInstrInMaps(*MI);
1628 }
1629
1630 LLVM_DEBUG(printInfo());
1631
1632 lowerLiveMaskQueries();
1633 lowerCopyInstrs();
1634
1635 // Shader only needs WQM
1636 if (GlobalFlags == StateWQM) {
1637 auto MI = BuildMI(Entry, EntryMI, DebugLoc(), TII->get(WQMOpc), Exec)
1638 .addReg(Exec);
1639 LIS->InsertMachineInstrInMaps(*MI);
1640 lowerKillInstrs(true);
1641 } else {
1642 for (auto BII : Blocks)
1643 processBlock(*BII.first, BII.first == &Entry);
1644 // Lowering blocks causes block splitting so perform as a second pass.
1645 for (auto BII : Blocks)
1646 lowerBlock(*BII.first);
1647 }
1648
1649 // Compute live range for live mask
1650 if (LiveMaskReg != Exec)
1651 LIS->createAndComputeVirtRegInterval(LiveMaskReg);
1652
1653 // Physical registers like SCC aren't tracked by default anyway, so just
1654 // removing the ranges we computed is the simplest option for maintaining
1655 // the analysis results.
1656 LIS->removeAllRegUnitsForPhysReg(AMDGPU::SCC);
1657
1658 // If we performed any kills then recompute EXEC
1659 if (!KillInstrs.empty())
1660 LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC);
1661
1662 return true;
1663 }
1664