1 //===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode -----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// This pass adds instructions to enable whole quad mode (strict or non-strict)
11 /// for pixel shaders, and strict whole wavefront mode for all programs.
12 ///
/// The "strict" prefix indicates that inactive lanes do not take part in
/// control flow; specifically, an inactive lane enabled by strict WQM/WWM will
/// always be enabled irrespective of control flow decisions. Conversely, in
/// non-strict WQM inactive lanes may take part in control flow decisions.
17 ///
18 /// Whole quad mode is required for derivative computations, but it interferes
/// with shader side effects (stores and atomics). This pass ensures that WQM
/// is enabled when necessary, but disabled around stores and atomics.
21 ///
22 /// When necessary, this pass creates a function prolog
23 ///
24 ///   S_MOV_B64 LiveMask, EXEC
25 ///   S_WQM_B64 EXEC, EXEC
26 ///
/// to enter WQM at the top of the function, and surrounds blocks of Exact
/// instructions with
29 ///
30 ///   S_AND_SAVEEXEC_B64 Tmp, LiveMask
31 ///   ...
32 ///   S_MOV_B64 EXEC, Tmp
33 ///
34 /// We also compute when a sequence of instructions requires strict whole
35 /// wavefront mode (StrictWWM) and insert instructions to save and restore it:
36 ///
37 ///   S_OR_SAVEEXEC_B64 Tmp, -1
38 ///   ...
39 ///   S_MOV_B64 EXEC, Tmp
40 ///
41 /// When a sequence of instructions requires strict whole quad mode (StrictWQM)
42 /// we use a similar save and restore mechanism and force whole quad mode for
43 /// those instructions:
44 ///
45 ///  S_MOV_B64 Tmp, EXEC
46 ///  S_WQM_B64 EXEC, EXEC
47 ///  ...
48 ///  S_MOV_B64 EXEC, Tmp
49 ///
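/// On wave32 subtargets the equivalent *_B32 opcodes and EXEC_LO are used; the
/// 64-bit forms are shown above only for brevity.
///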
50 /// In order to avoid excessive switching during sequences of Exact
51 /// instructions, the pass first analyzes which instructions must be run in WQM
52 /// (aka which instructions produce values that lead to derivative
53 /// computations).
54 ///
55 /// Basic blocks are always exited in WQM as long as some successor needs WQM.
56 ///
57 /// There is room for improvement given better control flow analysis:
58 ///
59 ///  (1) at the top level (outside of control flow statements, and as long as
60 ///      kill hasn't been used), one SGPR can be saved by recovering WQM from
61 ///      the LiveMask (this is implemented for the entry block).
62 ///
63 ///  (2) when entire regions (e.g. if-else blocks or entire loops) only
64 ///      consist of exact and don't-care instructions, the switch only has to
65 ///      be done at the entry and exit points rather than potentially in each
66 ///      block of the region.
67 ///
68 //===----------------------------------------------------------------------===//
69 
70 #include "AMDGPU.h"
71 #include "GCNSubtarget.h"
72 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
73 #include "llvm/ADT/MapVector.h"
74 #include "llvm/ADT/PostOrderIterator.h"
75 #include "llvm/CodeGen/LiveIntervals.h"
76 #include "llvm/CodeGen/MachineBasicBlock.h"
77 #include "llvm/CodeGen/MachineDominators.h"
78 #include "llvm/CodeGen/MachineFunctionPass.h"
79 #include "llvm/CodeGen/MachineInstr.h"
80 #include "llvm/CodeGen/MachinePostDominators.h"
81 #include "llvm/IR/CallingConv.h"
82 #include "llvm/InitializePasses.h"
83 #include "llvm/Support/raw_ostream.h"
84 
85 using namespace llvm;
86 
87 #define DEBUG_TYPE "si-wqm"
88 
89 namespace {
90 
91 enum {
92   StateWQM = 0x1,
93   StateStrictWWM = 0x2,
94   StateStrictWQM = 0x4,
95   StateExact = 0x8,
96   StateStrict = StateStrictWWM | StateStrictWQM,
97 };
98 
99 struct PrintState {
100 public:
101   int State;
102 
103   explicit PrintState(int State) : State(State) {}
104 };
105 
106 #ifndef NDEBUG
107 static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {
108 
109   static const std::pair<char, const char *> Mapping[] = {
110       std::pair(StateWQM, "WQM"), std::pair(StateStrictWWM, "StrictWWM"),
111       std::pair(StateStrictWQM, "StrictWQM"), std::pair(StateExact, "Exact")};
112   char State = PS.State;
113   for (auto M : Mapping) {
114     if (State & M.first) {
115       OS << M.second;
116       State &= ~M.first;
117 
118       if (State)
119         OS << '|';
120     }
121   }
122   assert(State == 0);
123   return OS;
124 }
125 #endif
126 
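// Per-instruction analysis state. As an illustrative (hypothetical) example:
// an image sample whose result feeds a derivative gets Needs = StateWQM, an
// instruction marked DisableWQM (e.g. a store or atomic, per the file header)
// gets Disabled = StateWQM | StateStrict, and OutNeeds accumulates the states
// still required by instructions that follow on some path.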
127 struct InstrInfo {
128   char Needs = 0;
129   char Disabled = 0;
130   char OutNeeds = 0;
131 };
132 
133 struct BlockInfo {
134   char Needs = 0;
135   char InNeeds = 0;
136   char OutNeeds = 0;
137   char InitialState = 0;
138   bool NeedsLowering = false;
139 };
140 
141 struct WorkItem {
142   MachineBasicBlock *MBB = nullptr;
143   MachineInstr *MI = nullptr;
144 
145   WorkItem() = default;
146   WorkItem(MachineBasicBlock *MBB) : MBB(MBB) {}
147   WorkItem(MachineInstr *MI) : MI(MI) {}
148 };
149 
150 class SIWholeQuadMode : public MachineFunctionPass {
151 private:
152   const SIInstrInfo *TII;
153   const SIRegisterInfo *TRI;
154   const GCNSubtarget *ST;
155   MachineRegisterInfo *MRI;
156   LiveIntervals *LIS;
157   MachineDominatorTree *MDT;
158   MachinePostDominatorTree *PDT;
159 
160   unsigned AndOpc;
161   unsigned AndTermOpc;
162   unsigned AndN2Opc;
163   unsigned XorOpc;
164   unsigned AndSaveExecOpc;
165   unsigned AndSaveExecTermOpc;
166   unsigned WQMOpc;
167   Register Exec;
168   Register LiveMaskReg;
169 
170   DenseMap<const MachineInstr *, InstrInfo> Instructions;
171   MapVector<MachineBasicBlock *, BlockInfo> Blocks;
172 
173   // Tracks state (WQM/StrictWWM/StrictWQM/Exact) after a given instruction
174   DenseMap<const MachineInstr *, char> StateTransition;
175 
176   SmallVector<MachineInstr *, 2> LiveMaskQueries;
177   SmallVector<MachineInstr *, 4> LowerToMovInstrs;
178   SmallVector<MachineInstr *, 4> LowerToCopyInstrs;
179   SmallVector<MachineInstr *, 4> KillInstrs;
180 
181   void printInfo();
182 
183   void markInstruction(MachineInstr &MI, char Flag,
184                        std::vector<WorkItem> &Worklist);
185   void markDefs(const MachineInstr &UseMI, LiveRange &LR, Register Reg,
186                 unsigned SubReg, char Flag, std::vector<WorkItem> &Worklist);
187   void markOperand(const MachineInstr &MI, const MachineOperand &Op, char Flag,
188                    std::vector<WorkItem> &Worklist);
189   void markInstructionUses(const MachineInstr &MI, char Flag,
190                            std::vector<WorkItem> &Worklist);
191   char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
192   void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
193   void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
194   char analyzeFunction(MachineFunction &MF);
195 
196   MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB,
197                                       MachineBasicBlock::iterator Before);
198   MachineBasicBlock::iterator
199   prepareInsertion(MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
200                    MachineBasicBlock::iterator Last, bool PreferLast,
201                    bool SaveSCC);
202   void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
203                Register SaveWQM);
204   void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
205              Register SavedWQM);
206   void toStrictMode(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
207                     Register SaveOrig, char StrictStateNeeded);
208   void fromStrictMode(MachineBasicBlock &MBB,
209                       MachineBasicBlock::iterator Before, Register SavedOrig,
210                       char NonStrictState, char CurrentStrictState);
211 
212   MachineBasicBlock *splitBlock(MachineBasicBlock *BB, MachineInstr *TermMI);
213 
214   MachineInstr *lowerKillI1(MachineBasicBlock &MBB, MachineInstr &MI,
215                             bool IsWQM);
216   MachineInstr *lowerKillF32(MachineBasicBlock &MBB, MachineInstr &MI);
217   void lowerPseudoStrictMode(MachineBasicBlock &MBB, MachineInstr *Entry,
218                              MachineInstr *Exit);
219 
220   void lowerBlock(MachineBasicBlock &MBB);
221   void processBlock(MachineBasicBlock &MBB, bool IsEntry);
222 
223   void lowerLiveMaskQueries();
224   void lowerCopyInstrs();
225   void lowerKillInstrs(bool IsWQM);
226 
227 public:
228   static char ID;
229 
  SIWholeQuadMode() : MachineFunctionPass(ID) {}
232 
233   bool runOnMachineFunction(MachineFunction &MF) override;
234 
235   StringRef getPassName() const override { return "SI Whole Quad Mode"; }
236 
237   void getAnalysisUsage(AnalysisUsage &AU) const override {
238     AU.addRequired<LiveIntervals>();
239     AU.addPreserved<SlotIndexes>();
240     AU.addPreserved<LiveIntervals>();
241     AU.addPreserved<MachineDominatorTree>();
242     AU.addPreserved<MachinePostDominatorTree>();
243     MachineFunctionPass::getAnalysisUsage(AU);
244   }
245 
246   MachineFunctionProperties getClearedProperties() const override {
247     return MachineFunctionProperties().set(
248         MachineFunctionProperties::Property::IsSSA);
249   }
250 };
251 
252 } // end anonymous namespace
253 
254 char SIWholeQuadMode::ID = 0;
255 
256 INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
257                       false)
258 INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
259 INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
260 INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
261 INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
262                     false)
263 
264 char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID;
265 
266 FunctionPass *llvm::createSIWholeQuadModePass() {
267   return new SIWholeQuadMode;
268 }
269 
270 #ifndef NDEBUG
271 LLVM_DUMP_METHOD void SIWholeQuadMode::printInfo() {
272   for (const auto &BII : Blocks) {
273     dbgs() << "\n"
274            << printMBBReference(*BII.first) << ":\n"
275            << "  InNeeds = " << PrintState(BII.second.InNeeds)
276            << ", Needs = " << PrintState(BII.second.Needs)
277            << ", OutNeeds = " << PrintState(BII.second.OutNeeds) << "\n\n";
278 
279     for (const MachineInstr &MI : *BII.first) {
280       auto III = Instructions.find(&MI);
281       if (III == Instructions.end())
282         continue;
283 
284       dbgs() << "  " << MI << "    Needs = " << PrintState(III->second.Needs)
285              << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n';
286     }
287   }
288 }
289 #endif
290 
291 void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
292                                       std::vector<WorkItem> &Worklist) {
293   InstrInfo &II = Instructions[&MI];
294 
295   assert(!(Flag & StateExact) && Flag != 0);
296 
297   // Remove any disabled states from the flag. The user that required it gets
  // an undefined value in the helper lanes. For example, this can happen if
  // the result of an atomic is used by an instruction that requires WQM,
  // where ignoring the request for WQM is correct as per the relevant specs.
301   Flag &= ~II.Disabled;
302 
303   // Ignore if the flag is already encompassed by the existing needs, or we
304   // just disabled everything.
305   if ((II.Needs & Flag) == Flag)
306     return;
307 
308   LLVM_DEBUG(dbgs() << "markInstruction " << PrintState(Flag) << ": " << MI);
309   II.Needs |= Flag;
310   Worklist.push_back(&MI);
311 }
312 
313 /// Mark all relevant definitions of register \p Reg in usage \p UseMI.
314 void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR,
315                                Register Reg, unsigned SubReg, char Flag,
316                                std::vector<WorkItem> &Worklist) {
317   LLVM_DEBUG(dbgs() << "markDefs " << PrintState(Flag) << ": " << UseMI);
318 
319   LiveQueryResult UseLRQ = LR.Query(LIS->getInstructionIndex(UseMI));
320   const VNInfo *Value = UseLRQ.valueIn();
321   if (!Value)
322     return;
323 
324   // Note: this code assumes that lane masks on AMDGPU completely
325   // cover registers.
326   const LaneBitmask UseLanes =
327       SubReg ? TRI->getSubRegIndexLaneMask(SubReg)
328              : (Reg.isVirtual() ? MRI->getMaxLaneMaskForVReg(Reg)
329                                 : LaneBitmask::getNone());
330 
331   // Perform a depth-first iteration of the LiveRange graph marking defs.
332   // Stop processing of a given branch when all use lanes have been defined.
333   // The first definition stops processing for a physical register.
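  // Illustrative example (hypothetical MIR): for a use of %5:vreg_64 whose
  // halves are defined separately,
  //   %5.sub0:vreg_64 = V_MOV_B32 ...
  //   %5.sub1:vreg_64 = V_MOV_B32 ...
  // neither def alone covers UseLanes, so the walk follows the incoming value
  // from one def back to the other and marks both with Flag; phi-defs met on
  // the way are expanded one predecessor at a time via PhiStack.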
334   struct PhiEntry {
335     const VNInfo *Phi;
336     unsigned PredIdx;
337     LaneBitmask DefinedLanes;
338 
339     PhiEntry(const VNInfo *Phi, unsigned PredIdx, LaneBitmask DefinedLanes)
340         : Phi(Phi), PredIdx(PredIdx), DefinedLanes(DefinedLanes) {}
341   };
342   using VisitKey = std::pair<const VNInfo *, LaneBitmask>;
343   SmallVector<PhiEntry, 2> PhiStack;
344   SmallSet<VisitKey, 4> Visited;
345   LaneBitmask DefinedLanes;
346   unsigned NextPredIdx = 0; // Only used for processing phi nodes
347   do {
348     const VNInfo *NextValue = nullptr;
349     const VisitKey Key(Value, DefinedLanes);
350 
351     if (Visited.insert(Key).second) {
      // On the first visit to a phi, start processing at its first predecessor
353       NextPredIdx = 0;
354     }
355 
356     if (Value->isPHIDef()) {
357       // Each predecessor node in the phi must be processed as a subgraph
358       const MachineBasicBlock *MBB = LIS->getMBBFromIndex(Value->def);
359       assert(MBB && "Phi-def has no defining MBB");
360 
361       // Find next predecessor to process
362       unsigned Idx = NextPredIdx;
363       auto PI = MBB->pred_begin() + Idx;
364       auto PE = MBB->pred_end();
365       for (; PI != PE && !NextValue; ++PI, ++Idx) {
366         if (const VNInfo *VN = LR.getVNInfoBefore(LIS->getMBBEndIdx(*PI))) {
367           if (!Visited.count(VisitKey(VN, DefinedLanes)))
368             NextValue = VN;
369         }
370       }
371 
      // If there are more predecessors to process, add the phi to the stack.
373       if (PI != PE)
374         PhiStack.emplace_back(Value, Idx, DefinedLanes);
375     } else {
376       MachineInstr *MI = LIS->getInstructionFromIndex(Value->def);
377       assert(MI && "Def has no defining instruction");
378 
379       if (Reg.isVirtual()) {
380         // Iterate over all operands to find relevant definitions
381         bool HasDef = false;
382         for (const MachineOperand &Op : MI->all_defs()) {
383           if (Op.getReg() != Reg)
384             continue;
385 
386           // Compute lanes defined and overlap with use
387           LaneBitmask OpLanes =
388               Op.isUndef() ? LaneBitmask::getAll()
389                            : TRI->getSubRegIndexLaneMask(Op.getSubReg());
390           LaneBitmask Overlap = (UseLanes & OpLanes);
391 
          // Record if this instruction defined any lanes of the use.
393           HasDef |= Overlap.any();
394 
395           // Mark any lanes defined
396           DefinedLanes |= OpLanes;
397         }
398 
399         // Check if all lanes of use have been defined
400         if ((DefinedLanes & UseLanes) != UseLanes) {
401           // Definition not complete; need to process input value
402           LiveQueryResult LRQ = LR.Query(LIS->getInstructionIndex(*MI));
403           if (const VNInfo *VN = LRQ.valueIn()) {
404             if (!Visited.count(VisitKey(VN, DefinedLanes)))
405               NextValue = VN;
406           }
407         }
408 
409         // Only mark the instruction if it defines some part of the use
410         if (HasDef)
411           markInstruction(*MI, Flag, Worklist);
412       } else {
413         // For physical registers simply mark the defining instruction
414         markInstruction(*MI, Flag, Worklist);
415       }
416     }
417 
418     if (!NextValue && !PhiStack.empty()) {
      // Reached the end of a chain; revert to processing the last phi.
420       PhiEntry &Entry = PhiStack.back();
421       NextValue = Entry.Phi;
422       NextPredIdx = Entry.PredIdx;
423       DefinedLanes = Entry.DefinedLanes;
424       PhiStack.pop_back();
425     }
426 
427     Value = NextValue;
428   } while (Value);
429 }
430 
431 void SIWholeQuadMode::markOperand(const MachineInstr &MI,
432                                   const MachineOperand &Op, char Flag,
433                                   std::vector<WorkItem> &Worklist) {
434   assert(Op.isReg());
435   Register Reg = Op.getReg();
436 
437   // Ignore some hardware registers
438   switch (Reg) {
439   case AMDGPU::EXEC:
440   case AMDGPU::EXEC_LO:
441     return;
442   default:
443     break;
444   }
445 
446   LLVM_DEBUG(dbgs() << "markOperand " << PrintState(Flag) << ": " << Op
447                     << " for " << MI);
448   if (Reg.isVirtual()) {
449     LiveRange &LR = LIS->getInterval(Reg);
450     markDefs(MI, LR, Reg, Op.getSubReg(), Flag, Worklist);
451   } else {
452     // Handle physical registers that we need to track; this is mostly relevant
453     // for VCC, which can appear as the (implicit) input of a uniform branch,
454     // e.g. when a loop counter is stored in a VGPR.
455     for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg())) {
456       LiveRange &LR = LIS->getRegUnit(Unit);
457       const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
458       if (!Value)
459         continue;
460 
461       markDefs(MI, LR, Unit, AMDGPU::NoSubRegister, Flag, Worklist);
462     }
463   }
464 }
465 
466 /// Mark all instructions defining the uses in \p MI with \p Flag.
467 void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
468                                           std::vector<WorkItem> &Worklist) {
469   LLVM_DEBUG(dbgs() << "markInstructionUses " << PrintState(Flag) << ": "
470                     << MI);
471 
472   for (const MachineOperand &Use : MI.all_uses())
473     markOperand(MI, Use, Flag, Worklist);
474 }
475 
476 // Scan instructions to determine which ones require an Exact execmask and
477 // which ones seed WQM requirements.
478 char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
479                                        std::vector<WorkItem> &Worklist) {
480   char GlobalFlags = 0;
481   bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs");
482   SmallVector<MachineInstr *, 4> SetInactiveInstrs;
483   SmallVector<MachineInstr *, 4> SoftWQMInstrs;
484   bool HasImplicitDerivatives =
485       MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS;
486 
487   // We need to visit the basic blocks in reverse post-order so that we visit
488   // defs before uses, in particular so that we don't accidentally mark an
489   // instruction as needing e.g. WQM before visiting it and realizing it needs
490   // WQM disabled.
491   ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
492   for (MachineBasicBlock *MBB : RPOT) {
493     BlockInfo &BBI = Blocks[MBB];
494 
495     for (MachineInstr &MI : *MBB) {
496       InstrInfo &III = Instructions[&MI];
497       unsigned Opcode = MI.getOpcode();
498       char Flags = 0;
499 
500       if (TII->isWQM(Opcode)) {
501         // If LOD is not supported WQM is not needed.
502         if (!ST->hasExtendedImageInsts())
503           continue;
504         // Only generate implicit WQM if implicit derivatives are required.
505         // This avoids inserting unintended WQM if a shader type without
506         // implicit derivatives uses an image sampling instruction.
507         if (!HasImplicitDerivatives)
508           continue;
509         // Sampling instructions don't need to produce results for all pixels
510         // in a quad, they just require all inputs of a quad to have been
511         // computed for derivatives.
512         markInstructionUses(MI, StateWQM, Worklist);
513         GlobalFlags |= StateWQM;
514         continue;
515       } else if (Opcode == AMDGPU::WQM) {
516         // The WQM intrinsic requires its output to have all the helper lanes
517         // correct, so we need it to be in WQM.
518         Flags = StateWQM;
519         LowerToCopyInstrs.push_back(&MI);
520       } else if (Opcode == AMDGPU::SOFT_WQM) {
521         LowerToCopyInstrs.push_back(&MI);
522         SoftWQMInstrs.push_back(&MI);
523         continue;
524       } else if (Opcode == AMDGPU::STRICT_WWM) {
        // The STRICT_WWM intrinsic doesn't make the same guarantee, and in
        // addition it needs to be executed in WQM or Exact so that its copy
        // doesn't clobber inactive lanes.
528         markInstructionUses(MI, StateStrictWWM, Worklist);
529         GlobalFlags |= StateStrictWWM;
530         LowerToMovInstrs.push_back(&MI);
531         continue;
532       } else if (Opcode == AMDGPU::STRICT_WQM ||
533                  TII->isDualSourceBlendEXP(MI)) {
        // STRICT_WQM is similar to STRICT_WWM, but instead of enabling all
        // threads of the wave like STRICT_WWM, STRICT_WQM enables all threads
        // in quads that have at least one active thread.
537         markInstructionUses(MI, StateStrictWQM, Worklist);
538         GlobalFlags |= StateStrictWQM;
539 
540         if (Opcode == AMDGPU::STRICT_WQM) {
541           LowerToMovInstrs.push_back(&MI);
542         } else {
          // Dual source blend export acts as implicit strict-wqm; its sources
544           // need to be shuffled in strict wqm, but the export itself needs to
545           // run in exact mode.
546           BBI.Needs |= StateExact;
547           if (!(BBI.InNeeds & StateExact)) {
548             BBI.InNeeds |= StateExact;
549             Worklist.push_back(MBB);
550           }
551           GlobalFlags |= StateExact;
552           III.Disabled = StateWQM | StateStrict;
553         }
554         continue;
555       } else if (Opcode == AMDGPU::LDS_PARAM_LOAD ||
556                  Opcode == AMDGPU::LDS_DIRECT_LOAD) {
        // Mark these as STRICT_WQM, but only for the instruction, not its
        // operands. This avoids unnecessarily marking M0 as requiring WQM.
559         InstrInfo &II = Instructions[&MI];
560         II.Needs |= StateStrictWQM;
561         GlobalFlags |= StateStrictWQM;
562         continue;
563       } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 ||
564                  Opcode == AMDGPU::V_SET_INACTIVE_B64) {
565         III.Disabled = StateStrict;
566         MachineOperand &Inactive = MI.getOperand(2);
567         if (Inactive.isReg()) {
568           if (Inactive.isUndef()) {
569             LowerToCopyInstrs.push_back(&MI);
570           } else {
571             markOperand(MI, Inactive, StateStrictWWM, Worklist);
572           }
573         }
574         SetInactiveInstrs.push_back(&MI);
575         continue;
576       } else if (TII->isDisableWQM(MI)) {
577         BBI.Needs |= StateExact;
578         if (!(BBI.InNeeds & StateExact)) {
579           BBI.InNeeds |= StateExact;
580           Worklist.push_back(MBB);
581         }
582         GlobalFlags |= StateExact;
583         III.Disabled = StateWQM | StateStrict;
584         continue;
585       } else {
586         if (Opcode == AMDGPU::SI_PS_LIVE || Opcode == AMDGPU::SI_LIVE_MASK) {
587           LiveMaskQueries.push_back(&MI);
588         } else if (Opcode == AMDGPU::SI_KILL_I1_TERMINATOR ||
589                    Opcode == AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR ||
590                    Opcode == AMDGPU::SI_DEMOTE_I1) {
591           KillInstrs.push_back(&MI);
592           BBI.NeedsLowering = true;
593         } else if (WQMOutputs) {
594           // The function is in machine SSA form, which means that physical
595           // VGPRs correspond to shader inputs and outputs. Inputs are
596           // only used, outputs are only defined.
597           // FIXME: is this still valid?
598           for (const MachineOperand &MO : MI.defs()) {
599             if (!MO.isReg())
600               continue;
601 
602             Register Reg = MO.getReg();
603 
604             if (!Reg.isVirtual() &&
605                 TRI->hasVectorRegisters(TRI->getPhysRegBaseClass(Reg))) {
606               Flags = StateWQM;
607               break;
608             }
609           }
610         }
611 
612         if (!Flags)
613           continue;
614       }
615 
616       markInstruction(MI, Flags, Worklist);
617       GlobalFlags |= Flags;
618     }
619   }
620 
  // Make sure that any SET_INACTIVE instructions are computed in WQM if WQM is
622   // ever used anywhere in the function. This implements the corresponding
623   // semantics of @llvm.amdgcn.set.inactive.
624   // Similarly for SOFT_WQM instructions, implementing @llvm.amdgcn.softwqm.
625   if (GlobalFlags & StateWQM) {
626     for (MachineInstr *MI : SetInactiveInstrs)
627       markInstruction(*MI, StateWQM, Worklist);
628     for (MachineInstr *MI : SoftWQMInstrs)
629       markInstruction(*MI, StateWQM, Worklist);
630   }
631 
632   return GlobalFlags;
633 }
634 
635 void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
636                                            std::vector<WorkItem>& Worklist) {
637   MachineBasicBlock *MBB = MI.getParent();
  // Take a copy to prevent dangling references.
  InstrInfo II = Instructions[&MI];
639   BlockInfo &BI = Blocks[MBB];
640 
641   // Control flow-type instructions and stores to temporary memory that are
642   // followed by WQM computations must themselves be in WQM.
643   if ((II.OutNeeds & StateWQM) && !(II.Disabled & StateWQM) &&
644       (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) {
645     Instructions[&MI].Needs = StateWQM;
646     II.Needs = StateWQM;
647   }
648 
649   // Propagate to block level
650   if (II.Needs & StateWQM) {
651     BI.Needs |= StateWQM;
652     if (!(BI.InNeeds & StateWQM)) {
653       BI.InNeeds |= StateWQM;
654       Worklist.push_back(MBB);
655     }
656   }
657 
658   // Propagate backwards within block
659   if (MachineInstr *PrevMI = MI.getPrevNode()) {
660     char InNeeds = (II.Needs & ~StateStrict) | II.OutNeeds;
661     if (!PrevMI->isPHI()) {
662       InstrInfo &PrevII = Instructions[PrevMI];
663       if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
664         PrevII.OutNeeds |= InNeeds;
665         Worklist.push_back(PrevMI);
666       }
667     }
668   }
669 
670   // Propagate WQM flag to instruction inputs
671   assert(!(II.Needs & StateExact));
672 
673   if (II.Needs != 0)
674     markInstructionUses(MI, II.Needs, Worklist);
675 
676   // Ensure we process a block containing StrictWWM/StrictWQM, even if it does
677   // not require any WQM transitions.
678   if (II.Needs & StateStrictWWM)
679     BI.Needs |= StateStrictWWM;
680   if (II.Needs & StateStrictWQM)
681     BI.Needs |= StateStrictWQM;
682 }
683 
684 void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
685                                      std::vector<WorkItem>& Worklist) {
686   BlockInfo BI = Blocks[&MBB]; // Make a copy to prevent dangling references.
687 
688   // Propagate through instructions
689   if (!MBB.empty()) {
690     MachineInstr *LastMI = &*MBB.rbegin();
691     InstrInfo &LastII = Instructions[LastMI];
692     if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) {
693       LastII.OutNeeds |= BI.OutNeeds;
694       Worklist.push_back(LastMI);
695     }
696   }
697 
698   // Predecessor blocks must provide for our WQM/Exact needs.
699   for (MachineBasicBlock *Pred : MBB.predecessors()) {
700     BlockInfo &PredBI = Blocks[Pred];
701     if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds)
702       continue;
703 
704     PredBI.OutNeeds |= BI.InNeeds;
705     PredBI.InNeeds |= BI.InNeeds;
706     Worklist.push_back(Pred);
707   }
708 
709   // All successors must be prepared to accept the same set of WQM/Exact data.
710   for (MachineBasicBlock *Succ : MBB.successors()) {
711     BlockInfo &SuccBI = Blocks[Succ];
712     if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds)
713       continue;
714 
715     SuccBI.InNeeds |= BI.OutNeeds;
716     Worklist.push_back(Succ);
717   }
718 }
719 
720 char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
721   std::vector<WorkItem> Worklist;
722   char GlobalFlags = scanInstructions(MF, Worklist);
723 
724   while (!Worklist.empty()) {
725     WorkItem WI = Worklist.back();
726     Worklist.pop_back();
727 
728     if (WI.MI)
729       propagateInstruction(*WI.MI, Worklist);
730     else
731       propagateBlock(*WI.MBB, Worklist);
732   }
733 
734   return GlobalFlags;
735 }
736 
737 MachineBasicBlock::iterator
738 SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB,
739                          MachineBasicBlock::iterator Before) {
740   Register SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
741 
742   MachineInstr *Save =
743       BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), SaveReg)
744           .addReg(AMDGPU::SCC);
745   MachineInstr *Restore =
746       BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::SCC)
747           .addReg(SaveReg);
748 
749   LIS->InsertMachineInstrInMaps(*Save);
750   LIS->InsertMachineInstrInMaps(*Restore);
751   LIS->createAndComputeVirtRegInterval(SaveReg);
752 
753   return Restore;
754 }
755 
756 MachineBasicBlock *SIWholeQuadMode::splitBlock(MachineBasicBlock *BB,
757                                                MachineInstr *TermMI) {
758   LLVM_DEBUG(dbgs() << "Split block " << printMBBReference(*BB) << " @ "
759                     << *TermMI << "\n");
760 
761   MachineBasicBlock *SplitBB =
762       BB->splitAt(*TermMI, /*UpdateLiveIns*/ true, LIS);
763 
764   // Convert last instruction in block to a terminator.
765   // Note: this only covers the expected patterns
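  // For example, an "S_AND_B64 $exec, $exec, ..." at the split point becomes
  // S_AND_B64_term so that it remains a valid terminator of the original block.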
766   unsigned NewOpcode = 0;
767   switch (TermMI->getOpcode()) {
768   case AMDGPU::S_AND_B32:
769     NewOpcode = AMDGPU::S_AND_B32_term;
770     break;
771   case AMDGPU::S_AND_B64:
772     NewOpcode = AMDGPU::S_AND_B64_term;
773     break;
774   case AMDGPU::S_MOV_B32:
775     NewOpcode = AMDGPU::S_MOV_B32_term;
776     break;
777   case AMDGPU::S_MOV_B64:
778     NewOpcode = AMDGPU::S_MOV_B64_term;
779     break;
780   default:
781     break;
782   }
783   if (NewOpcode)
784     TermMI->setDesc(TII->get(NewOpcode));
785 
786   if (SplitBB != BB) {
787     // Update dominator trees
788     using DomTreeT = DomTreeBase<MachineBasicBlock>;
789     SmallVector<DomTreeT::UpdateType, 16> DTUpdates;
790     for (MachineBasicBlock *Succ : SplitBB->successors()) {
791       DTUpdates.push_back({DomTreeT::Insert, SplitBB, Succ});
792       DTUpdates.push_back({DomTreeT::Delete, BB, Succ});
793     }
794     DTUpdates.push_back({DomTreeT::Insert, BB, SplitBB});
795     if (MDT)
796       MDT->getBase().applyUpdates(DTUpdates);
797     if (PDT)
798       PDT->getBase().applyUpdates(DTUpdates);
799 
800     // Link blocks
801     MachineInstr *MI =
802         BuildMI(*BB, BB->end(), DebugLoc(), TII->get(AMDGPU::S_BRANCH))
803             .addMBB(SplitBB);
804     LIS->InsertMachineInstrInMaps(*MI);
805   }
806 
807   return SplitBB;
808 }
809 
810 MachineInstr *SIWholeQuadMode::lowerKillF32(MachineBasicBlock &MBB,
811                                             MachineInstr &MI) {
812   const DebugLoc &DL = MI.getDebugLoc();
813   unsigned Opcode = 0;
814 
815   assert(MI.getOperand(0).isReg());
816 
817   // Comparison is for live lanes; however here we compute the inverse
818   // (killed lanes).  This is because VCMP will always generate 0 bits
819   // for inactive lanes so a mask of live lanes would not be correct
820   // inside control flow.
821   // Invert the comparison by swapping the operands and adjusting
822   // the comparison codes.
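  // For example (illustrative): a kill whose "live" condition is Op0 < Op1
  // (ISD::SETOLT) is lowered to V_CMP_NGT_F32 with the operands swapped, so
  // the result holds !(Op0 < Op1), i.e. the killed lanes.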
823 
824   switch (MI.getOperand(2).getImm()) {
825   case ISD::SETUEQ:
826     Opcode = AMDGPU::V_CMP_LG_F32_e64;
827     break;
828   case ISD::SETUGT:
829     Opcode = AMDGPU::V_CMP_GE_F32_e64;
830     break;
831   case ISD::SETUGE:
832     Opcode = AMDGPU::V_CMP_GT_F32_e64;
833     break;
834   case ISD::SETULT:
835     Opcode = AMDGPU::V_CMP_LE_F32_e64;
836     break;
837   case ISD::SETULE:
838     Opcode = AMDGPU::V_CMP_LT_F32_e64;
839     break;
840   case ISD::SETUNE:
841     Opcode = AMDGPU::V_CMP_EQ_F32_e64;
842     break;
843   case ISD::SETO:
844     Opcode = AMDGPU::V_CMP_O_F32_e64;
845     break;
846   case ISD::SETUO:
847     Opcode = AMDGPU::V_CMP_U_F32_e64;
848     break;
849   case ISD::SETOEQ:
850   case ISD::SETEQ:
851     Opcode = AMDGPU::V_CMP_NEQ_F32_e64;
852     break;
853   case ISD::SETOGT:
854   case ISD::SETGT:
855     Opcode = AMDGPU::V_CMP_NLT_F32_e64;
856     break;
857   case ISD::SETOGE:
858   case ISD::SETGE:
859     Opcode = AMDGPU::V_CMP_NLE_F32_e64;
860     break;
861   case ISD::SETOLT:
862   case ISD::SETLT:
863     Opcode = AMDGPU::V_CMP_NGT_F32_e64;
864     break;
865   case ISD::SETOLE:
866   case ISD::SETLE:
867     Opcode = AMDGPU::V_CMP_NGE_F32_e64;
868     break;
869   case ISD::SETONE:
870   case ISD::SETNE:
871     Opcode = AMDGPU::V_CMP_NLG_F32_e64;
872     break;
873   default:
874     llvm_unreachable("invalid ISD:SET cond code");
875   }
876 
877   // Pick opcode based on comparison type.
878   MachineInstr *VcmpMI;
879   const MachineOperand &Op0 = MI.getOperand(0);
880   const MachineOperand &Op1 = MI.getOperand(1);
881 
882   // VCC represents lanes killed.
883   Register VCC = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;
884 
885   if (TRI->isVGPR(*MRI, Op0.getReg())) {
886     Opcode = AMDGPU::getVOPe32(Opcode);
887     VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode)).add(Op1).add(Op0);
888   } else {
889     VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode))
890                  .addReg(VCC, RegState::Define)
891                  .addImm(0) // src0 modifiers
892                  .add(Op1)
893                  .addImm(0) // src1 modifiers
894                  .add(Op0)
895                  .addImm(0); // omod
896   }
897 
898   MachineInstr *MaskUpdateMI =
899       BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
900           .addReg(LiveMaskReg)
901           .addReg(VCC);
902 
  // The state of SCC represents whether any lanes are live in the mask;
  // if SCC is 0 then no lanes will be alive anymore.
905   MachineInstr *EarlyTermMI =
906       BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0));
907 
908   MachineInstr *ExecMaskMI =
909       BuildMI(MBB, MI, DL, TII->get(AndN2Opc), Exec).addReg(Exec).addReg(VCC);
910 
911   assert(MBB.succ_size() == 1);
912   MachineInstr *NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
913                               .addMBB(*MBB.succ_begin());
914 
915   // Update live intervals
916   LIS->ReplaceMachineInstrInMaps(MI, *VcmpMI);
917   MBB.remove(&MI);
918 
919   LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
920   LIS->InsertMachineInstrInMaps(*ExecMaskMI);
921   LIS->InsertMachineInstrInMaps(*EarlyTermMI);
922   LIS->InsertMachineInstrInMaps(*NewTerm);
923 
924   return NewTerm;
925 }
926 
927 MachineInstr *SIWholeQuadMode::lowerKillI1(MachineBasicBlock &MBB,
928                                            MachineInstr &MI, bool IsWQM) {
929   const DebugLoc &DL = MI.getDebugLoc();
930   MachineInstr *MaskUpdateMI = nullptr;
931 
932   const bool IsDemote = IsWQM && (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1);
933   const MachineOperand &Op = MI.getOperand(0);
934   int64_t KillVal = MI.getOperand(1).getImm();
935   MachineInstr *ComputeKilledMaskMI = nullptr;
936   Register CndReg = !Op.isImm() ? Op.getReg() : Register();
937   Register TmpReg;
938 
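  // Illustrative lowering (wave64, dynamic kill where Op holds the lanes to
  // kill, i.e. KillVal != 0):
  //   LiveMask = S_ANDN2_B64 LiveMask, Op
  //   SI_EARLY_TERMINATE_SCC0
  //   exec = S_ANDN2_B64 exec, Op      (in Exact mode: exec &= LiveMask)
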
939   // Is this a static or dynamic kill?
940   if (Op.isImm()) {
941     if (Op.getImm() == KillVal) {
942       // Static: all active lanes are killed
943       MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
944                          .addReg(LiveMaskReg)
945                          .addReg(Exec);
946     } else {
947       // Static: kill does nothing
948       MachineInstr *NewTerm = nullptr;
949       if (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1) {
950         LIS->RemoveMachineInstrFromMaps(MI);
951       } else {
952         assert(MBB.succ_size() == 1);
953         NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
954                       .addMBB(*MBB.succ_begin());
955         LIS->ReplaceMachineInstrInMaps(MI, *NewTerm);
956       }
957       MBB.remove(&MI);
958       return NewTerm;
959     }
960   } else {
961     if (!KillVal) {
962       // Op represents live lanes after kill,
963       // so exec mask needs to be factored in.
964       TmpReg = MRI->createVirtualRegister(TRI->getBoolRC());
965       ComputeKilledMaskMI =
966           BuildMI(MBB, MI, DL, TII->get(XorOpc), TmpReg).add(Op).addReg(Exec);
967       MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
968                          .addReg(LiveMaskReg)
969                          .addReg(TmpReg);
970     } else {
971       // Op represents lanes to kill
972       MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
973                          .addReg(LiveMaskReg)
974                          .add(Op);
975     }
976   }
977 
  // The state of SCC represents whether any lanes are live in the mask;
  // if SCC is 0 then no lanes will be alive anymore.
980   MachineInstr *EarlyTermMI =
981       BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0));
982 
  // If we got this far, some lanes are still live;
  // update EXEC to deactivate lanes as appropriate.
985   MachineInstr *NewTerm;
986   MachineInstr *WQMMaskMI = nullptr;
987   Register LiveMaskWQM;
988   if (IsDemote) {
989     // Demote - deactivate quads with only helper lanes
990     LiveMaskWQM = MRI->createVirtualRegister(TRI->getBoolRC());
991     WQMMaskMI =
992         BuildMI(MBB, MI, DL, TII->get(WQMOpc), LiveMaskWQM).addReg(LiveMaskReg);
993     NewTerm = BuildMI(MBB, MI, DL, TII->get(AndOpc), Exec)
994                   .addReg(Exec)
995                   .addReg(LiveMaskWQM);
996   } else {
997     // Kill - deactivate lanes no longer in live mask
998     if (Op.isImm()) {
999       unsigned MovOpc = ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
1000       NewTerm = BuildMI(MBB, &MI, DL, TII->get(MovOpc), Exec).addImm(0);
1001     } else if (!IsWQM) {
1002       NewTerm = BuildMI(MBB, &MI, DL, TII->get(AndOpc), Exec)
1003                     .addReg(Exec)
1004                     .addReg(LiveMaskReg);
1005     } else {
1006       unsigned Opcode = KillVal ? AndN2Opc : AndOpc;
1007       NewTerm =
1008           BuildMI(MBB, &MI, DL, TII->get(Opcode), Exec).addReg(Exec).add(Op);
1009     }
1010   }
1011 
1012   // Update live intervals
1013   LIS->RemoveMachineInstrFromMaps(MI);
1014   MBB.remove(&MI);
1015   assert(EarlyTermMI);
1016   assert(MaskUpdateMI);
1017   assert(NewTerm);
1018   if (ComputeKilledMaskMI)
1019     LIS->InsertMachineInstrInMaps(*ComputeKilledMaskMI);
1020   LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
1021   LIS->InsertMachineInstrInMaps(*EarlyTermMI);
1022   if (WQMMaskMI)
1023     LIS->InsertMachineInstrInMaps(*WQMMaskMI);
1024   LIS->InsertMachineInstrInMaps(*NewTerm);
1025 
1026   if (CndReg) {
1027     LIS->removeInterval(CndReg);
1028     LIS->createAndComputeVirtRegInterval(CndReg);
1029   }
1030   if (TmpReg)
1031     LIS->createAndComputeVirtRegInterval(TmpReg);
1032   if (LiveMaskWQM)
1033     LIS->createAndComputeVirtRegInterval(LiveMaskWQM);
1034 
1035   return NewTerm;
1036 }
1037 
1038 // Convert a strict mode transition to a pseudo transition.
1039 // This still pre-allocates registers to prevent clobbering,
1040 // but avoids any EXEC mask changes.
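// For example (illustrative): lowerBlock detects a WQM -> StrictWQM -> WQM
// sequence, where EXEC is already the whole-quad mask; re-applying S_WQM would
// not change it, so the ENTER/EXIT_STRICT_WQM pair is replaced with
// ENTER/EXIT_PSEUDO_WM, which keeps the strict-mode register constraints
// without writing EXEC.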
1041 void SIWholeQuadMode::lowerPseudoStrictMode(MachineBasicBlock &MBB,
1042                                             MachineInstr *Entry,
1043                                             MachineInstr *Exit) {
1044   assert(Entry->getOpcode() == AMDGPU::ENTER_STRICT_WQM);
1045   assert(Exit->getOpcode() == AMDGPU::EXIT_STRICT_WQM);
1046 
1047   Register SaveOrig = Entry->getOperand(0).getReg();
1048 
1049   MachineInstr *NewEntry =
1050     BuildMI(MBB, Entry, DebugLoc(), TII->get(AMDGPU::ENTER_PSEUDO_WM));
1051   MachineInstr *NewExit =
1052     BuildMI(MBB, Exit, DebugLoc(), TII->get(AMDGPU::EXIT_PSEUDO_WM));
1053 
1054   LIS->ReplaceMachineInstrInMaps(*Exit, *NewExit);
1055   Exit->eraseFromParent();
1056 
1057   LIS->ReplaceMachineInstrInMaps(*Entry, *NewEntry);
1058   Entry->eraseFromParent();
1059 
1060   LIS->removeInterval(SaveOrig);
1061 }
1062 
// Replace (or supplement) instructions accessing the live mask.
// This can only happen once all the live mask registers have been created
// and the execution state (WQM/StrictWWM/Exact) of instructions is known.
1066 void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
1067   auto BII = Blocks.find(&MBB);
1068   if (BII == Blocks.end())
1069     return;
1070 
1071   const BlockInfo &BI = BII->second;
1072   if (!BI.NeedsLowering)
1073     return;
1074 
1075   LLVM_DEBUG(dbgs() << "\nLowering block " << printMBBReference(MBB) << ":\n");
1076 
1077   SmallVector<MachineInstr *, 4> SplitPoints;
1078   char State = BI.InitialState;
1079   MachineInstr *StrictEntry = nullptr;
1080 
1081   for (MachineInstr &MI : llvm::make_early_inc_range(
1082            llvm::make_range(MBB.getFirstNonPHI(), MBB.end()))) {
1083     char PreviousState = State;
1084 
1085     if (StateTransition.count(&MI))
1086       State = StateTransition[&MI];
1087 
1088     MachineInstr *SplitPoint = nullptr;
1089     switch (MI.getOpcode()) {
1090     case AMDGPU::SI_DEMOTE_I1:
1091     case AMDGPU::SI_KILL_I1_TERMINATOR:
1092       SplitPoint = lowerKillI1(MBB, MI, State == StateWQM);
1093       break;
1094     case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
1095       SplitPoint = lowerKillF32(MBB, MI);
1096       break;
1097     case AMDGPU::ENTER_STRICT_WQM:
1098       StrictEntry = PreviousState == StateWQM ? &MI : nullptr;
1099       break;
1100     case AMDGPU::EXIT_STRICT_WQM:
1101       if (State == StateWQM && StrictEntry) {
1102         // Transition WQM -> StrictWQM -> WQM detected.
1103         lowerPseudoStrictMode(MBB, StrictEntry, &MI);
1104       }
1105       StrictEntry = nullptr;
1106       break;
1107     case AMDGPU::ENTER_STRICT_WWM:
1108     case AMDGPU::EXIT_STRICT_WWM:
1109       StrictEntry = nullptr;
1110       break;
1111     default:
1112       break;
1113     }
1114     if (SplitPoint)
1115       SplitPoints.push_back(SplitPoint);
1116   }
1117 
1118   // Perform splitting after instruction scan to simplify iteration.
1119   if (!SplitPoints.empty()) {
1120     MachineBasicBlock *BB = &MBB;
1121     for (MachineInstr *MI : SplitPoints) {
1122       BB = splitBlock(BB, MI);
1123     }
1124   }
1125 }
1126 
1127 // Return an iterator in the (inclusive) range [First, Last] at which
1128 // instructions can be safely inserted, keeping in mind that some of the
1129 // instructions we want to add necessarily clobber SCC.
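// For example (illustrative): if an S_CMP earlier in the block defines SCC and
// a later instruction in [First, Last] reads it, the chosen insertion point is
// moved out of that live SCC segment where possible; if no such point exists,
// saveSCC() is used to copy SCC to an SGPR around the inserted code.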
1130 MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion(
1131     MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
1132     MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC) {
1133   if (!SaveSCC)
1134     return PreferLast ? Last : First;
1135 
1136   LiveRange &LR =
1137       LIS->getRegUnit(*TRI->regunits(MCRegister::from(AMDGPU::SCC)).begin());
1138   auto MBBE = MBB.end();
1139   SlotIndex FirstIdx = First != MBBE ? LIS->getInstructionIndex(*First)
1140                                      : LIS->getMBBEndIdx(&MBB);
1141   SlotIndex LastIdx =
1142       Last != MBBE ? LIS->getInstructionIndex(*Last) : LIS->getMBBEndIdx(&MBB);
1143   SlotIndex Idx = PreferLast ? LastIdx : FirstIdx;
1144   const LiveRange::Segment *S;
1145 
1146   for (;;) {
1147     S = LR.getSegmentContaining(Idx);
1148     if (!S)
1149       break;
1150 
1151     if (PreferLast) {
1152       SlotIndex Next = S->start.getBaseIndex();
1153       if (Next < FirstIdx)
1154         break;
1155       Idx = Next;
1156     } else {
1157       MachineInstr *EndMI = LIS->getInstructionFromIndex(S->end.getBaseIndex());
1158       assert(EndMI && "Segment does not end on valid instruction");
1159       auto NextI = std::next(EndMI->getIterator());
1160       if (NextI == MBB.end())
1161         break;
1162       SlotIndex Next = LIS->getInstructionIndex(*NextI);
1163       if (Next > LastIdx)
1164         break;
1165       Idx = Next;
1166     }
1167   }
1168 
1169   MachineBasicBlock::iterator MBBI;
1170 
1171   if (MachineInstr *MI = LIS->getInstructionFromIndex(Idx))
1172     MBBI = MI;
1173   else {
1174     assert(Idx == LIS->getMBBEndIdx(&MBB));
1175     MBBI = MBB.end();
1176   }
1177 
1178   // Move insertion point past any operations modifying EXEC.
1179   // This assumes that the value of SCC defined by any of these operations
1180   // does not need to be preserved.
1181   while (MBBI != Last) {
1182     bool IsExecDef = false;
1183     for (const MachineOperand &MO : MBBI->all_defs()) {
1184       IsExecDef |=
1185           MO.getReg() == AMDGPU::EXEC_LO || MO.getReg() == AMDGPU::EXEC;
1186     }
1187     if (!IsExecDef)
1188       break;
1189     MBBI++;
1190     S = nullptr;
1191   }
1192 
1193   if (S)
1194     MBBI = saveSCC(MBB, MBBI);
1195 
1196   return MBBI;
1197 }
1198 
1199 void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
1200                               MachineBasicBlock::iterator Before,
1201                               Register SaveWQM) {
1202   bool IsTerminator = Before == MBB.end();
1203   if (!IsTerminator) {
1204     auto FirstTerm = MBB.getFirstTerminator();
1205     if (FirstTerm != MBB.end()) {
1206       SlotIndex FirstTermIdx = LIS->getInstructionIndex(*FirstTerm);
1207       SlotIndex BeforeIdx = LIS->getInstructionIndex(*Before);
1208       IsTerminator = BeforeIdx > FirstTermIdx;
1209     }
1210   }
1211 
1212   MachineInstr *MI;
1213 
1214   if (SaveWQM) {
1215     unsigned Opcode = IsTerminator ? AndSaveExecTermOpc : AndSaveExecOpc;
1216     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(Opcode), SaveWQM)
1217              .addReg(LiveMaskReg);
1218   } else {
1219     unsigned Opcode = IsTerminator ? AndTermOpc : AndOpc;
1220     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(Opcode), Exec)
1221              .addReg(Exec)
1222              .addReg(LiveMaskReg);
1223   }
1224 
1225   LIS->InsertMachineInstrInMaps(*MI);
1226   StateTransition[MI] = StateExact;
1227 }
1228 
1229 void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
1230                             MachineBasicBlock::iterator Before,
1231                             Register SavedWQM) {
1232   MachineInstr *MI;
1233 
1234   if (SavedWQM) {
1235     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), Exec)
1236              .addReg(SavedWQM);
1237   } else {
1238     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(WQMOpc), Exec).addReg(Exec);
1239   }
1240 
1241   LIS->InsertMachineInstrInMaps(*MI);
1242   StateTransition[MI] = StateWQM;
1243 }
1244 
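// Enter StrictWWM/StrictWQM by emitting the corresponding ENTER_STRICT_*
// pseudo; the pseudo is expanded later into the save-and-enable sequence shown
// in the file header comment.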
1245 void SIWholeQuadMode::toStrictMode(MachineBasicBlock &MBB,
1246                                    MachineBasicBlock::iterator Before,
1247                                    Register SaveOrig, char StrictStateNeeded) {
1248   MachineInstr *MI;
1249   assert(SaveOrig);
1250   assert(StrictStateNeeded == StateStrictWWM ||
1251          StrictStateNeeded == StateStrictWQM);
1252 
1253   if (StrictStateNeeded == StateStrictWWM) {
1254     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WWM),
1255                  SaveOrig)
1256              .addImm(-1);
1257   } else {
1258     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WQM),
1259                  SaveOrig)
1260              .addImm(-1);
1261   }
1262   LIS->InsertMachineInstrInMaps(*MI);
1263   StateTransition[MI] = StrictStateNeeded;
1264 
  // Mark block as needing lowering so it will be checked for unnecessary
  // transitions.
1266   auto BII = Blocks.find(&MBB);
1267   if (BII != Blocks.end())
1268     BII->second.NeedsLowering = true;
1269 }
1270 
1271 void SIWholeQuadMode::fromStrictMode(MachineBasicBlock &MBB,
1272                                      MachineBasicBlock::iterator Before,
1273                                      Register SavedOrig, char NonStrictState,
1274                                      char CurrentStrictState) {
1275   MachineInstr *MI;
1276 
1277   assert(SavedOrig);
1278   assert(CurrentStrictState == StateStrictWWM ||
1279          CurrentStrictState == StateStrictWQM);
1280 
1281   if (CurrentStrictState == StateStrictWWM) {
1282     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WWM),
1283                  Exec)
1284              .addReg(SavedOrig);
1285   } else {
1286     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WQM),
1287                  Exec)
1288              .addReg(SavedOrig);
1289   }
1290   LIS->InsertMachineInstrInMaps(*MI);
1291   StateTransition[MI] = NonStrictState;
1292 }
1293 
1294 void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) {
1295   auto BII = Blocks.find(&MBB);
1296   if (BII == Blocks.end())
1297     return;
1298 
1299   BlockInfo &BI = BII->second;
1300 
1301   // This is a non-entry block that is WQM throughout, so no need to do
1302   // anything.
1303   if (!IsEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact) {
1304     BI.InitialState = StateWQM;
1305     return;
1306   }
1307 
1308   LLVM_DEBUG(dbgs() << "\nProcessing block " << printMBBReference(MBB)
1309                     << ":\n");
1310 
1311   Register SavedWQMReg;
1312   Register SavedNonStrictReg;
1313   bool WQMFromExec = IsEntry;
1314   char State = (IsEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM;
1315   char NonStrictState = 0;
1316   const TargetRegisterClass *BoolRC = TRI->getBoolRC();
1317 
1318   auto II = MBB.getFirstNonPHI(), IE = MBB.end();
1319   if (IsEntry) {
1320     // Skip the instruction that saves LiveMask
1321     if (II != IE && II->getOpcode() == AMDGPU::COPY &&
1322         II->getOperand(1).getReg() == TRI->getExec())
1323       ++II;
1324   }
1325 
1326   // This stores the first instruction where it's safe to switch from WQM to
1327   // Exact or vice versa.
1328   MachineBasicBlock::iterator FirstWQM = IE;
1329 
1330   // This stores the first instruction where it's safe to switch from Strict
1331   // mode to Exact/WQM or to switch to Strict mode. It must always be the same
1332   // as, or after, FirstWQM since if it's safe to switch to/from Strict, it must
1333   // be safe to switch to/from WQM as well.
1334   MachineBasicBlock::iterator FirstStrict = IE;
1335 
  // Record initial state in block information.
1337   BI.InitialState = State;
1338 
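  // Illustrative walk (hypothetical block): an image sample that Needs WQM
  // followed by a buffer store that only allows Exact causes a WQM -> Exact
  // transition to be inserted at the earliest safe point (FirstWQM); the WQM
  // exec is saved in SavedWQMReg first if a later instruction needs WQM again.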
1339   for (;;) {
1340     MachineBasicBlock::iterator Next = II;
1341     char Needs = StateExact | StateWQM; // Strict mode is disabled by default.
1342     char OutNeeds = 0;
1343 
1344     if (FirstWQM == IE)
1345       FirstWQM = II;
1346 
1347     if (FirstStrict == IE)
1348       FirstStrict = II;
1349 
1350     // First, figure out the allowed states (Needs) based on the propagated
1351     // flags.
1352     if (II != IE) {
1353       MachineInstr &MI = *II;
1354 
1355       if (MI.isTerminator() || TII->mayReadEXEC(*MRI, MI)) {
1356         auto III = Instructions.find(&MI);
1357         if (III != Instructions.end()) {
1358           if (III->second.Needs & StateStrictWWM)
1359             Needs = StateStrictWWM;
1360           else if (III->second.Needs & StateStrictWQM)
1361             Needs = StateStrictWQM;
1362           else if (III->second.Needs & StateWQM)
1363             Needs = StateWQM;
1364           else
1365             Needs &= ~III->second.Disabled;
1366           OutNeeds = III->second.OutNeeds;
1367         }
1368       } else {
1369         // If the instruction doesn't actually need a correct EXEC, then we can
1370         // safely leave Strict mode enabled.
1371         Needs = StateExact | StateWQM | StateStrict;
1372       }
1373 
1374       // Exact mode exit can occur in terminators, but must be before branches.
1375       if (MI.isBranch() && OutNeeds == StateExact)
1376         Needs = StateExact;
1377 
1378       ++Next;
1379     } else {
1380       // End of basic block
1381       if (BI.OutNeeds & StateWQM)
1382         Needs = StateWQM;
1383       else if (BI.OutNeeds == StateExact)
1384         Needs = StateExact;
1385       else
1386         Needs = StateWQM | StateExact;
1387     }
1388 
1389     // Now, transition if necessary.
1390     if (!(Needs & State)) {
1391       MachineBasicBlock::iterator First;
1392       if (State == StateStrictWWM || Needs == StateStrictWWM ||
1393           State == StateStrictWQM || Needs == StateStrictWQM) {
1394         // We must switch to or from Strict mode.
1395         First = FirstStrict;
1396       } else {
1397         // We only need to switch to/from WQM, so we can use FirstWQM.
1398         First = FirstWQM;
1399       }
1400 
1401       // Whether we need to save SCC depends on start and end states.
1402       bool SaveSCC = false;
1403       switch (State) {
1404       case StateExact:
1405       case StateStrictWWM:
1406       case StateStrictWQM:
1407         // Exact/Strict -> Strict: save SCC
1408         // Exact/Strict -> WQM: save SCC if WQM mask is generated from exec
1409         // Exact/Strict -> Exact: no save
1410         SaveSCC = (Needs & StateStrict) || ((Needs & StateWQM) && WQMFromExec);
1411         break;
1412       case StateWQM:
1413         // WQM -> Exact/Strict: save SCC
1414         SaveSCC = !(Needs & StateWQM);
1415         break;
1416       default:
1417         llvm_unreachable("Unknown state");
1418         break;
1419       }
1420       MachineBasicBlock::iterator Before =
1421           prepareInsertion(MBB, First, II, Needs == StateWQM, SaveSCC);
1422 
1423       if (State & StateStrict) {
1424         assert(State == StateStrictWWM || State == StateStrictWQM);
1425         assert(SavedNonStrictReg);
1426         fromStrictMode(MBB, Before, SavedNonStrictReg, NonStrictState, State);
1427 
1428         LIS->createAndComputeVirtRegInterval(SavedNonStrictReg);
1429         SavedNonStrictReg = 0;
1430         State = NonStrictState;
1431       }
1432 
1433       if (Needs & StateStrict) {
1434         NonStrictState = State;
1435         assert(Needs == StateStrictWWM || Needs == StateStrictWQM);
1436         assert(!SavedNonStrictReg);
1437         SavedNonStrictReg = MRI->createVirtualRegister(BoolRC);
1438 
1439         toStrictMode(MBB, Before, SavedNonStrictReg, Needs);
1440         State = Needs;
1441 
1442       } else {
1443         if (State == StateWQM && (Needs & StateExact) && !(Needs & StateWQM)) {
1444           if (!WQMFromExec && (OutNeeds & StateWQM)) {
1445             assert(!SavedWQMReg);
1446             SavedWQMReg = MRI->createVirtualRegister(BoolRC);
1447           }
1448 
1449           toExact(MBB, Before, SavedWQMReg);
1450           State = StateExact;
1451         } else if (State == StateExact && (Needs & StateWQM) &&
1452                    !(Needs & StateExact)) {
1453           assert(WQMFromExec == (SavedWQMReg == 0));
1454 
1455           toWQM(MBB, Before, SavedWQMReg);
1456 
1457           if (SavedWQMReg) {
1458             LIS->createAndComputeVirtRegInterval(SavedWQMReg);
1459             SavedWQMReg = 0;
1460           }
1461           State = StateWQM;
1462         } else {
          // We can get here if we transitioned from StrictWWM to a
          // non-StrictWWM state that already matches our needs; in that case
          // there is nothing to do.
1466           assert(Needs & State);
1467         }
1468       }
1469     }
1470 
1471     if (Needs != (StateExact | StateWQM | StateStrict)) {
1472       if (Needs != (StateExact | StateWQM))
1473         FirstWQM = IE;
1474       FirstStrict = IE;
1475     }
1476 
1477     if (II == IE)
1478       break;
1479 
1480     II = Next;
1481   }
1482   assert(!SavedWQMReg);
1483   assert(!SavedNonStrictReg);
1484 }
1485 
1486 void SIWholeQuadMode::lowerLiveMaskQueries() {
1487   for (MachineInstr *MI : LiveMaskQueries) {
1488     const DebugLoc &DL = MI->getDebugLoc();
1489     Register Dest = MI->getOperand(0).getReg();
1490 
1491     MachineInstr *Copy =
1492         BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
1493             .addReg(LiveMaskReg);
1494 
1495     LIS->ReplaceMachineInstrInMaps(*MI, *Copy);
1496     MI->eraseFromParent();
1497   }
1498 }
1499 
1500 void SIWholeQuadMode::lowerCopyInstrs() {
1501   for (MachineInstr *MI : LowerToMovInstrs) {
1502     assert(MI->getNumExplicitOperands() == 2);
1503 
1504     const Register Reg = MI->getOperand(0).getReg();
1505 
1506     const TargetRegisterClass *regClass =
1507         TRI->getRegClassForOperandReg(*MRI, MI->getOperand(0));
1508     if (TRI->isVGPRClass(regClass)) {
1509       const unsigned MovOp = TII->getMovOpcode(regClass);
1510       MI->setDesc(TII->get(MovOp));
1511 
1512       // Check that it already implicitly depends on exec (like all VALU movs
1513       // should do).
1514       assert(any_of(MI->implicit_operands(), [](const MachineOperand &MO) {
1515         return MO.isUse() && MO.getReg() == AMDGPU::EXEC;
1516       }));
1517     } else {
1518       // Remove early-clobber and exec dependency from simple SGPR copies.
1519       // This allows some to be eliminated during/post RA.
1520       LLVM_DEBUG(dbgs() << "simplify SGPR copy: " << *MI);
1521       if (MI->getOperand(0).isEarlyClobber()) {
1522         LIS->removeInterval(Reg);
1523         MI->getOperand(0).setIsEarlyClobber(false);
1524         LIS->createAndComputeVirtRegInterval(Reg);
1525       }
1526       int Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC);
1527       while (Index >= 0) {
1528         MI->removeOperand(Index);
1529         Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC);
1530       }
1531       MI->setDesc(TII->get(AMDGPU::COPY));
1532       LLVM_DEBUG(dbgs() << "  -> " << *MI);
1533     }
1534   }
1535   for (MachineInstr *MI : LowerToCopyInstrs) {
1536     if (MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32 ||
1537         MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B64) {
1538       assert(MI->getNumExplicitOperands() == 3);
      // The only reason we should be here is that V_SET_INACTIVE has an undef
      // inactive input, so it is being replaced by a simple copy. There should
      // be a second undef source that we should remove.
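      // e.g. "%d = V_SET_INACTIVE_B32 %src, undef" becomes "%d = COPY %src"
      // after the undef inactive operand is dropped below.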
1542       assert(MI->getOperand(2).isUndef());
1543       MI->removeOperand(2);
1544       MI->untieRegOperand(1);
1545     } else {
1546       assert(MI->getNumExplicitOperands() == 2);
1547     }
1548 
1549     unsigned CopyOp = MI->getOperand(1).isReg()
1550                           ? (unsigned)AMDGPU::COPY
1551                           : TII->getMovOpcode(TRI->getRegClassForOperandReg(
1552                                 *MRI, MI->getOperand(0)));
1553     MI->setDesc(TII->get(CopyOp));
1554   }
1555 }
1556 
1557 void SIWholeQuadMode::lowerKillInstrs(bool IsWQM) {
1558   for (MachineInstr *MI : KillInstrs) {
1559     MachineBasicBlock *MBB = MI->getParent();
1560     MachineInstr *SplitPoint = nullptr;
1561     switch (MI->getOpcode()) {
1562     case AMDGPU::SI_DEMOTE_I1:
1563     case AMDGPU::SI_KILL_I1_TERMINATOR:
1564       SplitPoint = lowerKillI1(*MBB, *MI, IsWQM);
1565       break;
1566     case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
1567       SplitPoint = lowerKillF32(*MBB, *MI);
1568       break;
1569     default:
1570       continue;
1571     }
1572     if (SplitPoint)
1573       splitBlock(MBB, SplitPoint);
1574   }
1575 }
1576 
1577 bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
1578   LLVM_DEBUG(dbgs() << "SI Whole Quad Mode on " << MF.getName()
1579                     << " ------------- \n");
1580   LLVM_DEBUG(MF.dump(););
1581 
1582   Instructions.clear();
1583   Blocks.clear();
1584   LiveMaskQueries.clear();
1585   LowerToCopyInstrs.clear();
1586   LowerToMovInstrs.clear();
1587   KillInstrs.clear();
1588   StateTransition.clear();
1589 
1590   ST = &MF.getSubtarget<GCNSubtarget>();
1591 
1592   TII = ST->getInstrInfo();
1593   TRI = &TII->getRegisterInfo();
1594   MRI = &MF.getRegInfo();
1595   LIS = &getAnalysis<LiveIntervals>();
1596   MDT = getAnalysisIfAvailable<MachineDominatorTree>();
1597   PDT = getAnalysisIfAvailable<MachinePostDominatorTree>();
1598 
1599   if (ST->isWave32()) {
1600     AndOpc = AMDGPU::S_AND_B32;
1601     AndTermOpc = AMDGPU::S_AND_B32_term;
1602     AndN2Opc = AMDGPU::S_ANDN2_B32;
1603     XorOpc = AMDGPU::S_XOR_B32;
1604     AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
1605     AndSaveExecTermOpc = AMDGPU::S_AND_SAVEEXEC_B32_term;
1606     WQMOpc = AMDGPU::S_WQM_B32;
1607     Exec = AMDGPU::EXEC_LO;
1608   } else {
1609     AndOpc = AMDGPU::S_AND_B64;
1610     AndTermOpc = AMDGPU::S_AND_B64_term;
1611     AndN2Opc = AMDGPU::S_ANDN2_B64;
1612     XorOpc = AMDGPU::S_XOR_B64;
1613     AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
1614     AndSaveExecTermOpc = AMDGPU::S_AND_SAVEEXEC_B64_term;
1615     WQMOpc = AMDGPU::S_WQM_B64;
1616     Exec = AMDGPU::EXEC;
1617   }
1618 
1619   const char GlobalFlags = analyzeFunction(MF);
1620   const bool NeedsLiveMask = !(KillInstrs.empty() && LiveMaskQueries.empty());
1621 
1622   LiveMaskReg = Exec;
1623 
  // Shader is simple and does not need any state changes or complex lowering.
1625   if (!(GlobalFlags & (StateWQM | StateStrict)) && LowerToCopyInstrs.empty() &&
1626       LowerToMovInstrs.empty() && KillInstrs.empty()) {
1627     lowerLiveMaskQueries();
1628     return !LiveMaskQueries.empty();
1629   }
1630 
1631   MachineBasicBlock &Entry = MF.front();
1632   MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI();
1633 
1634   // Store a copy of the original live mask when required
1635   if (NeedsLiveMask || (GlobalFlags & StateWQM)) {
1636     LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC());
1637     MachineInstr *MI =
1638         BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg)
1639             .addReg(Exec);
1640     LIS->InsertMachineInstrInMaps(*MI);
1641   }
1642 
1643   LLVM_DEBUG(printInfo());
1644 
1645   lowerLiveMaskQueries();
1646   lowerCopyInstrs();
1647 
1648   // Shader only needs WQM
1649   if (GlobalFlags == StateWQM) {
1650     auto MI = BuildMI(Entry, EntryMI, DebugLoc(), TII->get(WQMOpc), Exec)
1651                   .addReg(Exec);
1652     LIS->InsertMachineInstrInMaps(*MI);
1653     lowerKillInstrs(true);
1654   } else {
1655     for (auto BII : Blocks)
1656       processBlock(*BII.first, BII.first == &Entry);
1657     // Lowering blocks causes block splitting so perform as a second pass.
1658     for (auto BII : Blocks)
1659       lowerBlock(*BII.first);
1660   }
1661 
1662   // Compute live range for live mask
1663   if (LiveMaskReg != Exec)
1664     LIS->createAndComputeVirtRegInterval(LiveMaskReg);
1665 
1666   // Physical registers like SCC aren't tracked by default anyway, so just
1667   // removing the ranges we computed is the simplest option for maintaining
1668   // the analysis results.
1669   LIS->removeAllRegUnitsForPhysReg(AMDGPU::SCC);
1670 
1671   // If we performed any kills then recompute EXEC
1672   if (!KillInstrs.empty())
1673     LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC);
1674 
1675   return true;
1676 }
1677