1 //===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode -----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file
10 /// This pass adds instructions to enable whole quad mode (strict or non-strict)
11 /// for pixel shaders, and strict whole wavefront mode for all programs.
12 ///
13 /// The "strict" prefix indicates that inactive lanes do not take part in
14 /// control flow, specifically an inactive lane enabled by a strict WQM/WWM will
/// always be enabled irrespective of control flow decisions. Conversely, in
/// non-strict WQM inactive lanes may affect control flow decisions.
17 ///
18 /// Whole quad mode is required for derivative computations, but it interferes
/// with shader side effects (stores and atomics). This pass ensures that WQM
/// is enabled when necessary, but disabled around stores and atomics.
21 ///
22 /// When necessary, this pass creates a function prolog
23 ///
24 ///   S_MOV_B64 LiveMask, EXEC
25 ///   S_WQM_B64 EXEC, EXEC
26 ///
27 /// to enter WQM at the top of the function and surrounds blocks of Exact
28 /// instructions by
29 ///
30 ///   S_AND_SAVEEXEC_B64 Tmp, LiveMask
31 ///   ...
32 ///   S_MOV_B64 EXEC, Tmp
33 ///
34 /// We also compute when a sequence of instructions requires strict whole
35 /// wavefront mode (StrictWWM) and insert instructions to save and restore it:
36 ///
37 ///   S_OR_SAVEEXEC_B64 Tmp, -1
38 ///   ...
39 ///   S_MOV_B64 EXEC, Tmp
40 ///
41 /// When a sequence of instructions requires strict whole quad mode (StrictWQM)
42 /// we use a similar save and restore mechanism and force whole quad mode for
43 /// those instructions:
44 ///
45 ///  S_MOV_B64 Tmp, EXEC
46 ///  S_WQM_B64 EXEC, EXEC
47 ///  ...
48 ///  S_MOV_B64 EXEC, Tmp
49 ///
50 /// In order to avoid excessive switching during sequences of Exact
51 /// instructions, the pass first analyzes which instructions must be run in WQM
52 /// (aka which instructions produce values that lead to derivative
53 /// computations).
54 ///
55 /// Basic blocks are always exited in WQM as long as some successor needs WQM.
56 ///
57 /// There is room for improvement given better control flow analysis:
58 ///
59 ///  (1) at the top level (outside of control flow statements, and as long as
60 ///      kill hasn't been used), one SGPR can be saved by recovering WQM from
61 ///      the LiveMask (this is implemented for the entry block).
62 ///
63 ///  (2) when entire regions (e.g. if-else blocks or entire loops) only
64 ///      consist of exact and don't-care instructions, the switch only has to
65 ///      be done at the entry and exit points rather than potentially in each
66 ///      block of the region.
67 ///
68 //===----------------------------------------------------------------------===//
69 
70 #include "AMDGPU.h"
71 #include "GCNSubtarget.h"
72 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
73 #include "llvm/ADT/MapVector.h"
74 #include "llvm/ADT/PostOrderIterator.h"
75 #include "llvm/CodeGen/LiveIntervals.h"
76 #include "llvm/CodeGen/MachineBasicBlock.h"
77 #include "llvm/CodeGen/MachineDominators.h"
78 #include "llvm/CodeGen/MachineFunctionPass.h"
79 #include "llvm/CodeGen/MachineInstr.h"
80 #include "llvm/CodeGen/MachinePostDominators.h"
81 #include "llvm/IR/CallingConv.h"
82 #include "llvm/InitializePasses.h"
83 #include "llvm/Support/raw_ostream.h"
84 
85 using namespace llvm;
86 
87 #define DEBUG_TYPE "si-wqm"
88 
89 namespace {
90 
/// Execution states tracked by this pass. Every state occupies its own bit so
/// that sets of states can be combined and tested with bitwise operators.
enum {
  StateWQM = 1 << 0,
  StateStrictWWM = 1 << 1,
  StateStrictWQM = 1 << 2,
  StateExact = 1 << 3,
  /// Mask covering both strict modes.
  StateStrict = StateStrictWQM | StateStrictWWM,
};
98 
/// Tag type wrapping a state bitmask so that it can be passed to a dedicated
/// stream operator instead of being printed as a plain integer.
struct PrintState {
  /// The wrapped bitmask of State* values.
  int State;

  explicit PrintState(int Mask) : State(Mask) {}
};
105 
106 #ifndef NDEBUG
107 static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {
108 
109   static const std::pair<char, const char *> Mapping[] = {
110       std::make_pair(StateWQM, "WQM"),
111       std::make_pair(StateStrictWWM, "StrictWWM"),
112       std::make_pair(StateStrictWQM, "StrictWQM"),
113       std::make_pair(StateExact, "Exact")};
114   char State = PS.State;
115   for (auto M : Mapping) {
116     if (State & M.first) {
117       OS << M.second;
118       State &= ~M.first;
119 
120       if (State)
121         OS << '|';
122     }
123   }
124   assert(State == 0);
125   return OS;
126 }
127 #endif
128 
/// Per-instruction analysis record. Each field is a bitmask of State* values.
struct InstrInfo {
  /// States this instruction has been marked as requiring.
  char Needs{0};
  /// States that are stripped from any request made for this instruction.
  char Disabled{0};
  /// States propagated backwards to this instruction from what follows it.
  char OutNeeds{0};
};
134 
/// Per-basic-block analysis record. The char fields are bitmasks of State*
/// values.
struct BlockInfo {
  /// States required by instructions inside the block.
  char Needs{0};
  /// States the block requires on entry.
  char InNeeds{0};
  /// States the block must provide on exit.
  char OutNeeds{0};
  /// NOTE(review): not written in the visible part of this file; presumably
  /// the state the block starts in -- confirm against the users.
  char InitialState{0};
  /// Set when the block contains a kill/demote instruction to be lowered.
  bool NeedsLowering{false};
};
142 
143 struct WorkItem {
144   MachineBasicBlock *MBB = nullptr;
145   MachineInstr *MI = nullptr;
146 
147   WorkItem() = default;
148   WorkItem(MachineBasicBlock *MBB) : MBB(MBB) {}
149   WorkItem(MachineInstr *MI) : MI(MI) {}
150 };
151 
/// Machine function pass implementing the whole quad mode handling described
/// in the file header: it analyzes which instructions and blocks need which
/// execution state (WQM, StrictWWM, StrictWQM, Exact) and provides the helpers
/// used to switch between those states and to lower the related pseudo
/// instructions.
class SIWholeQuadMode : public MachineFunctionPass {
private:
  // Target and analysis handles used throughout the pass.
  // NOTE(review): these are assigned outside the visible part of this file.
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;
  const GCNSubtarget *ST;
  MachineRegisterInfo *MRI;
  LiveIntervals *LIS;
  MachineDominatorTree *MDT;
  MachinePostDominatorTree *PDT;

  // Opcodes and registers used when emitting exec/live mask manipulation.
  // NOTE(review): presumably selected according to the wavefront size; the
  // assignment is not visible here -- confirm.
  unsigned AndOpc;
  unsigned AndN2Opc;
  unsigned XorOpc;
  unsigned AndSaveExecOpc;
  unsigned OrSaveExecOpc;
  unsigned WQMOpc;
  Register Exec;
  Register LiveMaskReg;

  // Analysis results, keyed by instruction and by basic block.
  DenseMap<const MachineInstr *, InstrInfo> Instructions;
  MapVector<MachineBasicBlock *, BlockInfo> Blocks;

  // Tracks state (WQM/StrictWWM/StrictWQM/Exact) after a given instruction
  DenseMap<const MachineInstr *, char> StateTransition;

  // Instructions collected by scanInstructions() for later lowering:
  //  - LiveMaskQueries:   SI_PS_LIVE and SI_LIVE_MASK
  //  - LowerToMovInstrs:  STRICT_WWM and STRICT_WQM
  //  - LowerToCopyInstrs: WQM, SOFT_WQM and V_SET_INACTIVE with an undef
  //                       inactive operand
  //  - KillInstrs:        SI_KILL_*_TERMINATOR and SI_DEMOTE_I1
  SmallVector<MachineInstr *, 2> LiveMaskQueries;
  SmallVector<MachineInstr *, 4> LowerToMovInstrs;
  SmallVector<MachineInstr *, 4> LowerToCopyInstrs;
  SmallVector<MachineInstr *, 4> KillInstrs;

  // Debug dump of the analysis results (only defined in asserts builds).
  void printInfo();

  // Analysis: seed state requirements and propagate them through a worklist.
  void markInstruction(MachineInstr &MI, char Flag,
                       std::vector<WorkItem> &Worklist);
  void markDefs(const MachineInstr &UseMI, LiveRange &LR, Register Reg,
                unsigned SubReg, char Flag, std::vector<WorkItem> &Worklist);
  void markOperand(const MachineInstr &MI, const MachineOperand &Op, char Flag,
                   std::vector<WorkItem> &Worklist);
  void markInstructionUses(const MachineInstr &MI, char Flag,
                           std::vector<WorkItem> &Worklist);
  char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
  void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
  void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
  char analyzeFunction(MachineFunction &MF);

  // Helpers for choosing insertion points and emitting state transitions.
  MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB,
                                      MachineBasicBlock::iterator Before);
  MachineBasicBlock::iterator
  prepareInsertion(MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
                   MachineBasicBlock::iterator Last, bool PreferLast,
                   bool SaveSCC);
  void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
               Register SaveWQM);
  void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
             Register SavedWQM);
  void toStrictMode(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
                    Register SaveOrig, char StrictStateNeeded);
  void fromStrictMode(MachineBasicBlock &MBB,
                      MachineBasicBlock::iterator Before, Register SavedOrig,
                      char NonStrictState, char CurrentStrictState);

  // Split BB at TermMI, keeping the dominator trees and LiveIntervals updated.
  MachineBasicBlock *splitBlock(MachineBasicBlock *BB, MachineInstr *TermMI);

  // Lowering of kill pseudo instructions.
  MachineInstr *lowerKillI1(MachineBasicBlock &MBB, MachineInstr &MI,
                            bool IsWQM);
  MachineInstr *lowerKillF32(MachineBasicBlock &MBB, MachineInstr &MI);

  // Per-block processing.
  void lowerBlock(MachineBasicBlock &MBB);
  void processBlock(MachineBasicBlock &MBB, bool IsEntry);

  // Lowering of the instructions collected during the scan.
  void lowerLiveMaskQueries();
  void lowerCopyInstrs();
  void lowerKillInstrs(bool IsWQM);

public:
  static char ID;

  SIWholeQuadMode() :
    MachineFunctionPass(ID) { }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Whole Quad Mode"; }

  // Requires LiveIntervals and both dominator trees, and declares them (plus
  // SlotIndexes) as preserved.
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<LiveIntervals>();
    AU.addPreserved<SlotIndexes>();
    AU.addPreserved<LiveIntervals>();
    AU.addRequired<MachineDominatorTree>();
    AU.addPreserved<MachineDominatorTree>();
    AU.addRequired<MachinePostDominatorTree>();
    AU.addPreserved<MachinePostDominatorTree>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  // The IsSSA property is cleared after this pass has run.
  MachineFunctionProperties getClearedProperties() const override {
    return MachineFunctionProperties().set(
        MachineFunctionProperties::Property::IsSSA);
  }
};
252 
253 } // end anonymous namespace
254 
// Pass identifier; it is handed to the MachineFunctionPass constructor and
// exported below as llvm::SIWholeQuadModeID.
char SIWholeQuadMode::ID = 0;

// Register the pass under DEBUG_TYPE ("si-wqm") and declare its dependencies
// on the analyses requested in getAnalysisUsage().
INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
                    false)

// Externally visible reference to the pass identifier.
char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID;
266 
267 FunctionPass *llvm::createSIWholeQuadModePass() {
268   return new SIWholeQuadMode;
269 }
270 
271 #ifndef NDEBUG
272 LLVM_DUMP_METHOD void SIWholeQuadMode::printInfo() {
273   for (const auto &BII : Blocks) {
274     dbgs() << "\n"
275            << printMBBReference(*BII.first) << ":\n"
276            << "  InNeeds = " << PrintState(BII.second.InNeeds)
277            << ", Needs = " << PrintState(BII.second.Needs)
278            << ", OutNeeds = " << PrintState(BII.second.OutNeeds) << "\n\n";
279 
280     for (const MachineInstr &MI : *BII.first) {
281       auto III = Instructions.find(&MI);
282       if (III == Instructions.end())
283         continue;
284 
285       dbgs() << "  " << MI << "    Needs = " << PrintState(III->second.Needs)
286              << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n';
287     }
288   }
289 }
290 #endif
291 
292 void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
293                                       std::vector<WorkItem> &Worklist) {
294   InstrInfo &II = Instructions[&MI];
295 
296   assert(!(Flag & StateExact) && Flag != 0);
297 
298   // Remove any disabled states from the flag. The user that required it gets
299   // an undefined value in the helper lanes. For example, this can happen if
300   // the result of an atomic is used by instruction that requires WQM, where
301   // ignoring the request for WQM is correct as per the relevant specs.
302   Flag &= ~II.Disabled;
303 
304   // Ignore if the flag is already encompassed by the existing needs, or we
305   // just disabled everything.
306   if ((II.Needs & Flag) == Flag)
307     return;
308 
309   LLVM_DEBUG(dbgs() << "markInstruction " << PrintState(Flag) << ": " << MI);
310   II.Needs |= Flag;
311   Worklist.push_back(&MI);
312 }
313 
/// Mark all relevant definitions of register \p Reg in usage \p UseMI.
///
/// Starting from the value of \p LR that is live into \p UseMI, walk backwards
/// through the value numbers (following phi-defs into their predecessors) and
/// call markInstruction() with \p Flag on the defining instructions.
///
/// \param UseMI    Instruction using the register.
/// \param LR       Live range to walk (a virtual register interval or a
///                 register unit range, see markOperand()).
/// \param Reg      Register being traced; for physical registers the callers
///                 in this file pass a register unit here.
/// \param SubReg   Sub-register index of the use, or 0 for the full register.
/// \param Flag     State(s) to mark the definitions with.
/// \param Worklist Worklist receiving newly marked instructions.
void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR,
                               Register Reg, unsigned SubReg, char Flag,
                               std::vector<WorkItem> &Worklist) {
  LLVM_DEBUG(dbgs() << "markDefs " << PrintState(Flag) << ": " << UseMI);

  // Nothing to trace if no value is live into the use.
  LiveQueryResult UseLRQ = LR.Query(LIS->getInstructionIndex(UseMI));
  const VNInfo *Value = UseLRQ.valueIn();
  if (!Value)
    return;

  // Note: this code assumes that lane masks on AMDGPU completely
  // cover registers.
  // For physical registers the use mask is empty, so lane tracking is
  // effectively unused on that path.
  const LaneBitmask UseLanes =
      SubReg ? TRI->getSubRegIndexLaneMask(SubReg)
             : (Reg.isVirtual() ? MRI->getMaxLaneMaskForVReg(Reg)
                                : LaneBitmask::getNone());

  // Perform a depth-first iteration of the LiveRange graph marking defs.
  // Stop processing of a given branch when all use lanes have been defined.
  // The first definition stops processing for a physical register.
  struct PhiEntry {
    const VNInfo *Phi;        // Phi-def value to resume at
    unsigned PredIdx;         // Index of the next predecessor to examine
    LaneBitmask DefinedLanes; // Lanes defined when the phi was reached

    PhiEntry(const VNInfo *Phi, unsigned PredIdx, LaneBitmask DefinedLanes)
        : Phi(Phi), PredIdx(PredIdx), DefinedLanes(DefinedLanes) {}
  };
  // A value is considered visited per set of lanes already defined on the
  // path that reached it.
  using VisitKey = std::pair<const VNInfo *, LaneBitmask>;
  SmallVector<PhiEntry, 2> PhiStack;
  SmallSet<VisitKey, 4> Visited;
  LaneBitmask DefinedLanes;
  unsigned NextPredIdx = 0; // Only used for processing phi nodes
  do {
    const VNInfo *NextValue = nullptr;
    const VisitKey Key(Value, DefinedLanes);

    if (!Visited.count(Key)) {
      Visited.insert(Key);
      // On first visit to a phi then start processing first predecessor
      NextPredIdx = 0;
    }

    if (Value->isPHIDef()) {
      // Each predecessor node in the phi must be processed as a subgraph
      const MachineBasicBlock *MBB = LIS->getMBBFromIndex(Value->def);
      assert(MBB && "Phi-def has no defining MBB");

      // Find next predecessor to process
      unsigned Idx = NextPredIdx;
      auto PI = MBB->pred_begin() + Idx;
      auto PE = MBB->pred_end();
      for (; PI != PE && !NextValue; ++PI, ++Idx) {
        if (const VNInfo *VN = LR.getVNInfoBefore(LIS->getMBBEndIdx(*PI))) {
          if (!Visited.count(VisitKey(VN, DefinedLanes)))
            NextValue = VN;
        }
      }

      // If there are more predecessors to process; add phi to stack
      if (PI != PE)
        PhiStack.emplace_back(Value, Idx, DefinedLanes);
    } else {
      MachineInstr *MI = LIS->getInstructionFromIndex(Value->def);
      assert(MI && "Def has no defining instruction");

      if (Reg.isVirtual()) {
        // Iterate over all operands to find relevant definitions
        bool HasDef = false;
        for (const MachineOperand &Op : MI->operands()) {
          if (!(Op.isReg() && Op.isDef() && Op.getReg() == Reg))
            continue;

          // Compute lanes defined and overlap with use
          LaneBitmask OpLanes =
              Op.isUndef() ? LaneBitmask::getAll()
                           : TRI->getSubRegIndexLaneMask(Op.getSubReg());
          LaneBitmask Overlap = (UseLanes & OpLanes);

          // Record if this instruction defined any of use
          HasDef |= Overlap.any();

          // Mark any lanes defined
          DefinedLanes |= OpLanes;
        }

        // Check if all lanes of use have been defined
        if ((DefinedLanes & UseLanes) != UseLanes) {
          // Definition not complete; need to process input value
          LiveQueryResult LRQ = LR.Query(LIS->getInstructionIndex(*MI));
          if (const VNInfo *VN = LRQ.valueIn()) {
            if (!Visited.count(VisitKey(VN, DefinedLanes)))
              NextValue = VN;
          }
        }

        // Only mark the instruction if it defines some part of the use
        if (HasDef)
          markInstruction(*MI, Flag, Worklist);
      } else {
        // For physical registers simply mark the defining instruction
        markInstruction(*MI, Flag, Worklist);
      }
    }

    if (!NextValue && !PhiStack.empty()) {
      // Reach end of chain; revert to processing last phi
      // The saved predecessor index and lane set are restored so the walk
      // continues where that phi left off.
      PhiEntry &Entry = PhiStack.back();
      NextValue = Entry.Phi;
      NextPredIdx = Entry.PredIdx;
      DefinedLanes = Entry.DefinedLanes;
      PhiStack.pop_back();
    }

    Value = NextValue;
  } while (Value);
}
432 
433 void SIWholeQuadMode::markOperand(const MachineInstr &MI,
434                                   const MachineOperand &Op, char Flag,
435                                   std::vector<WorkItem> &Worklist) {
436   assert(Op.isReg());
437   Register Reg = Op.getReg();
438 
439   // Ignore some hardware registers
440   switch (Reg) {
441   case AMDGPU::EXEC:
442   case AMDGPU::EXEC_LO:
443     return;
444   default:
445     break;
446   }
447 
448   LLVM_DEBUG(dbgs() << "markOperand " << PrintState(Flag) << ": " << Op
449                     << " for " << MI);
450   if (Reg.isVirtual()) {
451     LiveRange &LR = LIS->getInterval(Reg);
452     markDefs(MI, LR, Reg, Op.getSubReg(), Flag, Worklist);
453   } else {
454     // Handle physical registers that we need to track; this is mostly relevant
455     // for VCC, which can appear as the (implicit) input of a uniform branch,
456     // e.g. when a loop counter is stored in a VGPR.
457     for (MCRegUnitIterator RegUnit(Reg.asMCReg(), TRI); RegUnit.isValid();
458          ++RegUnit) {
459       LiveRange &LR = LIS->getRegUnit(*RegUnit);
460       const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
461       if (!Value)
462         continue;
463 
464       markDefs(MI, LR, *RegUnit, AMDGPU::NoSubRegister, Flag, Worklist);
465     }
466   }
467 }
468 
469 /// Mark all instructions defining the uses in \p MI with \p Flag.
470 void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
471                                           std::vector<WorkItem> &Worklist) {
472   LLVM_DEBUG(dbgs() << "markInstructionUses " << PrintState(Flag) << ": "
473                     << MI);
474 
475   for (const MachineOperand &Use : MI.uses()) {
476     if (!Use.isReg() || !Use.isUse())
477       continue;
478     markOperand(MI, Use, Flag, Worklist);
479   }
480 }
481 
// Scan instructions to determine which ones require an Exact execmask and
// which ones seed WQM requirements.
//
// Besides seeding Worklist, this collects the pseudo instructions that need
// lowering later (LiveMaskQueries, LowerToMovInstrs, LowerToCopyInstrs,
// KillInstrs) and creates a BlockInfo entry for every visited block.
//
// Returns the union of all State* flags found to be needed anywhere in MF.
char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
                                       std::vector<WorkItem> &Worklist) {
  char GlobalFlags = 0;
  bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs");
  SmallVector<MachineInstr *, 4> SetInactiveInstrs;
  SmallVector<MachineInstr *, 4> SoftWQMInstrs;
  // Only pixel shaders (AMDGPU_PS calling convention) are treated as having
  // implicit derivatives.
  bool HasImplicitDerivatives =
      MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS;

  // We need to visit the basic blocks in reverse post-order so that we visit
  // defs before uses, in particular so that we don't accidentally mark an
  // instruction as needing e.g. WQM before visiting it and realizing it needs
  // WQM disabled.
  ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
  for (MachineBasicBlock *MBB : RPOT) {
    BlockInfo &BBI = Blocks[MBB];

    for (MachineInstr &MI : *MBB) {
      // Note: III refers into the Instructions map. It is only written before
      // any call below that may add entries to that map (markOperand and
      // markInstruction both end up in Instructions[...]).
      InstrInfo &III = Instructions[&MI];
      unsigned Opcode = MI.getOpcode();
      char Flags = 0;

      if (TII->isWQM(Opcode)) {
        // If LOD is not supported WQM is not needed.
        if (!ST->hasExtendedImageInsts())
          continue;
        // Only generate implicit WQM if implicit derivatives are required.
        // This avoids inserting unintended WQM if a shader type without
        // implicit derivatives uses an image sampling instruction.
        if (!HasImplicitDerivatives)
          continue;
        // Sampling instructions don't need to produce results for all pixels
        // in a quad, they just require all inputs of a quad to have been
        // computed for derivatives.
        markInstructionUses(MI, StateWQM, Worklist);
        GlobalFlags |= StateWQM;
        continue;
      } else if (Opcode == AMDGPU::WQM) {
        // The WQM intrinsic requires its output to have all the helper lanes
        // correct, so we need it to be in WQM.
        Flags = StateWQM;
        LowerToCopyInstrs.push_back(&MI);
      } else if (Opcode == AMDGPU::SOFT_WQM) {
        // Marked with StateWQM after the scan, and only if WQM turns out to
        // be needed somewhere in the function.
        LowerToCopyInstrs.push_back(&MI);
        SoftWQMInstrs.push_back(&MI);
        continue;
      } else if (Opcode == AMDGPU::STRICT_WWM) {
        // The STRICT_WWM intrinsic doesn't make the same guarantee, and plus
        // it needs to be executed in WQM or Exact so that its copy doesn't
        // clobber inactive lanes.
        markInstructionUses(MI, StateStrictWWM, Worklist);
        GlobalFlags |= StateStrictWWM;
        LowerToMovInstrs.push_back(&MI);
        continue;
      } else if (Opcode == AMDGPU::STRICT_WQM) {
        // STRICT_WQM is similar to STRICTWWM, but instead of enabling all
        // threads of the wave like STRICTWWM, STRICT_WQM enables all threads in
        // quads that have at least one active thread.
        markInstructionUses(MI, StateStrictWQM, Worklist);
        GlobalFlags |= StateStrictWQM;
        LowerToMovInstrs.push_back(&MI);
        continue;
      } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 ||
                 Opcode == AMDGPU::V_SET_INACTIVE_B64) {
        // The instruction itself must not run in a strict mode; only the
        // definition of its inactive-lane operand (operand 2) is requested in
        // StrictWWM.
        III.Disabled = StateStrict;
        MachineOperand &Inactive = MI.getOperand(2);
        if (Inactive.isReg()) {
          if (Inactive.isUndef()) {
            LowerToCopyInstrs.push_back(&MI);
          } else {
            markOperand(MI, Inactive, StateStrictWWM, Worklist);
          }
        }
        SetInactiveInstrs.push_back(&MI);
        continue;
      } else if (TII->isDisableWQM(MI)) {
        // This instruction must run Exact: record that on the block and
        // disable every other state for the instruction.
        BBI.Needs |= StateExact;
        if (!(BBI.InNeeds & StateExact)) {
          BBI.InNeeds |= StateExact;
          Worklist.push_back(MBB);
        }
        GlobalFlags |= StateExact;
        III.Disabled = StateWQM | StateStrict;
        continue;
      } else {
        if (Opcode == AMDGPU::SI_PS_LIVE || Opcode == AMDGPU::SI_LIVE_MASK) {
          LiveMaskQueries.push_back(&MI);
        } else if (Opcode == AMDGPU::SI_KILL_I1_TERMINATOR ||
                   Opcode == AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR ||
                   Opcode == AMDGPU::SI_DEMOTE_I1) {
          KillInstrs.push_back(&MI);
          BBI.NeedsLowering = true;
        } else if (WQMOutputs) {
          // The function is in machine SSA form, which means that physical
          // VGPRs correspond to shader inputs and outputs. Inputs are
          // only used, outputs are only defined.
          // FIXME: is this still valid?
          for (const MachineOperand &MO : MI.defs()) {
            if (!MO.isReg())
              continue;

            Register Reg = MO.getReg();

            if (!Reg.isVirtual() &&
                TRI->hasVectorRegisters(TRI->getPhysRegClass(Reg))) {
              Flags = StateWQM;
              break;
            }
          }
        }

        // Nothing to mark for this instruction.
        if (!Flags)
          continue;
      }

      // Reached only by the branches that set Flags without a 'continue'.
      markInstruction(MI, Flags, Worklist);
      GlobalFlags |= Flags;
    }
  }

  // Make sure that any SET_INACTIVE instructions are computed in WQM if WQM is
  // ever used anywhere in the function. This implements the corresponding
  // semantics of @llvm.amdgcn.set.inactive.
  // Similarly for SOFT_WQM instructions, implementing @llvm.amdgcn.softwqm.
  if (GlobalFlags & StateWQM) {
    for (MachineInstr *MI : SetInactiveInstrs)
      markInstruction(*MI, StateWQM, Worklist);
    for (MachineInstr *MI : SoftWQMInstrs)
      markInstruction(*MI, StateWQM, Worklist);
  }

  return GlobalFlags;
}
617 
/// Propagate the recorded needs of \p MI: up to its block, backwards to the
/// previous instruction in the block, and to the instructions defining its
/// inputs. Anything whose information changes is pushed onto \p Worklist.
void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
                                           std::vector<WorkItem>& Worklist) {
  MachineBasicBlock *MBB = MI.getParent();
  InstrInfo II = Instructions[&MI]; // take a copy to prevent dangling references
  BlockInfo &BI = Blocks[MBB];

  // Control flow-type instructions and stores to temporary memory that are
  // followed by WQM computations must themselves be in WQM.
  if ((II.OutNeeds & StateWQM) && !(II.Disabled & StateWQM) &&
      (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) {
    // Update both the map entry and the local copy used below.
    Instructions[&MI].Needs = StateWQM;
    II.Needs = StateWQM;
  }

  // Propagate to block level
  if (II.Needs & StateWQM) {
    BI.Needs |= StateWQM;
    if (!(BI.InNeeds & StateWQM)) {
      BI.InNeeds |= StateWQM;
      Worklist.push_back(MBB);
    }
  }

  // Propagate backwards within block
  // Strict states are masked out of what is passed to the previous
  // instruction, and PHIs do not receive OutNeeds here.
  if (MachineInstr *PrevMI = MI.getPrevNode()) {
    char InNeeds = (II.Needs & ~StateStrict) | II.OutNeeds;
    if (!PrevMI->isPHI()) {
      InstrInfo &PrevII = Instructions[PrevMI];
      if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
        PrevII.OutNeeds |= InNeeds;
        Worklist.push_back(PrevMI);
      }
    }
  }

  // Propagate WQM flag to instruction inputs
  assert(!(II.Needs & StateExact));

  if (II.Needs != 0)
    markInstructionUses(MI, II.Needs, Worklist);

  // Ensure we process a block containing StrictWWM/StrictWQM, even if it does
  // not require any WQM transitions.
  if (II.Needs & StateStrictWWM)
    BI.Needs |= StateStrictWWM;
  if (II.Needs & StateStrictWQM)
    BI.Needs |= StateStrictWQM;
}
666 
667 void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
668                                      std::vector<WorkItem>& Worklist) {
669   BlockInfo BI = Blocks[&MBB]; // Make a copy to prevent dangling references.
670 
671   // Propagate through instructions
672   if (!MBB.empty()) {
673     MachineInstr *LastMI = &*MBB.rbegin();
674     InstrInfo &LastII = Instructions[LastMI];
675     if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) {
676       LastII.OutNeeds |= BI.OutNeeds;
677       Worklist.push_back(LastMI);
678     }
679   }
680 
681   // Predecessor blocks must provide for our WQM/Exact needs.
682   for (MachineBasicBlock *Pred : MBB.predecessors()) {
683     BlockInfo &PredBI = Blocks[Pred];
684     if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds)
685       continue;
686 
687     PredBI.OutNeeds |= BI.InNeeds;
688     PredBI.InNeeds |= BI.InNeeds;
689     Worklist.push_back(Pred);
690   }
691 
692   // All successors must be prepared to accept the same set of WQM/Exact data.
693   for (MachineBasicBlock *Succ : MBB.successors()) {
694     BlockInfo &SuccBI = Blocks[Succ];
695     if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds)
696       continue;
697 
698     SuccBI.InNeeds |= BI.OutNeeds;
699     Worklist.push_back(Succ);
700   }
701 }
702 
703 char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
704   std::vector<WorkItem> Worklist;
705   char GlobalFlags = scanInstructions(MF, Worklist);
706 
707   while (!Worklist.empty()) {
708     WorkItem WI = Worklist.back();
709     Worklist.pop_back();
710 
711     if (WI.MI)
712       propagateInstruction(*WI.MI, Worklist);
713     else
714       propagateBlock(*WI.MBB, Worklist);
715   }
716 
717   return GlobalFlags;
718 }
719 
720 MachineBasicBlock::iterator
721 SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB,
722                          MachineBasicBlock::iterator Before) {
723   Register SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
724 
725   MachineInstr *Save =
726       BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), SaveReg)
727           .addReg(AMDGPU::SCC);
728   MachineInstr *Restore =
729       BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::SCC)
730           .addReg(SaveReg);
731 
732   LIS->InsertMachineInstrInMaps(*Save);
733   LIS->InsertMachineInstrInMaps(*Restore);
734   LIS->createAndComputeVirtRegInterval(SaveReg);
735 
736   return Restore;
737 }
738 
739 MachineBasicBlock *SIWholeQuadMode::splitBlock(MachineBasicBlock *BB,
740                                                MachineInstr *TermMI) {
741   LLVM_DEBUG(dbgs() << "Split block " << printMBBReference(*BB) << " @ "
742                     << *TermMI << "\n");
743 
744   MachineBasicBlock *SplitBB =
745       BB->splitAt(*TermMI, /*UpdateLiveIns*/ true, LIS);
746 
747   // Convert last instruction in block to a terminator.
748   // Note: this only covers the expected patterns
749   unsigned NewOpcode = 0;
750   switch (TermMI->getOpcode()) {
751   case AMDGPU::S_AND_B32:
752     NewOpcode = AMDGPU::S_AND_B32_term;
753     break;
754   case AMDGPU::S_AND_B64:
755     NewOpcode = AMDGPU::S_AND_B64_term;
756     break;
757   case AMDGPU::S_MOV_B32:
758     NewOpcode = AMDGPU::S_MOV_B32_term;
759     break;
760   case AMDGPU::S_MOV_B64:
761     NewOpcode = AMDGPU::S_MOV_B64_term;
762     break;
763   default:
764     break;
765   }
766   if (NewOpcode)
767     TermMI->setDesc(TII->get(NewOpcode));
768 
769   if (SplitBB != BB) {
770     // Update dominator trees
771     using DomTreeT = DomTreeBase<MachineBasicBlock>;
772     SmallVector<DomTreeT::UpdateType, 16> DTUpdates;
773     for (MachineBasicBlock *Succ : SplitBB->successors()) {
774       DTUpdates.push_back({DomTreeT::Insert, SplitBB, Succ});
775       DTUpdates.push_back({DomTreeT::Delete, BB, Succ});
776     }
777     DTUpdates.push_back({DomTreeT::Insert, BB, SplitBB});
778     if (MDT)
779       MDT->getBase().applyUpdates(DTUpdates);
780     if (PDT)
781       PDT->getBase().applyUpdates(DTUpdates);
782 
783     // Link blocks
784     MachineInstr *MI =
785         BuildMI(*BB, BB->end(), DebugLoc(), TII->get(AMDGPU::S_BRANCH))
786             .addMBB(SplitBB);
787     LIS->InsertMachineInstrInMaps(*MI);
788   }
789 
790   return SplitBB;
791 }
792 
793 MachineInstr *SIWholeQuadMode::lowerKillF32(MachineBasicBlock &MBB,
794                                             MachineInstr &MI) {
795   const DebugLoc &DL = MI.getDebugLoc();
796   unsigned Opcode = 0;
797 
798   assert(MI.getOperand(0).isReg());
799 
800   // Comparison is for live lanes; however here we compute the inverse
801   // (killed lanes).  This is because VCMP will always generate 0 bits
802   // for inactive lanes so a mask of live lanes would not be correct
803   // inside control flow.
804   // Invert the comparison by swapping the operands and adjusting
805   // the comparison codes.
806 
807   switch (MI.getOperand(2).getImm()) {
808   case ISD::SETUEQ:
809     Opcode = AMDGPU::V_CMP_LG_F32_e64;
810     break;
811   case ISD::SETUGT:
812     Opcode = AMDGPU::V_CMP_GE_F32_e64;
813     break;
814   case ISD::SETUGE:
815     Opcode = AMDGPU::V_CMP_GT_F32_e64;
816     break;
817   case ISD::SETULT:
818     Opcode = AMDGPU::V_CMP_LE_F32_e64;
819     break;
820   case ISD::SETULE:
821     Opcode = AMDGPU::V_CMP_LT_F32_e64;
822     break;
823   case ISD::SETUNE:
824     Opcode = AMDGPU::V_CMP_EQ_F32_e64;
825     break;
826   case ISD::SETO:
827     Opcode = AMDGPU::V_CMP_O_F32_e64;
828     break;
829   case ISD::SETUO:
830     Opcode = AMDGPU::V_CMP_U_F32_e64;
831     break;
832   case ISD::SETOEQ:
833   case ISD::SETEQ:
834     Opcode = AMDGPU::V_CMP_NEQ_F32_e64;
835     break;
836   case ISD::SETOGT:
837   case ISD::SETGT:
838     Opcode = AMDGPU::V_CMP_NLT_F32_e64;
839     break;
840   case ISD::SETOGE:
841   case ISD::SETGE:
842     Opcode = AMDGPU::V_CMP_NLE_F32_e64;
843     break;
844   case ISD::SETOLT:
845   case ISD::SETLT:
846     Opcode = AMDGPU::V_CMP_NGT_F32_e64;
847     break;
848   case ISD::SETOLE:
849   case ISD::SETLE:
850     Opcode = AMDGPU::V_CMP_NGE_F32_e64;
851     break;
852   case ISD::SETONE:
853   case ISD::SETNE:
854     Opcode = AMDGPU::V_CMP_NLG_F32_e64;
855     break;
856   default:
857     llvm_unreachable("invalid ISD:SET cond code");
858   }
859 
860   // Pick opcode based on comparison type.
861   MachineInstr *VcmpMI;
862   const MachineOperand &Op0 = MI.getOperand(0);
863   const MachineOperand &Op1 = MI.getOperand(1);
864 
865   // VCC represents lanes killed.
866   Register VCC = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;
867 
868   if (TRI->isVGPR(*MRI, Op0.getReg())) {
869     Opcode = AMDGPU::getVOPe32(Opcode);
870     VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode)).add(Op1).add(Op0);
871   } else {
872     VcmpMI = BuildMI(MBB, &MI, DL, TII->get(Opcode))
873                  .addReg(VCC, RegState::Define)
874                  .addImm(0) // src0 modifiers
875                  .add(Op1)
876                  .addImm(0) // src1 modifiers
877                  .add(Op0)
878                  .addImm(0); // omod
879   }
880 
881   MachineInstr *MaskUpdateMI =
882       BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
883           .addReg(LiveMaskReg)
884           .addReg(VCC);
885 
886   // State of SCC represents whether any lanes are live in mask,
887   // if SCC is 0 then no lanes will be alive anymore.
888   MachineInstr *EarlyTermMI =
889       BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0));
890 
891   MachineInstr *ExecMaskMI =
892       BuildMI(MBB, MI, DL, TII->get(AndN2Opc), Exec).addReg(Exec).addReg(VCC);
893 
894   assert(MBB.succ_size() == 1);
895   MachineInstr *NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
896                               .addMBB(*MBB.succ_begin());
897 
898   // Update live intervals
899   LIS->ReplaceMachineInstrInMaps(MI, *VcmpMI);
900   MBB.remove(&MI);
901 
902   LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
903   LIS->InsertMachineInstrInMaps(*ExecMaskMI);
904   LIS->InsertMachineInstrInMaps(*EarlyTermMI);
905   LIS->InsertMachineInstrInMaps(*NewTerm);
906 
907   return NewTerm;
908 }
909 
910 MachineInstr *SIWholeQuadMode::lowerKillI1(MachineBasicBlock &MBB,
911                                            MachineInstr &MI, bool IsWQM) {
912   const DebugLoc &DL = MI.getDebugLoc();
913   MachineInstr *MaskUpdateMI = nullptr;
914 
915   const bool IsDemote = IsWQM && (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1);
916   const MachineOperand &Op = MI.getOperand(0);
917   int64_t KillVal = MI.getOperand(1).getImm();
918   MachineInstr *ComputeKilledMaskMI = nullptr;
919   Register CndReg = !Op.isImm() ? Op.getReg() : Register();
920   Register TmpReg;
921 
922   // Is this a static or dynamic kill?
923   if (Op.isImm()) {
924     if (Op.getImm() == KillVal) {
925       // Static: all active lanes are killed
926       MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
927                          .addReg(LiveMaskReg)
928                          .addReg(Exec);
929     } else {
930       // Static: kill does nothing
931       MachineInstr *NewTerm = nullptr;
932       if (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1) {
933         LIS->RemoveMachineInstrFromMaps(MI);
934       } else {
935         assert(MBB.succ_size() == 1);
936         NewTerm = BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_BRANCH))
937                       .addMBB(*MBB.succ_begin());
938         LIS->ReplaceMachineInstrInMaps(MI, *NewTerm);
939       }
940       MBB.remove(&MI);
941       return NewTerm;
942     }
943   } else {
944     if (!KillVal) {
945       // Op represents live lanes after kill,
946       // so exec mask needs to be factored in.
947       TmpReg = MRI->createVirtualRegister(TRI->getBoolRC());
948       ComputeKilledMaskMI =
949           BuildMI(MBB, MI, DL, TII->get(XorOpc), TmpReg).add(Op).addReg(Exec);
950       MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
951                          .addReg(LiveMaskReg)
952                          .addReg(TmpReg);
953     } else {
954       // Op represents lanes to kill
955       MaskUpdateMI = BuildMI(MBB, MI, DL, TII->get(AndN2Opc), LiveMaskReg)
956                          .addReg(LiveMaskReg)
957                          .add(Op);
958     }
959   }
960 
961   // State of SCC represents whether any lanes are live in mask,
962   // if SCC is 0 then no lanes will be alive anymore.
963   MachineInstr *EarlyTermMI =
964       BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_EARLY_TERMINATE_SCC0));
965 
966   // In the case we got this far some lanes are still live,
967   // update EXEC to deactivate lanes as appropriate.
968   MachineInstr *NewTerm;
969   MachineInstr *WQMMaskMI = nullptr;
970   Register LiveMaskWQM;
971   if (IsDemote) {
972     // Demotes deactive quads with only helper lanes
973     LiveMaskWQM = MRI->createVirtualRegister(TRI->getBoolRC());
974     WQMMaskMI =
975         BuildMI(MBB, MI, DL, TII->get(WQMOpc), LiveMaskWQM).addReg(LiveMaskReg);
976     NewTerm = BuildMI(MBB, MI, DL, TII->get(AndOpc), Exec)
977                   .addReg(Exec)
978                   .addReg(LiveMaskWQM);
979   } else {
980     // Kills deactivate lanes
981     if (Op.isImm()) {
982       unsigned MovOpc = ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
983       NewTerm = BuildMI(MBB, &MI, DL, TII->get(MovOpc), Exec).addImm(0);
984     } else if (!IsWQM) {
985       NewTerm = BuildMI(MBB, &MI, DL, TII->get(AndOpc), Exec)
986                     .addReg(Exec)
987                     .addReg(LiveMaskReg);
988     } else {
989       unsigned Opcode = KillVal ? AndN2Opc : AndOpc;
990       NewTerm =
991           BuildMI(MBB, &MI, DL, TII->get(Opcode), Exec).addReg(Exec).add(Op);
992     }
993   }
994 
995   // Update live intervals
996   LIS->RemoveMachineInstrFromMaps(MI);
997   MBB.remove(&MI);
998   assert(EarlyTermMI);
999   assert(MaskUpdateMI);
1000   assert(NewTerm);
1001   if (ComputeKilledMaskMI)
1002     LIS->InsertMachineInstrInMaps(*ComputeKilledMaskMI);
1003   LIS->InsertMachineInstrInMaps(*MaskUpdateMI);
1004   LIS->InsertMachineInstrInMaps(*EarlyTermMI);
1005   if (WQMMaskMI)
1006     LIS->InsertMachineInstrInMaps(*WQMMaskMI);
1007   LIS->InsertMachineInstrInMaps(*NewTerm);
1008 
1009   if (CndReg) {
1010     LIS->removeInterval(CndReg);
1011     LIS->createAndComputeVirtRegInterval(CndReg);
1012   }
1013   if (TmpReg)
1014     LIS->createAndComputeVirtRegInterval(TmpReg);
1015   if (LiveMaskWQM)
1016     LIS->createAndComputeVirtRegInterval(LiveMaskWQM);
1017 
1018   return NewTerm;
1019 }
1020 
1021 // Replace (or supplement) instructions accessing live mask.
1022 // This can only happen once all the live mask registers have been created
1023 // and the execute state (WQM/StrictWWM/Exact) of instructions is known.
1024 void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
1025   auto BII = Blocks.find(&MBB);
1026   if (BII == Blocks.end())
1027     return;
1028 
1029   const BlockInfo &BI = BII->second;
1030   if (!BI.NeedsLowering)
1031     return;
1032 
1033   LLVM_DEBUG(dbgs() << "\nLowering block " << printMBBReference(MBB) << ":\n");
1034 
1035   SmallVector<MachineInstr *, 4> SplitPoints;
1036   char State = BI.InitialState;
1037 
1038   for (MachineInstr &MI : llvm::make_early_inc_range(
1039            llvm::make_range(MBB.getFirstNonPHI(), MBB.end()))) {
1040     if (StateTransition.count(&MI))
1041       State = StateTransition[&MI];
1042 
1043     MachineInstr *SplitPoint = nullptr;
1044     switch (MI.getOpcode()) {
1045     case AMDGPU::SI_DEMOTE_I1:
1046     case AMDGPU::SI_KILL_I1_TERMINATOR:
1047       SplitPoint = lowerKillI1(MBB, MI, State == StateWQM);
1048       break;
1049     case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
1050       SplitPoint = lowerKillF32(MBB, MI);
1051       break;
1052     default:
1053       break;
1054     }
1055     if (SplitPoint)
1056       SplitPoints.push_back(SplitPoint);
1057   }
1058 
1059   // Perform splitting after instruction scan to simplify iteration.
1060   if (!SplitPoints.empty()) {
1061     MachineBasicBlock *BB = &MBB;
1062     for (MachineInstr *MI : SplitPoints) {
1063       BB = splitBlock(BB, MI);
1064     }
1065   }
1066 }
1067 
1068 // Return an iterator in the (inclusive) range [First, Last] at which
1069 // instructions can be safely inserted, keeping in mind that some of the
1070 // instructions we want to add necessarily clobber SCC.
1071 MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion(
1072     MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
1073     MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC) {
1074   if (!SaveSCC)
1075     return PreferLast ? Last : First;
1076 
1077   LiveRange &LR =
1078       LIS->getRegUnit(*MCRegUnitIterator(MCRegister::from(AMDGPU::SCC), TRI));
1079   auto MBBE = MBB.end();
1080   SlotIndex FirstIdx = First != MBBE ? LIS->getInstructionIndex(*First)
1081                                      : LIS->getMBBEndIdx(&MBB);
1082   SlotIndex LastIdx =
1083       Last != MBBE ? LIS->getInstructionIndex(*Last) : LIS->getMBBEndIdx(&MBB);
1084   SlotIndex Idx = PreferLast ? LastIdx : FirstIdx;
1085   const LiveRange::Segment *S;
1086 
1087   for (;;) {
1088     S = LR.getSegmentContaining(Idx);
1089     if (!S)
1090       break;
1091 
1092     if (PreferLast) {
1093       SlotIndex Next = S->start.getBaseIndex();
1094       if (Next < FirstIdx)
1095         break;
1096       Idx = Next;
1097     } else {
1098       MachineInstr *EndMI = LIS->getInstructionFromIndex(S->end.getBaseIndex());
1099       assert(EndMI && "Segment does not end on valid instruction");
1100       auto NextI = std::next(EndMI->getIterator());
1101       if (NextI == MBB.end())
1102         break;
1103       SlotIndex Next = LIS->getInstructionIndex(*NextI);
1104       if (Next > LastIdx)
1105         break;
1106       Idx = Next;
1107     }
1108   }
1109 
1110   MachineBasicBlock::iterator MBBI;
1111 
1112   if (MachineInstr *MI = LIS->getInstructionFromIndex(Idx))
1113     MBBI = MI;
1114   else {
1115     assert(Idx == LIS->getMBBEndIdx(&MBB));
1116     MBBI = MBB.end();
1117   }
1118 
1119   // Move insertion point past any operations modifying EXEC.
1120   // This assumes that the value of SCC defined by any of these operations
1121   // does not need to be preserved.
1122   while (MBBI != Last) {
1123     bool IsExecDef = false;
1124     for (const MachineOperand &MO : MBBI->operands()) {
1125       if (MO.isReg() && MO.isDef()) {
1126         IsExecDef |=
1127             MO.getReg() == AMDGPU::EXEC_LO || MO.getReg() == AMDGPU::EXEC;
1128       }
1129     }
1130     if (!IsExecDef)
1131       break;
1132     MBBI++;
1133     S = nullptr;
1134   }
1135 
1136   if (S)
1137     MBBI = saveSCC(MBB, MBBI);
1138 
1139   return MBBI;
1140 }
1141 
1142 void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
1143                               MachineBasicBlock::iterator Before,
1144                               Register SaveWQM) {
1145   MachineInstr *MI;
1146 
1147   if (SaveWQM) {
1148     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AndSaveExecOpc), SaveWQM)
1149              .addReg(LiveMaskReg);
1150   } else {
1151     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AndOpc), Exec)
1152              .addReg(Exec)
1153              .addReg(LiveMaskReg);
1154   }
1155 
1156   LIS->InsertMachineInstrInMaps(*MI);
1157   StateTransition[MI] = StateExact;
1158 }
1159 
1160 void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
1161                             MachineBasicBlock::iterator Before,
1162                             Register SavedWQM) {
1163   MachineInstr *MI;
1164 
1165   if (SavedWQM) {
1166     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), Exec)
1167              .addReg(SavedWQM);
1168   } else {
1169     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(WQMOpc), Exec).addReg(Exec);
1170   }
1171 
1172   LIS->InsertMachineInstrInMaps(*MI);
1173   StateTransition[MI] = StateWQM;
1174 }
1175 
1176 void SIWholeQuadMode::toStrictMode(MachineBasicBlock &MBB,
1177                                    MachineBasicBlock::iterator Before,
1178                                    Register SaveOrig, char StrictStateNeeded) {
1179   MachineInstr *MI;
1180   assert(SaveOrig);
1181   assert(StrictStateNeeded == StateStrictWWM ||
1182          StrictStateNeeded == StateStrictWQM);
1183 
1184   if (StrictStateNeeded == StateStrictWWM) {
1185     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WWM),
1186                  SaveOrig)
1187              .addImm(-1);
1188   } else {
1189     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WQM),
1190                  SaveOrig)
1191              .addImm(-1);
1192   }
1193   LIS->InsertMachineInstrInMaps(*MI);
1194   StateTransition[MI] = StateStrictWWM;
1195 }
1196 
1197 void SIWholeQuadMode::fromStrictMode(MachineBasicBlock &MBB,
1198                                      MachineBasicBlock::iterator Before,
1199                                      Register SavedOrig, char NonStrictState,
1200                                      char CurrentStrictState) {
1201   MachineInstr *MI;
1202 
1203   assert(SavedOrig);
1204   assert(CurrentStrictState == StateStrictWWM ||
1205          CurrentStrictState == StateStrictWQM);
1206 
1207   if (CurrentStrictState == StateStrictWWM) {
1208     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WWM),
1209                  Exec)
1210              .addReg(SavedOrig);
1211   } else {
1212     MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WQM),
1213                  Exec)
1214              .addReg(SavedOrig);
1215   }
1216   LIS->InsertMachineInstrInMaps(*MI);
1217   StateTransition[MI] = NonStrictState;
1218 }
1219 
1220 void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) {
1221   auto BII = Blocks.find(&MBB);
1222   if (BII == Blocks.end())
1223     return;
1224 
1225   BlockInfo &BI = BII->second;
1226 
1227   // This is a non-entry block that is WQM throughout, so no need to do
1228   // anything.
1229   if (!IsEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact) {
1230     BI.InitialState = StateWQM;
1231     return;
1232   }
1233 
1234   LLVM_DEBUG(dbgs() << "\nProcessing block " << printMBBReference(MBB)
1235                     << ":\n");
1236 
1237   Register SavedWQMReg;
1238   Register SavedNonStrictReg;
1239   bool WQMFromExec = IsEntry;
1240   char State = (IsEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM;
1241   char NonStrictState = 0;
1242   const TargetRegisterClass *BoolRC = TRI->getBoolRC();
1243 
1244   auto II = MBB.getFirstNonPHI(), IE = MBB.end();
1245   if (IsEntry) {
1246     // Skip the instruction that saves LiveMask
1247     if (II != IE && II->getOpcode() == AMDGPU::COPY)
1248       ++II;
1249   }
1250 
1251   // This stores the first instruction where it's safe to switch from WQM to
1252   // Exact or vice versa.
1253   MachineBasicBlock::iterator FirstWQM = IE;
1254 
1255   // This stores the first instruction where it's safe to switch from Strict
1256   // mode to Exact/WQM or to switch to Strict mode. It must always be the same
1257   // as, or after, FirstWQM since if it's safe to switch to/from Strict, it must
1258   // be safe to switch to/from WQM as well.
1259   MachineBasicBlock::iterator FirstStrict = IE;
1260 
1261   // Record initial state is block information.
1262   BI.InitialState = State;
1263 
1264   for (;;) {
1265     MachineBasicBlock::iterator Next = II;
1266     char Needs = StateExact | StateWQM; // Strict mode is disabled by default.
1267     char OutNeeds = 0;
1268 
1269     if (FirstWQM == IE)
1270       FirstWQM = II;
1271 
1272     if (FirstStrict == IE)
1273       FirstStrict = II;
1274 
1275     // First, figure out the allowed states (Needs) based on the propagated
1276     // flags.
1277     if (II != IE) {
1278       MachineInstr &MI = *II;
1279 
1280       if (MI.isTerminator() || TII->mayReadEXEC(*MRI, MI)) {
1281         auto III = Instructions.find(&MI);
1282         if (III != Instructions.end()) {
1283           if (III->second.Needs & StateStrictWWM)
1284             Needs = StateStrictWWM;
1285           else if (III->second.Needs & StateStrictWQM)
1286             Needs = StateStrictWQM;
1287           else if (III->second.Needs & StateWQM)
1288             Needs = StateWQM;
1289           else
1290             Needs &= ~III->second.Disabled;
1291           OutNeeds = III->second.OutNeeds;
1292         }
1293       } else {
1294         // If the instruction doesn't actually need a correct EXEC, then we can
1295         // safely leave Strict mode enabled.
1296         Needs = StateExact | StateWQM | StateStrict;
1297       }
1298 
1299       if (MI.isTerminator() && OutNeeds == StateExact)
1300         Needs = StateExact;
1301 
1302       ++Next;
1303     } else {
1304       // End of basic block
1305       if (BI.OutNeeds & StateWQM)
1306         Needs = StateWQM;
1307       else if (BI.OutNeeds == StateExact)
1308         Needs = StateExact;
1309       else
1310         Needs = StateWQM | StateExact;
1311     }
1312 
1313     // Now, transition if necessary.
1314     if (!(Needs & State)) {
1315       MachineBasicBlock::iterator First;
1316       if (State == StateStrictWWM || Needs == StateStrictWWM ||
1317           State == StateStrictWQM || Needs == StateStrictWQM) {
1318         // We must switch to or from Strict mode.
1319         First = FirstStrict;
1320       } else {
1321         // We only need to switch to/from WQM, so we can use FirstWQM.
1322         First = FirstWQM;
1323       }
1324 
1325       // Whether we need to save SCC depends on start and end states.
1326       bool SaveSCC = false;
1327       switch (State) {
1328       case StateExact:
1329       case StateStrictWWM:
1330       case StateStrictWQM:
1331         // Exact/Strict -> Strict: save SCC
1332         // Exact/Strict -> WQM: save SCC if WQM mask is generated from exec
1333         // Exact/Strict -> Exact: no save
1334         SaveSCC = (Needs & StateStrict) || ((Needs & StateWQM) && WQMFromExec);
1335         break;
1336       case StateWQM:
1337         // WQM -> Exact/Strict: save SCC
1338         SaveSCC = !(Needs & StateWQM);
1339         break;
1340       default:
1341         llvm_unreachable("Unknown state");
1342         break;
1343       }
1344       MachineBasicBlock::iterator Before =
1345           prepareInsertion(MBB, First, II, Needs == StateWQM, SaveSCC);
1346 
1347       if (State & StateStrict) {
1348         assert(State == StateStrictWWM || State == StateStrictWQM);
1349         assert(SavedNonStrictReg);
1350         fromStrictMode(MBB, Before, SavedNonStrictReg, NonStrictState, State);
1351 
1352         LIS->createAndComputeVirtRegInterval(SavedNonStrictReg);
1353         SavedNonStrictReg = 0;
1354         State = NonStrictState;
1355       }
1356 
1357       if (Needs & StateStrict) {
1358         NonStrictState = State;
1359         assert(Needs == StateStrictWWM || Needs == StateStrictWQM);
1360         assert(!SavedNonStrictReg);
1361         SavedNonStrictReg = MRI->createVirtualRegister(BoolRC);
1362 
1363         toStrictMode(MBB, Before, SavedNonStrictReg, Needs);
1364         State = Needs;
1365 
1366       } else {
1367         if (State == StateWQM && (Needs & StateExact) && !(Needs & StateWQM)) {
1368           if (!WQMFromExec && (OutNeeds & StateWQM)) {
1369             assert(!SavedWQMReg);
1370             SavedWQMReg = MRI->createVirtualRegister(BoolRC);
1371           }
1372 
1373           toExact(MBB, Before, SavedWQMReg);
1374           State = StateExact;
1375         } else if (State == StateExact && (Needs & StateWQM) &&
1376                    !(Needs & StateExact)) {
1377           assert(WQMFromExec == (SavedWQMReg == 0));
1378 
1379           toWQM(MBB, Before, SavedWQMReg);
1380 
1381           if (SavedWQMReg) {
1382             LIS->createAndComputeVirtRegInterval(SavedWQMReg);
1383             SavedWQMReg = 0;
1384           }
1385           State = StateWQM;
1386         } else {
1387           // We can get here if we transitioned from StrictWWM to a
1388           // non-StrictWWM state that already matches our needs, but we
1389           // shouldn't need to do anything.
1390           assert(Needs & State);
1391         }
1392       }
1393     }
1394 
1395     if (Needs != (StateExact | StateWQM | StateStrict)) {
1396       if (Needs != (StateExact | StateWQM))
1397         FirstWQM = IE;
1398       FirstStrict = IE;
1399     }
1400 
1401     if (II == IE)
1402       break;
1403 
1404     II = Next;
1405   }
1406   assert(!SavedWQMReg);
1407   assert(!SavedNonStrictReg);
1408 }
1409 
1410 void SIWholeQuadMode::lowerLiveMaskQueries() {
1411   for (MachineInstr *MI : LiveMaskQueries) {
1412     const DebugLoc &DL = MI->getDebugLoc();
1413     Register Dest = MI->getOperand(0).getReg();
1414 
1415     MachineInstr *Copy =
1416         BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
1417             .addReg(LiveMaskReg);
1418 
1419     LIS->ReplaceMachineInstrInMaps(*MI, *Copy);
1420     MI->eraseFromParent();
1421   }
1422 }
1423 
1424 void SIWholeQuadMode::lowerCopyInstrs() {
1425   for (MachineInstr *MI : LowerToMovInstrs) {
1426     assert(MI->getNumExplicitOperands() == 2);
1427 
1428     const Register Reg = MI->getOperand(0).getReg();
1429     const unsigned SubReg = MI->getOperand(0).getSubReg();
1430 
1431     if (TRI->isVGPR(*MRI, Reg)) {
1432       const TargetRegisterClass *regClass =
1433           Reg.isVirtual() ? MRI->getRegClass(Reg) : TRI->getPhysRegClass(Reg);
1434       if (SubReg)
1435         regClass = TRI->getSubRegClass(regClass, SubReg);
1436 
1437       const unsigned MovOp = TII->getMovOpcode(regClass);
1438       MI->setDesc(TII->get(MovOp));
1439 
1440       // Check that it already implicitly depends on exec (like all VALU movs
1441       // should do).
1442       assert(any_of(MI->implicit_operands(), [](const MachineOperand &MO) {
1443         return MO.isUse() && MO.getReg() == AMDGPU::EXEC;
1444       }));
1445     } else {
1446       // Remove early-clobber and exec dependency from simple SGPR copies.
1447       // This allows some to be eliminated during/post RA.
1448       LLVM_DEBUG(dbgs() << "simplify SGPR copy: " << *MI);
1449       if (MI->getOperand(0).isEarlyClobber()) {
1450         LIS->removeInterval(Reg);
1451         MI->getOperand(0).setIsEarlyClobber(false);
1452         LIS->createAndComputeVirtRegInterval(Reg);
1453       }
1454       int Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC);
1455       while (Index >= 0) {
1456         MI->RemoveOperand(Index);
1457         Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC);
1458       }
1459       MI->setDesc(TII->get(AMDGPU::COPY));
1460       LLVM_DEBUG(dbgs() << "  -> " << *MI);
1461     }
1462   }
1463   for (MachineInstr *MI : LowerToCopyInstrs) {
1464     if (MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32 ||
1465         MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B64) {
1466       assert(MI->getNumExplicitOperands() == 3);
1467       // the only reason we should be here is V_SET_INACTIVE has
1468       // an undef input so it is being replaced by a simple copy.
1469       // There should be a second undef source that we should remove.
1470       assert(MI->getOperand(2).isUndef());
1471       MI->RemoveOperand(2);
1472       MI->untieRegOperand(1);
1473     } else {
1474       assert(MI->getNumExplicitOperands() == 2);
1475     }
1476 
1477     MI->setDesc(TII->get(AMDGPU::COPY));
1478   }
1479 }
1480 
1481 void SIWholeQuadMode::lowerKillInstrs(bool IsWQM) {
1482   for (MachineInstr *MI : KillInstrs) {
1483     MachineBasicBlock *MBB = MI->getParent();
1484     MachineInstr *SplitPoint = nullptr;
1485     switch (MI->getOpcode()) {
1486     case AMDGPU::SI_DEMOTE_I1:
1487     case AMDGPU::SI_KILL_I1_TERMINATOR:
1488       SplitPoint = lowerKillI1(*MBB, *MI, IsWQM);
1489       break;
1490     case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
1491       SplitPoint = lowerKillF32(*MBB, *MI);
1492       break;
1493     default:
1494       continue;
1495     }
1496     if (SplitPoint)
1497       splitBlock(MBB, SplitPoint);
1498   }
1499 }
1500 
1501 bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
1502   LLVM_DEBUG(dbgs() << "SI Whole Quad Mode on " << MF.getName()
1503                     << " ------------- \n");
1504   LLVM_DEBUG(MF.dump(););
1505 
1506   Instructions.clear();
1507   Blocks.clear();
1508   LiveMaskQueries.clear();
1509   LowerToCopyInstrs.clear();
1510   LowerToMovInstrs.clear();
1511   KillInstrs.clear();
1512   StateTransition.clear();
1513 
1514   ST = &MF.getSubtarget<GCNSubtarget>();
1515 
1516   TII = ST->getInstrInfo();
1517   TRI = &TII->getRegisterInfo();
1518   MRI = &MF.getRegInfo();
1519   LIS = &getAnalysis<LiveIntervals>();
1520   MDT = &getAnalysis<MachineDominatorTree>();
1521   PDT = &getAnalysis<MachinePostDominatorTree>();
1522 
1523   if (ST->isWave32()) {
1524     AndOpc = AMDGPU::S_AND_B32;
1525     AndN2Opc = AMDGPU::S_ANDN2_B32;
1526     XorOpc = AMDGPU::S_XOR_B32;
1527     AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
1528     OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B32;
1529     WQMOpc = AMDGPU::S_WQM_B32;
1530     Exec = AMDGPU::EXEC_LO;
1531   } else {
1532     AndOpc = AMDGPU::S_AND_B64;
1533     AndN2Opc = AMDGPU::S_ANDN2_B64;
1534     XorOpc = AMDGPU::S_XOR_B64;
1535     AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
1536     OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B64;
1537     WQMOpc = AMDGPU::S_WQM_B64;
1538     Exec = AMDGPU::EXEC;
1539   }
1540 
1541   const char GlobalFlags = analyzeFunction(MF);
1542   const bool NeedsLiveMask = !(KillInstrs.empty() && LiveMaskQueries.empty());
1543 
1544   LiveMaskReg = Exec;
1545 
1546   // Shader is simple does not need any state changes or any complex lowering
1547   if (!(GlobalFlags & (StateWQM | StateStrict)) && LowerToCopyInstrs.empty() &&
1548       LowerToMovInstrs.empty() && KillInstrs.empty()) {
1549     lowerLiveMaskQueries();
1550     return !LiveMaskQueries.empty();
1551   }
1552 
1553   MachineBasicBlock &Entry = MF.front();
1554   MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI();
1555 
1556   // Store a copy of the original live mask when required
1557   if (NeedsLiveMask || (GlobalFlags & StateWQM)) {
1558     LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC());
1559     MachineInstr *MI =
1560         BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::COPY), LiveMaskReg)
1561             .addReg(Exec);
1562     LIS->InsertMachineInstrInMaps(*MI);
1563   }
1564 
1565   LLVM_DEBUG(printInfo());
1566 
1567   lowerLiveMaskQueries();
1568   lowerCopyInstrs();
1569 
1570   // Shader only needs WQM
1571   if (GlobalFlags == StateWQM) {
1572     auto MI = BuildMI(Entry, EntryMI, DebugLoc(), TII->get(WQMOpc), Exec)
1573                   .addReg(Exec);
1574     LIS->InsertMachineInstrInMaps(*MI);
1575     lowerKillInstrs(true);
1576   } else {
1577     for (auto BII : Blocks)
1578       processBlock(*BII.first, BII.first == &Entry);
1579     // Lowering blocks causes block splitting so perform as a second pass.
1580     for (auto BII : Blocks)
1581       lowerBlock(*BII.first);
1582   }
1583 
1584   // Compute live range for live mask
1585   if (LiveMaskReg != Exec)
1586     LIS->createAndComputeVirtRegInterval(LiveMaskReg);
1587 
1588   // Physical registers like SCC aren't tracked by default anyway, so just
1589   // removing the ranges we computed is the simplest option for maintaining
1590   // the analysis results.
1591   LIS->removeRegUnit(*MCRegUnitIterator(MCRegister::from(AMDGPU::SCC), TRI));
1592 
1593   // If we performed any kills then recompute EXEC
1594   if (!KillInstrs.empty())
1595     LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::EXEC, TRI));
1596 
1597   return true;
1598 }
1599