//===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass adds instructions to enable whole quad mode for pixel
/// shaders, and whole wavefront mode for all programs.
///
/// Whole quad mode is required for derivative computations, but it interferes
/// with shader side effects (stores and atomics). This pass is run on the
/// scheduled machine IR but before register coalescing, so that machine SSA is
/// available for analysis. It ensures that WQM is enabled when necessary, but
/// disabled around stores and atomics.
///
/// When necessary, this pass creates a function prolog
///
///   S_MOV_B64 LiveMask, EXEC
///   S_WQM_B64 EXEC, EXEC
///
/// to enter WQM at the top of the function and surrounds blocks of Exact
/// instructions by
///
///   S_AND_SAVEEXEC_B64 Tmp, LiveMask
///   ...
///   S_MOV_B64 EXEC, Tmp
///
/// We also compute when a sequence of instructions requires Whole Wavefront
/// Mode (WWM) and insert instructions to save and restore it:
///
///   S_OR_SAVEEXEC_B64 Tmp, -1
///   ...
///   S_MOV_B64 EXEC, Tmp
///
/// In order to avoid excessive switching during sequences of Exact
/// instructions, the pass first analyzes which instructions must be run in WQM
/// (aka which instructions produce values that lead to derivative
/// computations).
///
/// Basic blocks are always exited in WQM as long as some successor needs WQM.
///
/// There is room for improvement given better control flow analysis:
///
///  (1) at the top level (outside of control flow statements, and as long as
///      kill hasn't been used), one SGPR can be saved by recovering WQM from
///      the LiveMask (this is implemented for the entry block).
///
///  (2) when entire regions (e.g. if-else blocks or entire loops) only
///      consist of exact and don't-care instructions, the switch only has to
///      be done at the entry and exit points rather than potentially in each
///      block of the region.
///
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

#define DEBUG_TYPE "si-wqm"

namespace {

enum {
  StateWQM = 0x1,
  StateWWM = 0x2,
  StateExact = 0x4,
};
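
// The states are bitmasks so that they can be combined: e.g. processBlock
// uses Needs = StateExact | StateWQM to mean "either mode is acceptable
// here", and only inserts a transition when the current state is not in
// Needs.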

struct PrintState {
public:
  int State;

  explicit PrintState(int State) : State(State) {}
};

#ifndef NDEBUG
static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {
  if (PS.State & StateWQM)
    OS << "WQM";
  if (PS.State & StateWWM) {
    if (PS.State & StateWQM)
      OS << '|';
    OS << "WWM";
  }
  if (PS.State & StateExact) {
    if (PS.State & (StateWQM | StateWWM))
      OS << '|';
    OS << "Exact";
  }

  return OS;
}
#endif

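// Per-instruction analysis state: Needs holds the execution mode(s) the
// instruction requires, Disabled the mode(s) it must not run in, and
// OutNeeds the mode(s) required after it (propagated backwards).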
struct InstrInfo {
  char Needs = 0;
  char Disabled = 0;
  char OutNeeds = 0;
};

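// Per-block analysis state; InNeeds/OutNeeds are the modes required at block
// entry and exit respectively.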
struct BlockInfo {
  char Needs = 0;
  char InNeeds = 0;
  char OutNeeds = 0;
};

struct WorkItem {
  MachineBasicBlock *MBB = nullptr;
  MachineInstr *MI = nullptr;

  WorkItem() = default;
  WorkItem(MachineBasicBlock *MBB) : MBB(MBB) {}
  WorkItem(MachineInstr *MI) : MI(MI) {}
};

class SIWholeQuadMode : public MachineFunctionPass {
private:
  CallingConv::ID CallingConv;
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;
  const GCNSubtarget *ST;
  MachineRegisterInfo *MRI;
  LiveIntervals *LIS;

  unsigned AndOpc;
  unsigned XorTermOpc;
  unsigned OrSaveExecOpc;
  unsigned Exec;

  DenseMap<const MachineInstr *, InstrInfo> Instructions;
  MapVector<MachineBasicBlock *, BlockInfo> Blocks;
  SmallVector<MachineInstr *, 1> LiveMaskQueries;
  SmallVector<MachineInstr *, 4> LowerToMovInstrs;
  SmallVector<MachineInstr *, 4> LowerToCopyInstrs;

  void printInfo();

  void markInstruction(MachineInstr &MI, char Flag,
                       std::vector<WorkItem> &Worklist);
  void markDefs(const MachineInstr &UseMI, LiveRange &LR, Register Reg,
                unsigned SubReg, char Flag, std::vector<WorkItem> &Worklist);
  void markInstructionUses(const MachineInstr &MI, char Flag,
                           std::vector<WorkItem> &Worklist);
  char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
  void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
  void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
  char analyzeFunction(MachineFunction &MF);

  MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB,
                                      MachineBasicBlock::iterator Before);
  MachineBasicBlock::iterator
  prepareInsertion(MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
                   MachineBasicBlock::iterator Last, bool PreferLast,
                   bool SaveSCC);
  void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
               unsigned SaveWQM, unsigned LiveMaskReg);
  void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
             unsigned SavedWQM);
  void toWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
             unsigned SaveOrig);
  void fromWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
               unsigned SavedOrig);
  void processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, bool isEntry);

  void lowerLiveMaskQueries(unsigned LiveMaskReg);
  void lowerCopyInstrs();

public:
  static char ID;

  SIWholeQuadMode() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Whole Quad Mode"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<LiveIntervals>();
    AU.addPreserved<SlotIndexes>();
    AU.addPreserved<LiveIntervals>();
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // end anonymous namespace

char SIWholeQuadMode::ID = 0;

INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
                    false)

char &llvm::SIWholeQuadModeID = SIWholeQuadMode::ID;

FunctionPass *llvm::createSIWholeQuadModePass() {
  return new SIWholeQuadMode;
}

#ifndef NDEBUG
LLVM_DUMP_METHOD void SIWholeQuadMode::printInfo() {
  for (const auto &BII : Blocks) {
    dbgs() << "\n"
           << printMBBReference(*BII.first) << ":\n"
           << "  InNeeds = " << PrintState(BII.second.InNeeds)
           << ", Needs = " << PrintState(BII.second.Needs)
           << ", OutNeeds = " << PrintState(BII.second.OutNeeds) << "\n\n";

    for (const MachineInstr &MI : *BII.first) {
      auto III = Instructions.find(&MI);
      if (III == Instructions.end())
        continue;

      dbgs() << "  " << MI << "    Needs = " << PrintState(III->second.Needs)
             << ", OutNeeds = " << PrintState(III->second.OutNeeds) << '\n';
    }
  }
}
#endif

void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
                                      std::vector<WorkItem> &Worklist) {
  InstrInfo &II = Instructions[&MI];

  assert(!(Flag & StateExact) && Flag != 0);

  LLVM_DEBUG(dbgs() << "markInstruction " << PrintState(Flag) << ": " << MI);

  // Remove any disabled states from the flag. The user that required it gets
  // an undefined value in the helper lanes. For example, this can happen if
  // the result of an atomic is used by an instruction that requires WQM, where
  // ignoring the request for WQM is correct as per the relevant specs.
  Flag &= ~II.Disabled;

  // Ignore if the flag is already encompassed by the existing needs, or we
  // just disabled everything.
  if ((II.Needs & Flag) == Flag)
    return;

  II.Needs |= Flag;
  Worklist.push_back(&MI);
}

/// Mark all relevant definitions of register \p Reg in usage \p UseMI.
void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR,
                               Register Reg, unsigned SubReg, char Flag,
                               std::vector<WorkItem> &Worklist) {
  assert(!MRI->isSSA());

  LLVM_DEBUG(dbgs() << "markDefs " << PrintState(Flag) << ": " << UseMI);

  LiveQueryResult UseLRQ = LR.Query(LIS->getInstructionIndex(UseMI));
  if (!UseLRQ.valueIn())
    return;

  SmallPtrSet<const VNInfo *, 4> Visited;
  SmallVector<const VNInfo *, 4> ToProcess;
  ToProcess.push_back(UseLRQ.valueIn());
  do {
    const VNInfo *Value = ToProcess.pop_back_val();
    Visited.insert(Value);

    if (Value->isPHIDef()) {
      // Need to mark all defs used in the PHI node
      const MachineBasicBlock *MBB = LIS->getMBBFromIndex(Value->def);
      assert(MBB && "Phi-def has no defining MBB");
      for (MachineBasicBlock::const_pred_iterator PI = MBB->pred_begin(),
                                                  PE = MBB->pred_end();
           PI != PE; ++PI) {
        if (const VNInfo *VN = LR.getVNInfoBefore(LIS->getMBBEndIdx(*PI))) {
          if (!Visited.count(VN))
            ToProcess.push_back(VN);
        }
      }
    } else {
      MachineInstr *MI = LIS->getInstructionFromIndex(Value->def);
      assert(MI && "Def has no defining instruction");
      markInstruction(*MI, Flag, Worklist);

      // Iterate over all operands to find relevant definitions
      for (const MachineOperand &Op : MI->operands()) {
        if (!(Op.isReg() && Op.getReg() == Reg))
          continue;

        // Does this def cover the whole register?
        bool DefinesFullReg =
            Op.isUndef() || !Op.getSubReg() || Op.getSubReg() == SubReg;
        if (!DefinesFullReg) {
          // Partial definition; need to follow and mark input value
          LiveQueryResult LRQ = LR.Query(LIS->getInstructionIndex(*MI));
          if (const VNInfo *VN = LRQ.valueIn()) {
            if (!Visited.count(VN))
              ToProcess.push_back(VN);
          }
        }
      }
    }
  } while (!ToProcess.empty());
}
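
// Illustrative sketch of the partial-def handling above (register names are
// made up, not real MIR):
//
//   %0.sub0 = V_MOV_B32 ...   ; partial def, reached via the incoming value
//   %0.sub1 = V_MOV_B32 ...   ; def found through the live range at the use
//   ... = V_ADD_F32 %0, ...   ; use that needs e.g. WQM
//
// markDefs marks both V_MOV_B32s: the second def directly, and the first by
// following the value live into the partial definition.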

/// Mark all instructions defining the uses in \p MI with \p Flag.
void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
                                          std::vector<WorkItem> &Worklist) {

  LLVM_DEBUG(dbgs() << "markInstructionUses " << PrintState(Flag) << ": "
                    << MI);

  for (const MachineOperand &Use : MI.uses()) {
    if (!Use.isReg() || !Use.isUse())
      continue;

    Register Reg = Use.getReg();

    // Handle physical registers that we need to track; this is mostly relevant
    // for VCC, which can appear as the (implicit) input of a uniform branch,
    // e.g. when a loop counter is stored in a VGPR.
    if (!Reg.isVirtual()) {
      if (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO)
        continue;

      for (MCRegUnitIterator RegUnit(Reg.asMCReg(), TRI); RegUnit.isValid();
           ++RegUnit) {
        LiveRange &LR = LIS->getRegUnit(*RegUnit);
        const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
        if (!Value)
          continue;

        if (MRI->isSSA()) {
          // Since we're in machine SSA, we do not need to track physical
          // registers across basic blocks.
          if (Value->isPHIDef())
            continue;
          markInstruction(*LIS->getInstructionFromIndex(Value->def), Flag,
                          Worklist);
        } else {
          markDefs(MI, LR, *RegUnit, AMDGPU::NoSubRegister, Flag, Worklist);
        }
      }

      continue;
    }

    if (MRI->isSSA()) {
      for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg()))
        markInstruction(DefMI, Flag, Worklist);
    } else {
      LiveRange &LR = LIS->getInterval(Reg);
      markDefs(MI, LR, Reg, Use.getSubReg(), Flag, Worklist);
    }
  }
}

// Scan instructions to determine which ones require an Exact execmask and
// which ones seed WQM requirements.
char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
                                       std::vector<WorkItem> &Worklist) {
  char GlobalFlags = 0;
  bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs");
  SmallVector<MachineInstr *, 4> SetInactiveInstrs;
  SmallVector<MachineInstr *, 4> SoftWQMInstrs;

  // We need to visit the basic blocks in reverse post-order so that we visit
  // defs before uses, in particular so that we don't accidentally mark an
  // instruction as needing e.g. WQM before visiting it and realizing it needs
  // WQM disabled.
  ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
  for (auto BI = RPOT.begin(), BE = RPOT.end(); BI != BE; ++BI) {
    MachineBasicBlock &MBB = **BI;
    BlockInfo &BBI = Blocks[&MBB];

    for (auto II = MBB.begin(), IE = MBB.end(); II != IE; ++II) {
      MachineInstr &MI = *II;
      InstrInfo &III = Instructions[&MI];
      unsigned Opcode = MI.getOpcode();
      char Flags = 0;

      if (TII->isWQM(Opcode)) {
        // Sampling instructions don't need to produce results for all pixels
        // in a quad, they just require all inputs of a quad to have been
        // computed for derivatives.
        markInstructionUses(MI, StateWQM, Worklist);
        GlobalFlags |= StateWQM;
        continue;
      } else if (Opcode == AMDGPU::WQM) {
        // The WQM intrinsic requires its output to have all the helper lanes
        // correct, so we need it to be in WQM.
        Flags = StateWQM;
        LowerToCopyInstrs.push_back(&MI);
      } else if (Opcode == AMDGPU::SOFT_WQM) {
        LowerToCopyInstrs.push_back(&MI);
        SoftWQMInstrs.push_back(&MI);
        continue;
      } else if (Opcode == AMDGPU::WWM) {
        // The WWM intrinsic doesn't make the same guarantee, and furthermore
        // it needs to be executed in WQM or Exact so that its copy doesn't
        // clobber inactive lanes.
        markInstructionUses(MI, StateWWM, Worklist);
        GlobalFlags |= StateWWM;
        LowerToMovInstrs.push_back(&MI);
        continue;
      } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 ||
                 Opcode == AMDGPU::V_SET_INACTIVE_B64) {
        III.Disabled = StateWWM;
        MachineOperand &Inactive = MI.getOperand(2);
        if (Inactive.isReg()) {
          if (Inactive.isUndef()) {
            LowerToCopyInstrs.push_back(&MI);
          } else {
            Register Reg = Inactive.getReg();
            if (Reg.isVirtual()) {
              for (MachineInstr &DefMI : MRI->def_instructions(Reg))
                markInstruction(DefMI, StateWWM, Worklist);
            }
          }
        }
        SetInactiveInstrs.push_back(&MI);
        continue;
      } else if (TII->isDisableWQM(MI)) {
        BBI.Needs |= StateExact;
        if (!(BBI.InNeeds & StateExact)) {
          BBI.InNeeds |= StateExact;
          Worklist.push_back(&MBB);
        }
        GlobalFlags |= StateExact;
        III.Disabled = StateWQM | StateWWM;
        continue;
      } else {
        if (Opcode == AMDGPU::SI_PS_LIVE) {
          LiveMaskQueries.push_back(&MI);
        } else if (WQMOutputs) {
          // The function is in machine SSA form, which means that physical
          // VGPRs correspond to shader inputs and outputs. Inputs are
          // only used, outputs are only defined.
          for (const MachineOperand &MO : MI.defs()) {
            if (!MO.isReg())
              continue;

            Register Reg = MO.getReg();

            if (!Reg.isVirtual() &&
                TRI->hasVectorRegisters(TRI->getPhysRegClass(Reg))) {
              Flags = StateWQM;
              break;
            }
          }
        }

        if (!Flags)
          continue;
      }

      markInstruction(MI, Flags, Worklist);
      GlobalFlags |= Flags;
    }
  }

  // Make sure that any SET_INACTIVE instructions are computed in WQM if WQM is
  // ever used anywhere in the function. This implements the corresponding
  // semantics of @llvm.amdgcn.set.inactive.
  // Similarly for SOFT_WQM instructions, implementing @llvm.amdgcn.softwqm.
  if (GlobalFlags & StateWQM) {
    for (MachineInstr *MI : SetInactiveInstrs)
      markInstruction(*MI, StateWQM, Worklist);
    for (MachineInstr *MI : SoftWQMInstrs)
      markInstruction(*MI, StateWQM, Worklist);
  }

  return GlobalFlags;
}

void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
                                           std::vector<WorkItem> &Worklist) {
  MachineBasicBlock *MBB = MI.getParent();
  // Take a copy to prevent dangling references.
  InstrInfo II = Instructions[&MI];
  BlockInfo &BI = Blocks[MBB];

  // Control flow-type instructions and stores to temporary memory that are
  // followed by WQM computations must themselves be in WQM.
  if ((II.OutNeeds & StateWQM) && !(II.Disabled & StateWQM) &&
      (MI.isTerminator() || (TII->usesVM_CNT(MI) && MI.mayStore()))) {
    Instructions[&MI].Needs = StateWQM;
    II.Needs = StateWQM;
  }
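
  // For example, a scratch store that spills a value which is later reloaded
  // and fed into a derivative computation must itself run in WQM; otherwise
  // the helper lanes would read back stale or uninitialized data.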

  // Propagate to block level
  if (II.Needs & StateWQM) {
    BI.Needs |= StateWQM;
    if (!(BI.InNeeds & StateWQM)) {
      BI.InNeeds |= StateWQM;
      Worklist.push_back(MBB);
    }
  }

  // Propagate backwards within block
  if (MachineInstr *PrevMI = MI.getPrevNode()) {
    char InNeeds = (II.Needs & ~StateWWM) | II.OutNeeds;
    if (!PrevMI->isPHI()) {
      InstrInfo &PrevII = Instructions[PrevMI];
      if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
        PrevII.OutNeeds |= InNeeds;
        Worklist.push_back(PrevMI);
      }
    }
  }

  // Propagate WQM flag to instruction inputs
  assert(!(II.Needs & StateExact));

  if (II.Needs != 0)
    markInstructionUses(MI, II.Needs, Worklist);

  // Ensure we process a block containing WWM, even if it does not require any
  // WQM transitions.
  if (II.Needs & StateWWM)
    BI.Needs |= StateWWM;
}

void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
                                     std::vector<WorkItem> &Worklist) {
  BlockInfo BI = Blocks[&MBB]; // Make a copy to prevent dangling references.

  // Propagate through instructions
  if (!MBB.empty()) {
    MachineInstr *LastMI = &*MBB.rbegin();
    InstrInfo &LastII = Instructions[LastMI];
    if ((LastII.OutNeeds | BI.OutNeeds) != LastII.OutNeeds) {
      LastII.OutNeeds |= BI.OutNeeds;
      Worklist.push_back(LastMI);
    }
  }

  // Predecessor blocks must provide for our WQM/Exact needs.
  for (MachineBasicBlock *Pred : MBB.predecessors()) {
    BlockInfo &PredBI = Blocks[Pred];
    if ((PredBI.OutNeeds | BI.InNeeds) == PredBI.OutNeeds)
      continue;

    PredBI.OutNeeds |= BI.InNeeds;
    PredBI.InNeeds |= BI.InNeeds;
    Worklist.push_back(Pred);
  }

  // All successors must be prepared to accept the same set of WQM/Exact data.
  for (MachineBasicBlock *Succ : MBB.successors()) {
    BlockInfo &SuccBI = Blocks[Succ];
    if ((SuccBI.InNeeds | BI.OutNeeds) == SuccBI.InNeeds)
      continue;

    SuccBI.InNeeds |= BI.OutNeeds;
    Worklist.push_back(Succ);
  }
}

char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
  std::vector<WorkItem> Worklist;
  char GlobalFlags = scanInstructions(MF, Worklist);

  while (!Worklist.empty()) {
    WorkItem WI = Worklist.back();
    Worklist.pop_back();

    if (WI.MI)
      propagateInstruction(*WI.MI, Worklist);
    else
      propagateBlock(*WI.MBB, Worklist);
  }

  return GlobalFlags;
}

MachineBasicBlock::iterator
SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB,
                         MachineBasicBlock::iterator Before) {
  Register SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);

  MachineInstr *Save =
      BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), SaveReg)
          .addReg(AMDGPU::SCC);
  MachineInstr *Restore =
      BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), AMDGPU::SCC)
          .addReg(SaveReg);

  LIS->InsertMachineInstrInMaps(*Save);
  LIS->InsertMachineInstrInMaps(*Restore);
  LIS->createAndComputeVirtRegInterval(SaveReg);

  return Restore;
}

// Return an iterator in the (inclusive) range [First, Last] at which
// instructions can be safely inserted, keeping in mind that some of the
// instructions we want to add necessarily clobber SCC.
MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
    MachineBasicBlock::iterator Last, bool PreferLast, bool SaveSCC) {
  if (!SaveSCC)
    return PreferLast ? Last : First;

  LiveRange &LR =
      LIS->getRegUnit(*MCRegUnitIterator(MCRegister::from(AMDGPU::SCC), TRI));
  auto MBBE = MBB.end();
  SlotIndex FirstIdx = First != MBBE ? LIS->getInstructionIndex(*First)
                                     : LIS->getMBBEndIdx(&MBB);
  SlotIndex LastIdx =
      Last != MBBE ? LIS->getInstructionIndex(*Last) : LIS->getMBBEndIdx(&MBB);
  SlotIndex Idx = PreferLast ? LastIdx : FirstIdx;
  const LiveRange::Segment *S;

  for (;;) {
    S = LR.getSegmentContaining(Idx);
    if (!S)
      break;

    if (PreferLast) {
      SlotIndex Next = S->start.getBaseIndex();
      if (Next < FirstIdx)
        break;
      Idx = Next;
    } else {
      MachineInstr *EndMI = LIS->getInstructionFromIndex(S->end.getBaseIndex());
      assert(EndMI && "Segment does not end on valid instruction");
      auto NextI = std::next(EndMI->getIterator());
      if (NextI == MBB.end())
        break;
      SlotIndex Next = LIS->getInstructionIndex(*NextI);
      if (Next > LastIdx)
        break;
      Idx = Next;
    }
  }

  MachineBasicBlock::iterator MBBI;

  if (MachineInstr *MI = LIS->getInstructionFromIndex(Idx))
    MBBI = MI;
  else {
    assert(Idx == LIS->getMBBEndIdx(&MBB));
    MBBI = MBB.end();
  }

  // Move insertion point past any operations modifying EXEC.
  // This assumes that the value of SCC defined by any of these operations
  // does not need to be preserved.
  while (MBBI != Last) {
    bool IsExecDef = false;
    for (const MachineOperand &MO : MBBI->operands()) {
      if (MO.isReg() && MO.isDef()) {
        IsExecDef |=
            MO.getReg() == AMDGPU::EXEC_LO || MO.getReg() == AMDGPU::EXEC;
      }
    }
    if (!IsExecDef)
      break;
    MBBI++;
    S = nullptr;
  }

  if (S)
    MBBI = saveSCC(MBB, MBBI);

  return MBBI;
}

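// Switch EXEC to Exact mode, i.e. restrict it to the lanes that are actually
// live. With a SaveWQM register the previous (WQM) exec mask is preserved so
// it can be restored later; the emitted sequence is roughly (wave64 shown,
// wave32 uses the *_B32 forms):
//
//   S_AND_SAVEEXEC_B64 SaveWQM, LiveMaskReg    ; if WQM must be re-entered
// or
//   S_AND_B64 EXEC, EXEC, LiveMaskReg          ; if it need not be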
void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator Before,
                              unsigned SaveWQM, unsigned LiveMaskReg) {
  MachineInstr *MI;

  if (SaveWQM) {
    MI = BuildMI(MBB, Before, DebugLoc(),
                 TII->get(ST->isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
                                         : AMDGPU::S_AND_SAVEEXEC_B64),
                 SaveWQM)
             .addReg(LiveMaskReg);
  } else {
    unsigned Exec = ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
    MI = BuildMI(MBB, Before, DebugLoc(),
                 TII->get(ST->isWave32() ? AMDGPU::S_AND_B32
                                         : AMDGPU::S_AND_B64),
                 Exec)
             .addReg(Exec)
             .addReg(LiveMaskReg);
  }

  LIS->InsertMachineInstrInMaps(*MI);
}

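// Switch EXEC back to WQM. If the WQM mask was saved by toExact, restore it
// with a copy; otherwise recompute it from the current EXEC:
//
//   EXEC = COPY SavedWQM                       ; if a saved mask exists
// or
//   S_WQM_B64 EXEC, EXEC                       ; recover WQM from EXEC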
void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator Before,
                            unsigned SavedWQM) {
  MachineInstr *MI;

  unsigned Exec = ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  if (SavedWQM) {
    MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), Exec)
             .addReg(SavedWQM);
  } else {
    MI = BuildMI(MBB, Before, DebugLoc(),
                 TII->get(ST->isWave32() ? AMDGPU::S_WQM_B32
                                         : AMDGPU::S_WQM_B64),
                 Exec)
             .addReg(Exec);
  }

  LIS->InsertMachineInstrInMaps(*MI);
}

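// Enter WWM by saving the current EXEC into SaveOrig and enabling all lanes.
// ENTER_WWM is a pseudo that is later expanded to the S_OR_SAVEEXEC sequence
// shown in the file header comment.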
void SIWholeQuadMode::toWWM(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator Before,
                            unsigned SaveOrig) {
  MachineInstr *MI;

  assert(SaveOrig);
  MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_WWM), SaveOrig)
           .addImm(-1);
  LIS->InsertMachineInstrInMaps(*MI);
}

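// Leave WWM by restoring the saved EXEC. EXIT_WWM is a pseudo kept distinct
// from a plain COPY so the preceding instructions are still treated as WWM
// during register allocation; afterwards it becomes a simple move to EXEC.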
void SIWholeQuadMode::fromWWM(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator Before,
                              unsigned SavedOrig) {
  MachineInstr *MI;

  assert(SavedOrig);
  MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_WWM),
               ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC)
           .addReg(SavedOrig);
  LIS->InsertMachineInstrInMaps(*MI);
}

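// Walk the instructions of a block and insert the state switches computed by
// the analysis. For example (sketch, not real MIR), a block of the form
//
//   IMAGE_SAMPLE ...   ; needs WQM
//   BUFFER_STORE ...   ; needs Exact
//
// gets a single WQM->Exact switch between the two instructions, placed by
// prepareInsertion at a point where SCC is dead whenever possible.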
void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
                                   bool isEntry) {
  auto BII = Blocks.find(&MBB);
  if (BII == Blocks.end())
    return;

  const BlockInfo &BI = BII->second;

  // This is a non-entry block that is WQM throughout, so no need to do
  // anything.
  if (!isEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact)
    return;

  LLVM_DEBUG(dbgs() << "\nProcessing block " << printMBBReference(MBB)
                    << ":\n");

  unsigned SavedWQMReg = 0;
  unsigned SavedNonWWMReg = 0;
  bool WQMFromExec = isEntry;
  char State = (isEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM;
  char NonWWMState = 0;
  const TargetRegisterClass *BoolRC = TRI->getBoolRC();

  auto II = MBB.getFirstNonPHI(), IE = MBB.end();
  if (isEntry) {
    // Skip the instruction that saves LiveMask
    if (II != IE && II->getOpcode() == AMDGPU::COPY)
      ++II;
  }

  // This stores the first instruction where it's safe to switch from WQM to
  // Exact or vice versa.
  MachineBasicBlock::iterator FirstWQM = IE;

  // This stores the first instruction where it's safe to switch from WWM to
  // Exact/WQM or to switch to WWM. It must always be the same as, or after,
  // FirstWQM since if it's safe to switch to/from WWM, it must be safe to
  // switch to/from WQM as well.
  MachineBasicBlock::iterator FirstWWM = IE;

  for (;;) {
    MachineBasicBlock::iterator Next = II;
    char Needs = StateExact | StateWQM; // WWM is disabled by default
    char OutNeeds = 0;

    if (FirstWQM == IE)
      FirstWQM = II;

    if (FirstWWM == IE)
      FirstWWM = II;

    // First, figure out the allowed states (Needs) based on the propagated
    // flags.
    if (II != IE) {
      MachineInstr &MI = *II;

      if (MI.isTerminator() || TII->mayReadEXEC(*MRI, MI)) {
        auto III = Instructions.find(&MI);
        if (III != Instructions.end()) {
          if (III->second.Needs & StateWWM)
            Needs = StateWWM;
          else if (III->second.Needs & StateWQM)
            Needs = StateWQM;
          else
            Needs &= ~III->second.Disabled;
          OutNeeds = III->second.OutNeeds;
        }
      } else {
        // If the instruction doesn't actually need a correct EXEC, then we can
        // safely leave WWM enabled.
        Needs = StateExact | StateWQM | StateWWM;
      }

      if (MI.isTerminator() && OutNeeds == StateExact)
        Needs = StateExact;

      ++Next;
    } else {
      // End of basic block
      if (BI.OutNeeds & StateWQM)
        Needs = StateWQM;
      else if (BI.OutNeeds == StateExact)
        Needs = StateExact;
      else
        Needs = StateWQM | StateExact;
    }

    // Now, transition if necessary.
    if (!(Needs & State)) {
      MachineBasicBlock::iterator First;
      if (State == StateWWM || Needs == StateWWM) {
        // We must switch to or from WWM
        First = FirstWWM;
      } else {
        // We only need to switch to/from WQM, so we can use FirstWQM
        First = FirstWQM;
      }

      MachineBasicBlock::iterator Before =
          prepareInsertion(MBB, First, II, Needs == StateWQM,
                           Needs == StateExact || WQMFromExec);

      if (State == StateWWM) {
        assert(SavedNonWWMReg);
        fromWWM(MBB, Before, SavedNonWWMReg);
        LIS->createAndComputeVirtRegInterval(SavedNonWWMReg);
        SavedNonWWMReg = 0;
        State = NonWWMState;
      }

      if (Needs == StateWWM) {
        NonWWMState = State;
        assert(!SavedNonWWMReg);
        SavedNonWWMReg = MRI->createVirtualRegister(BoolRC);
        toWWM(MBB, Before, SavedNonWWMReg);
        State = StateWWM;
      } else {
        if (State == StateWQM && (Needs & StateExact) && !(Needs & StateWQM)) {
          if (!WQMFromExec && (OutNeeds & StateWQM)) {
            assert(!SavedWQMReg);
            SavedWQMReg = MRI->createVirtualRegister(BoolRC);
          }

          toExact(MBB, Before, SavedWQMReg, LiveMaskReg);
          State = StateExact;
        } else if (State == StateExact && (Needs & StateWQM) &&
                   !(Needs & StateExact)) {
          assert(WQMFromExec == (SavedWQMReg == 0));

          toWQM(MBB, Before, SavedWQMReg);

          if (SavedWQMReg) {
            LIS->createAndComputeVirtRegInterval(SavedWQMReg);
            SavedWQMReg = 0;
          }
          State = StateWQM;
        } else {
          // We can get here if we transitioned from WWM to a non-WWM state that
          // already matches our needs, but we shouldn't need to do anything.
          assert(Needs & State);
        }
      }
    }

    if (Needs != (StateExact | StateWQM | StateWWM)) {
      if (Needs != (StateExact | StateWQM))
        FirstWQM = IE;
      FirstWWM = IE;
    }

    if (II == IE)
      break;

    II = Next;
  }
  assert(!SavedWQMReg);
  assert(!SavedNonWWMReg);
}

void SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) {
  for (MachineInstr *MI : LiveMaskQueries) {
    const DebugLoc &DL = MI->getDebugLoc();
    Register Dest = MI->getOperand(0).getReg();

    MachineInstr *Copy =
        BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
            .addReg(LiveMaskReg);

    LIS->ReplaceMachineInstrInMaps(*MI, *Copy);
    MI->eraseFromParent();
  }
}

void SIWholeQuadMode::lowerCopyInstrs() {
  for (MachineInstr *MI : LowerToMovInstrs) {
    assert(MI->getNumExplicitOperands() == 2);

    const Register Reg = MI->getOperand(0).getReg();
    const unsigned SubReg = MI->getOperand(0).getSubReg();

    if (TRI->isVGPR(*MRI, Reg)) {
      const TargetRegisterClass *regClass =
          Reg.isVirtual() ? MRI->getRegClass(Reg) : TRI->getPhysRegClass(Reg);
      if (SubReg)
        regClass = TRI->getSubRegClass(regClass, SubReg);

      const unsigned MovOp = TII->getMovOpcode(regClass);
      MI->setDesc(TII->get(MovOp));

      // And make it implicitly depend on exec (like all VALU movs should do).
      MI->addOperand(MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
    } else if (!MRI->isSSA()) {
      // Remove early-clobber and exec dependency from simple SGPR copies.
      // This allows some to be eliminated during/post RA.
      LLVM_DEBUG(dbgs() << "simplify SGPR copy: " << *MI);
      if (MI->getOperand(0).isEarlyClobber()) {
        LIS->removeInterval(Reg);
        MI->getOperand(0).setIsEarlyClobber(false);
        LIS->createAndComputeVirtRegInterval(Reg);
      }
      int Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC);
      while (Index >= 0) {
        MI->RemoveOperand(Index);
        Index = MI->findRegisterUseOperandIdx(AMDGPU::EXEC);
      }
      MI->setDesc(TII->get(AMDGPU::COPY));
      LLVM_DEBUG(dbgs() << "  -> " << *MI);
    }
  }
  for (MachineInstr *MI : LowerToCopyInstrs) {
    if (MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32 ||
        MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B64) {
      assert(MI->getNumExplicitOperands() == 3);
      // The only reason we should be here is that V_SET_INACTIVE has an undef
      // input, so it is being replaced by a simple copy. There should be a
      // second undef source that we should remove.
      assert(MI->getOperand(2).isUndef());
      MI->RemoveOperand(2);
      MI->untieRegOperand(1);
    } else {
      assert(MI->getNumExplicitOperands() == 2);
    }

    MI->setDesc(TII->get(AMDGPU::COPY));
  }
}

bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
  Instructions.clear();
  Blocks.clear();
  LiveMaskQueries.clear();
  LowerToCopyInstrs.clear();
  LowerToMovInstrs.clear();
  CallingConv = MF.getFunction().getCallingConv();

  ST = &MF.getSubtarget<GCNSubtarget>();

  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MRI = &MF.getRegInfo();
  LIS = &getAnalysis<LiveIntervals>();

  if (ST->isWave32()) {
    AndOpc = AMDGPU::S_AND_B32;
    XorTermOpc = AMDGPU::S_XOR_B32_term;
    OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B32;
    Exec = AMDGPU::EXEC_LO;
  } else {
    AndOpc = AMDGPU::S_AND_B64;
    XorTermOpc = AMDGPU::S_XOR_B64_term;
    OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B64;
    Exec = AMDGPU::EXEC;
  }

  char GlobalFlags = analyzeFunction(MF);
  unsigned LiveMaskReg = 0;
  if (!(GlobalFlags & StateWQM)) {
    lowerLiveMaskQueries(Exec);
    if (!(GlobalFlags & StateWWM) && LowerToCopyInstrs.empty() &&
        LowerToMovInstrs.empty())
      return !LiveMaskQueries.empty();
  } else {
    // Store a copy of the original live mask when required
    MachineBasicBlock &Entry = MF.front();
    MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI();

    if (GlobalFlags & StateExact || !LiveMaskQueries.empty()) {
      LiveMaskReg = MRI->createVirtualRegister(TRI->getBoolRC());
      MachineInstr *MI = BuildMI(Entry, EntryMI, DebugLoc(),
                                 TII->get(AMDGPU::COPY), LiveMaskReg)
                             .addReg(Exec);
      LIS->InsertMachineInstrInMaps(*MI);
    }

    lowerLiveMaskQueries(LiveMaskReg);

    if (GlobalFlags == StateWQM) {
      // For a shader that needs only WQM, we can just set it once.
      auto MI = BuildMI(Entry, EntryMI, DebugLoc(),
                        TII->get(ST->isWave32() ? AMDGPU::S_WQM_B32
                                                : AMDGPU::S_WQM_B64),
                        Exec)
                    .addReg(Exec);
      LIS->InsertMachineInstrInMaps(*MI);

      lowerCopyInstrs();
      // EntryMI may become invalid here
      return true;
    }
  }

  LLVM_DEBUG(printInfo());

  lowerCopyInstrs();

  // Handle the general case
  for (auto BII : Blocks)
    processBlock(*BII.first, LiveMaskReg, BII.first == &*MF.begin());

  if (LiveMaskReg)
    LIS->createAndComputeVirtRegInterval(LiveMaskReg);

  // Physical registers like SCC aren't tracked by default anyway, so just
  // removing the ranges we computed is the simplest option for maintaining
  // the analysis results.
  LIS->removeRegUnit(*MCRegUnitIterator(MCRegister::from(AMDGPU::SCC), TRI));

  return true;
}