//===-- SILowerControlFlow.cpp - Use predicates for control flow ----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass lowers the pseudo control flow instructions to real
/// machine instructions.
///
/// All control flow is handled using predicated instructions and
/// a predicate stack.  Each Scalar ALU controls the operations of 64 Vector
/// ALUs.  The Scalar ALU can update the predicate for any of the Vector ALUs
/// by writing to the 64-bit EXEC register (each bit corresponds to a
/// single Vector ALU).  Typically, for predicates, a Vector ALU will write
/// to its bit of the VCC register (like EXEC, VCC is 64 bits, one for each
/// Vector ALU) and then the Scalar ALU will AND the VCC register with
/// EXEC to update the predicates.
///
/// For example:
/// %vcc = V_CMP_GT_F32 %vgpr1, %vgpr2
/// %sgpr0 = SI_IF %vcc
///   %vgpr0 = V_ADD_F32 %vgpr0, %vgpr0
/// %sgpr0 = SI_ELSE %sgpr0
///   %vgpr0 = V_SUB_F32 %vgpr0, %vgpr0
/// SI_END_CF %sgpr0
///
/// becomes:
///
/// %sgpr0 = S_AND_SAVEEXEC_B64 %vcc  // Save and update the exec mask
/// %sgpr0 = S_XOR_B64 %sgpr0, %exec  // Clear live bits from saved exec mask
/// S_CBRANCH_EXECZ label0            // This instruction is an optional
///                                   // optimization which allows us to
///                                   // branch if all the bits of
///                                   // EXEC are zero.
/// %vgpr0 = V_ADD_F32 %vgpr0, %vgpr0 // Do the IF block of the branch
///
/// label0:
/// %sgpr0 = S_OR_SAVEEXEC_B64 %sgpr0  // Restore the exec mask for the ELSE
///                                    // block
/// %exec = S_XOR_B64 %sgpr0, %exec    // Update the exec mask
/// S_CBRANCH_EXECZ label1             // Use our branch optimization
///                                    // instruction again.
/// %vgpr0 = V_SUB_F32 %vgpr0, %vgpr0  // Do the ELSE block
/// label1:
/// %exec = S_OR_B64 %exec, %sgpr0     // Re-enable saved exec mask bits
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/LiveVariables.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;

#define DEBUG_TYPE "si-lower-control-flow"

static cl::opt<bool>
RemoveRedundantEndcf("amdgpu-remove-redundant-endcf",
    cl::init(true), cl::ReallyHidden);

namespace {

class SILowerControlFlow : public MachineFunctionPass {
private:
  const SIRegisterInfo *TRI = nullptr;
  const SIInstrInfo *TII = nullptr;
  LiveIntervals *LIS = nullptr;
  LiveVariables *LV = nullptr;
  MachineDominatorTree *MDT = nullptr;
  MachineRegisterInfo *MRI = nullptr;
  SetVector<MachineInstr*> LoweredEndCf;
  DenseSet<Register> LoweredIf;
  SmallSet<MachineBasicBlock *, 4> KillBlocks;
  SmallSet<Register, 8> RecomputeRegs;

  const TargetRegisterClass *BoolRC = nullptr;
  unsigned AndOpc;
  unsigned OrOpc;
  unsigned XorOpc;
  unsigned MovTermOpc;
  unsigned Andn2TermOpc;
  unsigned XorTermrOpc;
  unsigned OrTermrOpc;
  unsigned OrSaveExecOpc;
  unsigned Exec;

  bool EnableOptimizeEndCf = false;

  bool hasKill(const MachineBasicBlock *Begin, const MachineBasicBlock *End);

  void emitIf(MachineInstr &MI);
  void emitElse(MachineInstr &MI);
  void emitIfBreak(MachineInstr &MI);
  void emitLoop(MachineInstr &MI);

  MachineBasicBlock *emitEndCf(MachineInstr &MI);

  void lowerInitExec(MachineBasicBlock *MBB, MachineInstr &MI);

  void findMaskOperands(MachineInstr &MI, unsigned OpNo,
                        SmallVectorImpl<MachineOperand> &Src) const;

  void combineMasks(MachineInstr &MI);

  bool removeMBBifRedundant(MachineBasicBlock &MBB);

  MachineBasicBlock *process(MachineInstr &MI);

  // Skip to the next instruction, ignoring debug instructions and trivial
  // block boundaries (blocks that have one (typically fallthrough) successor,
  // and the successor has one predecessor).
  MachineBasicBlock::iterator
  skipIgnoreExecInstsTrivialSucc(MachineBasicBlock &MBB,
                                 MachineBasicBlock::iterator It) const;

  /// Find the insertion point for a new conditional branch.
  MachineBasicBlock::iterator
  skipToUncondBrOrEnd(MachineBasicBlock &MBB,
                      MachineBasicBlock::iterator I) const {
    assert(I->isTerminator());

    // FIXME: What if we had multiple pre-existing conditional branches?
    MachineBasicBlock::iterator End = MBB.end();
    while (I != End && !I->isUnconditionalBranch())
      ++I;
    return I;
  }

  // Remove redundant SI_END_CF instructions.
  void optimizeEndCf();

public:
  static char ID;

  SILowerControlFlow() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "SI Lower control flow pseudo instructions";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addUsedIfAvailable<LiveIntervals>();
    // Should preserve the same set that TwoAddressInstructions does.
    AU.addPreserved<MachineDominatorTree>();
    AU.addPreserved<SlotIndexes>();
    AU.addPreserved<LiveIntervals>();
    AU.addPreservedID(LiveVariablesID);
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // end anonymous namespace

char SILowerControlFlow::ID = 0;

INITIALIZE_PASS(SILowerControlFlow, DEBUG_TYPE,
               "SI lower control flow", false, false)

static void setImpSCCDefDead(MachineInstr &MI, bool IsDead) {
  MachineOperand &ImpDefSCC = MI.getOperand(3);
  assert(ImpDefSCC.getReg() == AMDGPU::SCC && ImpDefSCC.isDef());

  ImpDefSCC.setIsDead(IsDead);
}

char &llvm::SILowerControlFlowID = SILowerControlFlow::ID;

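// Return true if any block containing a SI_KILL_*_TERMINATOR (or an
// SI_DEMOTE_I1, see KillBlocks) is reachable from Begin's successors,
// stopping the search at End.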
bool SILowerControlFlow::hasKill(const MachineBasicBlock *Begin,
                                 const MachineBasicBlock *End) {
  DenseSet<const MachineBasicBlock*> Visited;
  SmallVector<MachineBasicBlock *, 4> Worklist(Begin->successors());

  while (!Worklist.empty()) {
    MachineBasicBlock *MBB = Worklist.pop_back_val();

    if (MBB == End || !Visited.insert(MBB).second)
      continue;
    if (KillBlocks.contains(MBB))
      return true;

    Worklist.append(MBB->succ_begin(), MBB->succ_end());
  }

  return false;
}

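// Return true if the only non-debug use of the SI_IF result (the saved exec
// mask) is the matching SI_END_CF, i.e. there is no SI_ELSE and nothing else
// reads the saved mask.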
static bool isSimpleIf(const MachineInstr &MI, const MachineRegisterInfo *MRI) {
  Register SaveExecReg = MI.getOperand(0).getReg();
  auto U = MRI->use_instr_nodbg_begin(SaveExecReg);

  if (U == MRI->use_instr_nodbg_end() ||
      std::next(U) != MRI->use_instr_nodbg_end() ||
      U->getOpcode() != AMDGPU::SI_END_CF)
    return false;

  return true;
}

void SILowerControlFlow::emitIf(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock::iterator I(&MI);
  Register SaveExecReg = MI.getOperand(0).getReg();
  MachineOperand& Cond = MI.getOperand(1);
  assert(Cond.getSubReg() == AMDGPU::NoSubRegister);

  MachineOperand &ImpDefSCC = MI.getOperand(4);
  assert(ImpDefSCC.getReg() == AMDGPU::SCC && ImpDefSCC.isDef());

  // If there is only one use of save exec register and that use is SI_END_CF,
  // we can optimize SI_IF by returning the full saved exec mask instead of
  // just cleared bits.
  bool SimpleIf = isSimpleIf(MI, MRI);

  if (SimpleIf) {
    // Check for SI_KILL_*_TERMINATOR on path from if to endif.
    // If there is any such terminator, simplifications are not safe.
    auto UseMI = MRI->use_instr_nodbg_begin(SaveExecReg);
    SimpleIf = !hasKill(MI.getParent(), UseMI->getParent());
  }

  // Add an implicit def of exec to discourage scheduling VALU after this which
  // will interfere with trying to form s_and_saveexec_b64 later.
  Register CopyReg = SimpleIf ? SaveExecReg
                       : MRI->createVirtualRegister(BoolRC);
  MachineInstr *CopyExec =
    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), CopyReg)
    .addReg(Exec)
    .addReg(Exec, RegState::ImplicitDefine);
  LoweredIf.insert(CopyReg);

  Register Tmp = MRI->createVirtualRegister(BoolRC);

  MachineInstr *And =
    BuildMI(MBB, I, DL, TII->get(AndOpc), Tmp)
    .addReg(CopyReg)
    .add(Cond);
  if (LV)
    LV->replaceKillInstruction(Cond.getReg(), MI, *And);

  setImpSCCDefDead(*And, true);

  MachineInstr *Xor = nullptr;
  if (!SimpleIf) {
    Xor =
      BuildMI(MBB, I, DL, TII->get(XorOpc), SaveExecReg)
      .addReg(Tmp)
      .addReg(CopyReg);
    setImpSCCDefDead(*Xor, ImpDefSCC.isDead());
  }

  // Use a copy that is a terminator to get correct spill code placement with
  // fast regalloc.
  MachineInstr *SetExec =
    BuildMI(MBB, I, DL, TII->get(MovTermOpc), Exec)
    .addReg(Tmp, RegState::Kill);
  if (LV)
    LV->getVarInfo(Tmp).Kills.push_back(SetExec);

  // Skip ahead to the unconditional branch in case there are other terminators
  // present.
  I = skipToUncondBrOrEnd(MBB, I);

  // Insert the S_CBRANCH_EXECZ instruction which will be optimized later
  // during SIRemoveShortExecBranches.
  MachineInstr *NewBr = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
                            .add(MI.getOperand(2));

  if (!LIS) {
    MI.eraseFromParent();
    return;
  }

  LIS->InsertMachineInstrInMaps(*CopyExec);

  // Replace MI with the And so we don't need to fix the live interval for the
  // condition register.
  LIS->ReplaceMachineInstrInMaps(MI, *And);

  if (!SimpleIf)
    LIS->InsertMachineInstrInMaps(*Xor);
  LIS->InsertMachineInstrInMaps(*SetExec);
  LIS->InsertMachineInstrInMaps(*NewBr);

  LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC);
  MI.eraseFromParent();

  // FIXME: Is there a better way of adjusting the liveness? It shouldn't be
  // hard to add another def here but I'm not sure how to correctly update the
  // valno.
  RecomputeRegs.insert(SaveExecReg);
  LIS->createAndComputeVirtRegInterval(Tmp);
  if (!SimpleIf)
    LIS->createAndComputeVirtRegInterval(CopyReg);
}

void SILowerControlFlow::emitElse(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();

  MachineBasicBlock::iterator Start = MBB.begin();

  // This must be inserted before phis and any spill code inserted before the
  // else.
  Register SaveReg = MRI->createVirtualRegister(BoolRC);
  MachineInstr *OrSaveExec =
    BuildMI(MBB, Start, DL, TII->get(OrSaveExecOpc), SaveReg)
    .add(MI.getOperand(1)); // Saved EXEC
  if (LV)
    LV->replaceKillInstruction(SrcReg, MI, *OrSaveExec);

  MachineBasicBlock *DestBB = MI.getOperand(2).getMBB();

  MachineBasicBlock::iterator ElsePt(MI);

  // This accounts for any modification of the EXEC mask within the block and
  // can be optimized out pre-RA when not required.
  MachineInstr *And = BuildMI(MBB, ElsePt, DL, TII->get(AndOpc), DstReg)
                          .addReg(Exec)
                          .addReg(SaveReg);

  MachineInstr *Xor =
    BuildMI(MBB, ElsePt, DL, TII->get(XorTermrOpc), Exec)
    .addReg(Exec)
    .addReg(DstReg);

  // Skip ahead to the unconditional branch in case there are other terminators
  // present.
  ElsePt = skipToUncondBrOrEnd(MBB, ElsePt);

  MachineInstr *Branch =
      BuildMI(MBB, ElsePt, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
          .addMBB(DestBB);

  if (!LIS) {
    MI.eraseFromParent();
    return;
  }

  LIS->RemoveMachineInstrFromMaps(MI);
  MI.eraseFromParent();

  LIS->InsertMachineInstrInMaps(*OrSaveExec);
  LIS->InsertMachineInstrInMaps(*And);

  LIS->InsertMachineInstrInMaps(*Xor);
  LIS->InsertMachineInstrInMaps(*Branch);

  RecomputeRegs.insert(SrcReg);
  RecomputeRegs.insert(DstReg);
  LIS->createAndComputeVirtRegInterval(SaveReg);

  // Let this be recomputed.
  LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC);
}

void SILowerControlFlow::emitIfBreak(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();
  auto Dst = MI.getOperand(0).getReg();

  // Skip ANDing with exec if the break condition is already masked by exec
  // because it is a V_CMP in the same basic block. (We know the break
  // condition operand was an i1 in IR, so if it is a VALU instruction it must
  // be one with a carry-out.)
  bool SkipAnding = false;
  if (MI.getOperand(1).isReg()) {
    if (MachineInstr *Def = MRI->getUniqueVRegDef(MI.getOperand(1).getReg())) {
      SkipAnding = Def->getParent() == MI.getParent()
          && SIInstrInfo::isVALU(*Def);
    }
  }

  // AND the break condition operand with exec, then OR that into the "loop
  // exit" mask.
  MachineInstr *And = nullptr, *Or = nullptr;
  Register AndReg;
  if (!SkipAnding) {
    AndReg = MRI->createVirtualRegister(BoolRC);
    And = BuildMI(MBB, &MI, DL, TII->get(AndOpc), AndReg)
             .addReg(Exec)
             .add(MI.getOperand(1));
    if (LV)
      LV->replaceKillInstruction(MI.getOperand(1).getReg(), MI, *And);
    Or = BuildMI(MBB, &MI, DL, TII->get(OrOpc), Dst)
             .addReg(AndReg)
             .add(MI.getOperand(2));
  } else {
    Or = BuildMI(MBB, &MI, DL, TII->get(OrOpc), Dst)
             .add(MI.getOperand(1))
             .add(MI.getOperand(2));
    if (LV)
      LV->replaceKillInstruction(MI.getOperand(1).getReg(), MI, *Or);
  }
  if (LV)
    LV->replaceKillInstruction(MI.getOperand(2).getReg(), MI, *Or);

  if (LIS) {
    LIS->ReplaceMachineInstrInMaps(MI, *Or);
    if (And) {
      // The read of original operand 1 is now on the And, not the Or.
      RecomputeRegs.insert(And->getOperand(2).getReg());
      LIS->InsertMachineInstrInMaps(*And);
      LIS->createAndComputeVirtRegInterval(AndReg);
    }
  }

  MI.eraseFromParent();
}

void SILowerControlFlow::emitLoop(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  MachineInstr *AndN2 =
      BuildMI(MBB, &MI, DL, TII->get(Andn2TermOpc), Exec)
          .addReg(Exec)
          .add(MI.getOperand(0));
  if (LV)
    LV->replaceKillInstruction(MI.getOperand(0).getReg(), MI, *AndN2);

  auto BranchPt = skipToUncondBrOrEnd(MBB, MI.getIterator());
  MachineInstr *Branch =
      BuildMI(MBB, BranchPt, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
          .add(MI.getOperand(1));

  if (LIS) {
    RecomputeRegs.insert(MI.getOperand(0).getReg());
    LIS->ReplaceMachineInstrInMaps(MI, *AndN2);
    LIS->InsertMachineInstrInMaps(*Branch);
  }

  MI.eraseFromParent();
}

MachineBasicBlock::iterator
SILowerControlFlow::skipIgnoreExecInstsTrivialSucc(
  MachineBasicBlock &MBB, MachineBasicBlock::iterator It) const {

  SmallSet<const MachineBasicBlock *, 4> Visited;
  MachineBasicBlock *B = &MBB;
  do {
    if (!Visited.insert(B).second)
      return MBB.end();

    auto E = B->end();
    for ( ; It != E; ++It) {
      if (TII->mayReadEXEC(*MRI, *It))
        break;
    }

    if (It != E)
      return It;

    if (B->succ_size() != 1)
      return MBB.end();

    // If there is one trivial successor, advance to the next block.
    MachineBasicBlock *Succ = *B->succ_begin();

    It = Succ->begin();
    B = Succ;
  } while (true);
}

MachineBasicBlock *SILowerControlFlow::emitEndCf(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  const DebugLoc &DL = MI.getDebugLoc();

  MachineBasicBlock::iterator InsPt = MBB.begin();

  // If we have instructions that aren't prolog instructions, split the block
  // and emit a terminator instruction. This ensures correct spill placement.
  // FIXME: We should unconditionally split the block here.
  bool NeedBlockSplit = false;
  Register DataReg = MI.getOperand(0).getReg();
  for (MachineBasicBlock::iterator I = InsPt, E = MI.getIterator();
       I != E; ++I) {
    if (I->modifiesRegister(DataReg, TRI)) {
      NeedBlockSplit = true;
      break;
    }
  }

  unsigned Opcode = OrOpc;
  MachineBasicBlock *SplitBB = &MBB;
  if (NeedBlockSplit) {
    SplitBB = MBB.splitAt(MI, /*UpdateLiveIns*/true, LIS);
    if (MDT && SplitBB != &MBB) {
      MachineDomTreeNode *MBBNode = (*MDT)[&MBB];
      SmallVector<MachineDomTreeNode *> Children(MBBNode->begin(),
                                                 MBBNode->end());
      MachineDomTreeNode *SplitBBNode = MDT->addNewBlock(SplitBB, &MBB);
      for (MachineDomTreeNode *Child : Children)
        MDT->changeImmediateDominator(Child, SplitBBNode);
    }
    Opcode = OrTermrOpc;
    InsPt = MI;
  }

  MachineInstr *NewMI =
    BuildMI(MBB, InsPt, DL, TII->get(Opcode), Exec)
    .addReg(Exec)
    .add(MI.getOperand(0));
  if (LV) {
    LV->replaceKillInstruction(DataReg, MI, *NewMI);

    if (SplitBB != &MBB) {
      // Track the set of registers defined in the original block so we don't
      // accidentally add the original block to AliveBlocks. AliveBlocks only
      // includes blocks which are live through, which excludes live outs and
      // local defs.
      DenseSet<Register> DefInOrigBlock;

      for (MachineBasicBlock *BlockPiece : {&MBB, SplitBB}) {
        for (MachineInstr &X : *BlockPiece) {
          for (MachineOperand &Op : X.all_defs()) {
            if (Op.getReg().isVirtual())
              DefInOrigBlock.insert(Op.getReg());
          }
        }
      }

      for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
        Register Reg = Register::index2VirtReg(i);
        LiveVariables::VarInfo &VI = LV->getVarInfo(Reg);

        if (VI.AliveBlocks.test(MBB.getNumber()))
          VI.AliveBlocks.set(SplitBB->getNumber());
        else {
          for (MachineInstr *Kill : VI.Kills) {
            if (Kill->getParent() == SplitBB && !DefInOrigBlock.contains(Reg))
              VI.AliveBlocks.set(MBB.getNumber());
          }
        }
      }
    }
  }

  LoweredEndCf.insert(NewMI);

  if (LIS)
    LIS->ReplaceMachineInstrInMaps(MI, *NewMI);

  MI.eraseFromParent();

  if (LIS)
    LIS->handleMove(*NewMI);
  return SplitBB;
}

// Returns replacement operands for a logical operation: either a single
// operand for exec, or two operands if the source was another equivalent
// operation.
void SILowerControlFlow::findMaskOperands(MachineInstr &MI, unsigned OpNo,
       SmallVectorImpl<MachineOperand> &Src) const {
  MachineOperand &Op = MI.getOperand(OpNo);
  if (!Op.isReg() || !Op.getReg().isVirtual()) {
    Src.push_back(Op);
    return;
  }

  MachineInstr *Def = MRI->getUniqueVRegDef(Op.getReg());
  if (!Def || Def->getParent() != MI.getParent() ||
      !(Def->isFullCopy() || (Def->getOpcode() == MI.getOpcode())))
    return;

  // Make sure we do not modify exec between def and use.
  // A copy with implicitly defined exec inserted earlier is an exception; it
  // does not really modify exec.
  for (auto I = Def->getIterator(); I != MI.getIterator(); ++I)
    if (I->modifiesRegister(AMDGPU::EXEC, TRI) &&
        !(I->isCopy() && I->getOperand(0).getReg() != Exec))
      return;

  for (const auto &SrcOp : Def->explicit_operands())
    if (SrcOp.isReg() && SrcOp.isUse() &&
        (SrcOp.getReg().isVirtual() || SrcOp.getReg() == Exec))
      Src.push_back(SrcOp);
}

// Search and combine pairs of equivalent instructions, like
// S_AND_B64 x, (S_AND_B64 x, y) => S_AND_B64 x, y
// S_OR_B64  x, (S_OR_B64  x, y) => S_OR_B64  x, y
// One of the operands is the exec mask.
void SILowerControlFlow::combineMasks(MachineInstr &MI) {
  assert(MI.getNumExplicitOperands() == 3);
  SmallVector<MachineOperand, 4> Ops;
  unsigned OpToReplace = 1;
  findMaskOperands(MI, 1, Ops);
  if (Ops.size() == 1) OpToReplace = 2; // First operand can be exec or its copy
  findMaskOperands(MI, 2, Ops);
  if (Ops.size() != 3) return;

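  // Ops now holds the operand that was kept as-is plus the two sources of the
  // nested mask operation. If two of the three entries are identical, the
  // nested operation is redundant (e.g. S_AND_B64 x, (S_AND_B64 x, y) only
  // needs x and y), so it can be folded away below: its use is replaced and,
  // if it becomes dead, the defining instruction is erased.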
  unsigned UniqueOpndIdx;
  if (Ops[0].isIdenticalTo(Ops[1])) UniqueOpndIdx = 2;
  else if (Ops[0].isIdenticalTo(Ops[2])) UniqueOpndIdx = 1;
  else if (Ops[1].isIdenticalTo(Ops[2])) UniqueOpndIdx = 1;
  else return;

  Register Reg = MI.getOperand(OpToReplace).getReg();
  MI.removeOperand(OpToReplace);
  MI.addOperand(Ops[UniqueOpndIdx]);
  if (MRI->use_empty(Reg))
    MRI->getUniqueVRegDef(Reg)->eraseFromParent();
}

void SILowerControlFlow::optimizeEndCf() {
  // If the instruction immediately following this END_CF is another END_CF in
  // the only successor, we can avoid emitting the exec mask restore here.
  if (!EnableOptimizeEndCf)
    return;

  for (MachineInstr *MI : reverse(LoweredEndCf)) {
    MachineBasicBlock &MBB = *MI->getParent();
    auto Next =
      skipIgnoreExecInstsTrivialSucc(MBB, std::next(MI->getIterator()));
    if (Next == MBB.end() || !LoweredEndCf.count(&*Next))
      continue;
    // Only skip the inner END_CF if the outer END_CF belongs to an SI_IF.
    // If it belongs to an SI_ELSE, the saved mask has an inverted value.
    Register SavedExec
      = TII->getNamedOperand(*Next, AMDGPU::OpName::src1)->getReg();
    assert(SavedExec.isVirtual() && "Expected saved exec to be src1!");

    const MachineInstr *Def = MRI->getUniqueVRegDef(SavedExec);
    if (Def && LoweredIf.count(SavedExec)) {
      LLVM_DEBUG(dbgs() << "Skip redundant "; MI->dump());
      if (LIS)
        LIS->RemoveMachineInstrFromMaps(*MI);
      Register Reg;
      if (LV)
        Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::src1)->getReg();
      MI->eraseFromParent();
      if (LV)
        LV->recomputeForSingleDefVirtReg(Reg);
      removeMBBifRedundant(MBB);
    }
  }
}

MachineBasicBlock *SILowerControlFlow::process(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  MachineBasicBlock::iterator I(MI);
  MachineInstr *Prev = (I != MBB.begin()) ? &*(std::prev(I)) : nullptr;

  MachineBasicBlock *SplitBB = &MBB;

  switch (MI.getOpcode()) {
  case AMDGPU::SI_IF:
    emitIf(MI);
    break;

  case AMDGPU::SI_ELSE:
    emitElse(MI);
    break;

  case AMDGPU::SI_IF_BREAK:
    emitIfBreak(MI);
    break;

  case AMDGPU::SI_LOOP:
    emitLoop(MI);
    break;

  case AMDGPU::SI_WATERFALL_LOOP:
    MI.setDesc(TII->get(AMDGPU::S_CBRANCH_EXECNZ));
    break;

  case AMDGPU::SI_END_CF:
    SplitBB = emitEndCf(MI);
    break;

  default:
    assert(false && "Attempt to process unsupported instruction");
    break;
  }

  MachineBasicBlock::iterator Next;
  for (I = Prev ? Prev->getIterator() : MBB.begin(); I != MBB.end(); I = Next) {
    Next = std::next(I);
    MachineInstr &MaskMI = *I;
    switch (MaskMI.getOpcode()) {
    case AMDGPU::S_AND_B64:
    case AMDGPU::S_OR_B64:
    case AMDGPU::S_AND_B32:
    case AMDGPU::S_OR_B32:
      // Clean up bit manipulations on the exec mask
      combineMasks(MaskMI);
      break;
    default:
      I = MBB.end();
      break;
    }
  }

  return SplitBB;
}

void SILowerControlFlow::lowerInitExec(MachineBasicBlock *MBB,
                                       MachineInstr &MI) {
  MachineFunction &MF = *MBB->getParent();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  bool IsWave32 = ST.isWave32();

  if (MI.getOpcode() == AMDGPU::SI_INIT_EXEC) {
    // This should be before all vector instructions.
    MachineInstr *InitMI = BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(),
            TII->get(IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64), Exec)
        .addImm(MI.getOperand(0).getImm());
    if (LIS) {
      LIS->RemoveMachineInstrFromMaps(MI);
      LIS->InsertMachineInstrInMaps(*InitMI);
    }
    MI.eraseFromParent();
    return;
  }

  // Extract the thread count from an SGPR input and set EXEC accordingly.
  // Since BFM can't shift by 64, handle that case with CMP + CMOV.
  //
  // S_BFE_U32 count, input, {shift, 7}
  // S_BFM_B64 exec, count, 0
  // S_CMP_EQ_U32 count, 64
  // S_CMOV_B64 exec, -1
  Register InputReg = MI.getOperand(0).getReg();
  MachineInstr *FirstMI = &*MBB->begin();
  if (InputReg.isVirtual()) {
    MachineInstr *DefInstr = MRI->getVRegDef(InputReg);
    assert(DefInstr && DefInstr->isCopy());
    if (DefInstr->getParent() == MBB) {
      if (DefInstr != FirstMI) {
        // If the `InputReg` is defined in the current block, we also need to
        // move that instruction to the beginning of the block.
        DefInstr->removeFromParent();
        MBB->insert(FirstMI, DefInstr);
        if (LIS)
          LIS->handleMove(*DefInstr);
      } else {
        // If the first instruction is the definition, move the pointer past it.
        FirstMI = &*std::next(FirstMI->getIterator());
      }
    }
  }

  // Insert instruction sequence at block beginning (before vector operations).
  const DebugLoc DL = MI.getDebugLoc();
  const unsigned WavefrontSize = ST.getWavefrontSize();
  const unsigned Mask = (WavefrontSize << 1) - 1;
  Register CountReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
  auto BfeMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_BFE_U32), CountReg)
                   .addReg(InputReg)
                   .addImm((MI.getOperand(1).getImm() & Mask) | 0x70000);
  if (LV)
    LV->recomputeForSingleDefVirtReg(InputReg);
  auto BfmMI =
      BuildMI(*MBB, FirstMI, DL,
              TII->get(IsWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64), Exec)
          .addReg(CountReg)
          .addImm(0);
  auto CmpMI = BuildMI(*MBB, FirstMI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
                   .addReg(CountReg, RegState::Kill)
                   .addImm(WavefrontSize);
  if (LV)
    LV->getVarInfo(CountReg).Kills.push_back(CmpMI);
  auto CmovMI =
      BuildMI(*MBB, FirstMI, DL,
              TII->get(IsWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64),
              Exec)
          .addImm(-1);

  if (!LIS) {
    MI.eraseFromParent();
    return;
  }

  LIS->RemoveMachineInstrFromMaps(MI);
  MI.eraseFromParent();

  LIS->InsertMachineInstrInMaps(*BfeMI);
  LIS->InsertMachineInstrInMaps(*BfmMI);
  LIS->InsertMachineInstrInMaps(*CmpMI);
  LIS->InsertMachineInstrInMaps(*CmovMI);

  RecomputeRegs.insert(InputReg);
  LIS->createAndComputeVirtRegInterval(CountReg);
}

bool SILowerControlFlow::removeMBBifRedundant(MachineBasicBlock &MBB) {
  for (auto &I : MBB.instrs()) {
    if (!I.isDebugInstr() && !I.isUnconditionalBranch())
      return false;
  }

  assert(MBB.succ_size() == 1 && "MBB has more than one successor");

  MachineBasicBlock *Succ = *MBB.succ_begin();
  MachineBasicBlock *FallThrough = nullptr;

  while (!MBB.predecessors().empty()) {
    MachineBasicBlock *P = *MBB.pred_begin();
    if (P->getFallThrough(false) == &MBB)
      FallThrough = P;
    P->ReplaceUsesOfBlockWith(&MBB, Succ);
  }
  MBB.removeSuccessor(Succ);
  if (LIS) {
    for (auto &I : MBB.instrs())
      LIS->RemoveMachineInstrFromMaps(I);
  }
  if (MDT) {
    // If Succ, the single successor of MBB, is dominated by MBB, MDT needs
    // updating by changing Succ's idom to the one of MBB; otherwise, MBB must
    // be a leaf node in MDT and could be erased directly.
    if (MDT->dominates(&MBB, Succ))
      MDT->changeImmediateDominator(MDT->getNode(Succ),
                                    MDT->getNode(&MBB)->getIDom());
    MDT->eraseNode(&MBB);
  }
  MBB.clear();
  MBB.eraseFromParent();
  if (FallThrough && !FallThrough->isLayoutSuccessor(Succ)) {
    // Note: we cannot update block layout and preserve live intervals;
    // hence we must insert a branch.
    MachineInstr *BranchMI = BuildMI(*FallThrough, FallThrough->end(),
            FallThrough->findBranchDebugLoc(), TII->get(AMDGPU::S_BRANCH))
        .addMBB(Succ);
    if (LIS)
      LIS->InsertMachineInstrInMaps(*BranchMI);
  }

  return true;
}

bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();
  EnableOptimizeEndCf = RemoveRedundantEndcf &&
                        MF.getTarget().getOptLevel() > CodeGenOptLevel::None;

  // This doesn't actually need LiveIntervals, but we can preserve them.
  LIS = getAnalysisIfAvailable<LiveIntervals>();
  // This doesn't actually need LiveVariables, but we can preserve them.
  LV = getAnalysisIfAvailable<LiveVariables>();
  MDT = getAnalysisIfAvailable<MachineDominatorTree>();
  MRI = &MF.getRegInfo();
  BoolRC = TRI->getBoolRC();

  if (ST.isWave32()) {
    AndOpc = AMDGPU::S_AND_B32;
    OrOpc = AMDGPU::S_OR_B32;
    XorOpc = AMDGPU::S_XOR_B32;
    MovTermOpc = AMDGPU::S_MOV_B32_term;
    Andn2TermOpc = AMDGPU::S_ANDN2_B32_term;
    XorTermrOpc = AMDGPU::S_XOR_B32_term;
    OrTermrOpc = AMDGPU::S_OR_B32_term;
    OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B32;
    Exec = AMDGPU::EXEC_LO;
  } else {
    AndOpc = AMDGPU::S_AND_B64;
    OrOpc = AMDGPU::S_OR_B64;
    XorOpc = AMDGPU::S_XOR_B64;
    MovTermOpc = AMDGPU::S_MOV_B64_term;
    Andn2TermOpc = AMDGPU::S_ANDN2_B64_term;
    XorTermrOpc = AMDGPU::S_XOR_B64_term;
    OrTermrOpc = AMDGPU::S_OR_B64_term;
    OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B64;
    Exec = AMDGPU::EXEC;
  }

  // Compute set of blocks with kills
  const bool CanDemote =
      MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS;
  for (auto &MBB : MF) {
    bool IsKillBlock = false;
    for (auto &Term : MBB.terminators()) {
      if (TII->isKillTerminator(Term.getOpcode())) {
        KillBlocks.insert(&MBB);
        IsKillBlock = true;
        break;
      }
    }
    if (CanDemote && !IsKillBlock) {
      for (auto &MI : MBB) {
        if (MI.getOpcode() == AMDGPU::SI_DEMOTE_I1) {
          KillBlocks.insert(&MBB);
          break;
        }
      }
    }
  }

  bool Changed = false;
  MachineFunction::iterator NextBB;
  for (MachineFunction::iterator BI = MF.begin();
       BI != MF.end(); BI = NextBB) {
    NextBB = std::next(BI);
    MachineBasicBlock *MBB = &*BI;

    MachineBasicBlock::iterator I, E, Next;
    E = MBB->end();
    for (I = MBB->begin(); I != E; I = Next) {
      Next = std::next(I);
      MachineInstr &MI = *I;
      MachineBasicBlock *SplitMBB = MBB;

      switch (MI.getOpcode()) {
      case AMDGPU::SI_IF:
      case AMDGPU::SI_ELSE:
      case AMDGPU::SI_IF_BREAK:
      case AMDGPU::SI_WATERFALL_LOOP:
      case AMDGPU::SI_LOOP:
      case AMDGPU::SI_END_CF:
        SplitMBB = process(MI);
        Changed = true;
        break;

      // FIXME: find a better place for this
      case AMDGPU::SI_INIT_EXEC:
      case AMDGPU::SI_INIT_EXEC_FROM_INPUT:
        lowerInitExec(MBB, MI);
        if (LIS)
          LIS->removeAllRegUnitsForPhysReg(AMDGPU::EXEC);
        Changed = true;
        break;

      default:
        break;
      }

      if (SplitMBB != MBB) {
        MBB = Next->getParent();
        E = MBB->end();
      }
    }
  }

  optimizeEndCf();

  if (LIS) {
    for (Register Reg : RecomputeRegs) {
      LIS->removeInterval(Reg);
      LIS->createAndComputeVirtRegInterval(Reg);
    }
  }

  RecomputeRegs.clear();
  LoweredEndCf.clear();
  LoweredIf.clear();
  KillBlocks.clear();

  return Changed;
}