//===-- SIInsertSkips.cpp - Use predicates for control flow ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass inserts branches on the 0 exec mask over divergent branches
/// when it's expected that jumping over the untaken control flow will be
/// cheaper than having every workitem no-op through it.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/InitializePasses.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetMachine.h"
#include <cassert>
#include <cstdint>
#include <iterator>

using namespace llvm;

#define DEBUG_TYPE "si-insert-skips"

static cl::opt<unsigned> SkipThresholdFlag(
  "amdgpu-skip-threshold-legacy",
  cl::desc("Number of instructions before jumping over divergent control flow"),
  cl::init(12), cl::Hidden);

namespace {

class SIInsertSkips : public MachineFunctionPass {
private:
  const SIRegisterInfo *TRI = nullptr;
  const SIInstrInfo *TII = nullptr;
  unsigned SkipThreshold = 0;
  MachineDominatorTree *MDT = nullptr;

  MachineBasicBlock *EarlyExitBlock = nullptr;

  bool shouldSkip(const MachineBasicBlock &From,
                  const MachineBasicBlock &To) const;

  bool dominatesAllReachable(MachineBasicBlock &MBB);
  void createEarlyExitBlock(MachineBasicBlock &MBB);
  void skipIfDead(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                  DebugLoc DL);

  bool kill(MachineInstr &MI);

  bool skipMaskBranch(MachineInstr &MI, MachineBasicBlock &MBB);

public:
  static char ID;

  SIInsertSkips() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "SI insert s_cbranch_execz instructions";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<MachineDominatorTree>();
    AU.addPreserved<MachineDominatorTree>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // end anonymous namespace

char SIInsertSkips::ID = 0;

INITIALIZE_PASS_BEGIN(SIInsertSkips, DEBUG_TYPE,
                      "SI insert s_cbranch_execz instructions", false, false)
INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_END(SIInsertSkips, DEBUG_TYPE,
                    "SI insert s_cbranch_execz instructions", false, false)

char &llvm::SIInsertSkipsPassID = SIInsertSkips::ID;

static bool opcodeEmitsNoInsts(const MachineInstr &MI) {
  if (MI.isMetaInstruction())
    return true;

  // Handle target specific opcodes.
  switch (MI.getOpcode()) {
  case AMDGPU::SI_MASK_BRANCH:
    return true;
  default:
    return false;
  }
}

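/// Conservatively estimate whether the instructions between \p From and \p To
/// are cheap and safe enough to simply execute with EXEC = 0, returning true
/// when a skip branch over them is required or likely profitable.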
bool SIInsertSkips::shouldSkip(const MachineBasicBlock &From,
                               const MachineBasicBlock &To) const {
  unsigned NumInstr = 0;
  const MachineFunction *MF = From.getParent();

  for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
       MBBI != End && MBBI != ToI; ++MBBI) {
    const MachineBasicBlock &MBB = *MBBI;

    for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end();
         NumInstr < SkipThreshold && I != E; ++I) {
      if (opcodeEmitsNoInsts(*I))
        continue;

      // FIXME: Since this is required for correctness, this should be inserted
      // during SILowerControlFlow.

      // When a uniform loop is inside non-uniform control flow, the branch
      // leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken
      // when EXEC = 0. We should skip the loop lest it becomes infinite.
      if (I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ ||
          I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ)
        return true;

      if (TII->hasUnwantedEffectsWhenEXECEmpty(*I))
        return true;

      // These instructions are potentially expensive even if EXEC = 0.
      if (TII->isSMRD(*I) || TII->isVMEM(*I) || TII->isFLAT(*I) ||
          I->getOpcode() == AMDGPU::S_WAITCNT)
        return true;

      ++NumInstr;
      if (NumInstr >= SkipThreshold)
        return true;
    }
  }

  return false;
}

/// Check whether \p MBB dominates all blocks that are reachable from it.
bool SIInsertSkips::dominatesAllReachable(MachineBasicBlock &MBB) {
  for (MachineBasicBlock *Other : depth_first(&MBB)) {
    if (!MDT->dominates(&MBB, Other))
      return false;
  }
  return true;
}

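// Emit a "done" export to the null target followed by s_endpgm, terminating a
// pixel shader wave whose lanes have all been killed without producing a
// normal export.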
static void generatePsEndPgm(MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator I, DebugLoc DL,
                             const SIInstrInfo *TII) {
  // Generate "null export; s_endpgm".
  BuildMI(MBB, I, DL, TII->get(AMDGPU::EXP_DONE))
      .addImm(0x09) // V_008DFC_SQ_EXP_NULL
      .addReg(AMDGPU::VGPR0, RegState::Undef)
      .addReg(AMDGPU::VGPR0, RegState::Undef)
      .addReg(AMDGPU::VGPR0, RegState::Undef)
      .addReg(AMDGPU::VGPR0, RegState::Undef)
      .addImm(1)  // vm
      .addImm(0)  // compr
      .addImm(0); // en
  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ENDPGM)).addImm(0);
}

void SIInsertSkips::createEarlyExitBlock(MachineBasicBlock &MBB) {
  MachineFunction *MF = MBB.getParent();
  DebugLoc DL;

  assert(!EarlyExitBlock);
  EarlyExitBlock = MF->CreateMachineBasicBlock();
  MF->insert(MF->end(), EarlyExitBlock);

  generatePsEndPgm(*EarlyExitBlock, EarlyExitBlock->end(), DL, TII);
}

/// Insert an "if exec=0 { null export; s_endpgm }" sequence before the given
/// iterator. Only applies to pixel shaders.
void SIInsertSkips::skipIfDead(MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator I, DebugLoc DL) {
  MachineFunction *MF = MBB.getParent();
  assert(MF->getFunction().getCallingConv() == CallingConv::AMDGPU_PS);

  // It is possible for an SI_KILL_*_TERMINATOR to sit at the bottom of a
  // basic block that has no further successors (e.g., there was an
  // `unreachable` there in IR). This can happen with original source of the
  // form:
  //
  //   if (uniform_condition) {
  //     write_to_memory();
  //     discard;
  //   }
  //
  // In this case, we write the "null export; s_endpgm" skip code in the
  // already-existing basic block.
  auto NextBBI = std::next(MBB.getIterator());
  bool NoSuccessor = I == MBB.end() &&
                     llvm::find(MBB.successors(), &*NextBBI) == MBB.succ_end();

  if (NoSuccessor) {
    generatePsEndPgm(MBB, I, DL, TII);
  } else {
    if (!EarlyExitBlock) {
      createEarlyExitBlock(MBB);
      // Update next block pointer to reflect any new blocks
      NextBBI = std::next(MBB.getIterator());
    }

    auto BranchMI = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
                        .addMBB(EarlyExitBlock);

    // Split the block if the branch will not come at the end.
    auto Next = std::next(BranchMI->getIterator());
    if (Next != MBB.end() && !Next->isTerminator()) {
      MachineBasicBlock *SplitBB =
          MF->CreateMachineBasicBlock(MBB.getBasicBlock());
      MF->insert(NextBBI, SplitBB);
      SplitBB->splice(SplitBB->begin(), &MBB, I, MBB.end());
      SplitBB->transferSuccessorsAndUpdatePHIs(&MBB);
      // FIXME: the expectation is that this will be used near the beginning
      //        of a block so just assume all registers are still live.
      for (auto LiveIn : MBB.liveins())
        SplitBB->addLiveIn(LiveIn);
      MBB.addSuccessor(SplitBB);

      // Update dominator tree
      using DomTreeT = DomTreeBase<MachineBasicBlock>;
      SmallVector<DomTreeT::UpdateType, 16> DTUpdates;
      for (MachineBasicBlock *Succ : SplitBB->successors()) {
        DTUpdates.push_back({DomTreeT::Insert, SplitBB, Succ});
        DTUpdates.push_back({DomTreeT::Delete, &MBB, Succ});
      }
      DTUpdates.push_back({DomTreeT::Insert, &MBB, SplitBB});
      MDT->getBase().applyUpdates(DTUpdates);
    }

    MBB.addSuccessor(EarlyExitBlock);
    MDT->getBase().insertEdge(&MBB, EarlyExitBlock);
  }
}

/// Translate a SI_KILL_*_TERMINATOR into exec-manipulating instructions.
/// Return true unless the terminator is a no-op.
bool SIInsertSkips::kill(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  switch (MI.getOpcode()) {
  case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: {
    unsigned Opcode = 0;

    // The opcodes are inverted because the inline immediate has to be
    // the first operand, e.g. from "x < imm" to "imm > x"
    switch (MI.getOperand(2).getImm()) {
    case ISD::SETOEQ:
    case ISD::SETEQ:
      Opcode = AMDGPU::V_CMPX_EQ_F32_e64;
      break;
    case ISD::SETOGT:
    case ISD::SETGT:
      Opcode = AMDGPU::V_CMPX_LT_F32_e64;
      break;
    case ISD::SETOGE:
    case ISD::SETGE:
      Opcode = AMDGPU::V_CMPX_LE_F32_e64;
      break;
    case ISD::SETOLT:
    case ISD::SETLT:
      Opcode = AMDGPU::V_CMPX_GT_F32_e64;
      break;
    case ISD::SETOLE:
    case ISD::SETLE:
      Opcode = AMDGPU::V_CMPX_GE_F32_e64;
      break;
    case ISD::SETONE:
    case ISD::SETNE:
      Opcode = AMDGPU::V_CMPX_LG_F32_e64;
      break;
    case ISD::SETO:
      Opcode = AMDGPU::V_CMPX_O_F32_e64;
      break;
    case ISD::SETUO:
      Opcode = AMDGPU::V_CMPX_U_F32_e64;
      break;
    case ISD::SETUEQ:
      Opcode = AMDGPU::V_CMPX_NLG_F32_e64;
      break;
    case ISD::SETUGT:
      Opcode = AMDGPU::V_CMPX_NGE_F32_e64;
      break;
    case ISD::SETUGE:
      Opcode = AMDGPU::V_CMPX_NGT_F32_e64;
      break;
    case ISD::SETULT:
      Opcode = AMDGPU::V_CMPX_NLE_F32_e64;
      break;
    case ISD::SETULE:
      Opcode = AMDGPU::V_CMPX_NLT_F32_e64;
      break;
    case ISD::SETUNE:
      Opcode = AMDGPU::V_CMPX_NEQ_F32_e64;
      break;
    default:
      llvm_unreachable("invalid ISD:SET cond code");
    }

    const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
    if (ST.hasNoSdstCMPX())
      Opcode = AMDGPU::getVCMPXNoSDstOp(Opcode);

    assert(MI.getOperand(0).isReg());

    if (TRI->isVGPR(MBB.getParent()->getRegInfo(),
                    MI.getOperand(0).getReg())) {
      Opcode = AMDGPU::getVOPe32(Opcode);
      BuildMI(MBB, &MI, DL, TII->get(Opcode))
          .add(MI.getOperand(1))
          .add(MI.getOperand(0));
    } else {
      auto I = BuildMI(MBB, &MI, DL, TII->get(Opcode));
      if (!ST.hasNoSdstCMPX())
        I.addReg(AMDGPU::VCC, RegState::Define);

      I.addImm(0)  // src0 modifiers
        .add(MI.getOperand(1))
        .addImm(0)  // src1 modifiers
        .add(MI.getOperand(0));

      I.addImm(0);  // omod
    }
    return true;
  }
  case AMDGPU::SI_KILL_I1_TERMINATOR: {
    const MachineFunction *MF = MI.getParent()->getParent();
    const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
    unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
    const MachineOperand &Op = MI.getOperand(0);
    int64_t KillVal = MI.getOperand(1).getImm();
    assert(KillVal == 0 || KillVal == -1);

    // Kill all threads if Op0 is an immediate and equal to the Kill value.
    if (Op.isImm()) {
      int64_t Imm = Op.getImm();
      assert(Imm == 0 || Imm == -1);

      if (Imm == KillVal) {
        BuildMI(MBB, &MI, DL, TII->get(ST.isWave32() ? AMDGPU::S_MOV_B32
                                                     : AMDGPU::S_MOV_B64), Exec)
          .addImm(0);
        return true;
      }
      return false;
    }

    unsigned Opcode = KillVal ? AMDGPU::S_ANDN2_B64 : AMDGPU::S_AND_B64;
    if (ST.isWave32())
      Opcode = KillVal ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_AND_B32;
    BuildMI(MBB, &MI, DL, TII->get(Opcode), Exec)
        .addReg(Exec)
        .add(Op);
    return true;
  }
  default:
    llvm_unreachable("invalid opcode, expected SI_KILL_*_TERMINATOR");
  }
}

// Returns true if a branch over the block was inserted.
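// The inserted S_CBRANCH_EXECZ is taken only when the exec mask is zero, i.e.
// when every lane of the wave is inactive for the skipped region.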
bool SIInsertSkips::skipMaskBranch(MachineInstr &MI,
                                   MachineBasicBlock &SrcMBB) {
  MachineBasicBlock *DestBB = MI.getOperand(0).getMBB();

  if (!shouldSkip(**SrcMBB.succ_begin(), *DestBB))
    return false;

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock::iterator InsPt = std::next(MI.getIterator());

  BuildMI(SrcMBB, InsPt, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
    .addMBB(DestBB);

  return true;
}

bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MDT = &getAnalysis<MachineDominatorTree>();
  SkipThreshold = SkipThresholdFlag;

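  // Kill terminators that qualify for an early exit are collected and lowered
  // after the main walk, since skipIfDead may split blocks and change the CFG.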
  SmallVector<MachineInstr *, 4> KillInstrs;
  bool MadeChange = false;

  for (MachineBasicBlock &MBB : MF) {
    MachineBasicBlock::iterator I, Next;
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);
      MachineInstr &MI = *I;

      switch (MI.getOpcode()) {
      case AMDGPU::SI_MASK_BRANCH:
        MadeChange |= skipMaskBranch(MI, MBB);
        break;

      case AMDGPU::S_BRANCH:
        // Optimize out branches to the next block.
        // FIXME: Shouldn't this be handled by BranchFolding?
        if (MBB.isLayoutSuccessor(MI.getOperand(0).getMBB())) {
          assert(&MI == &MBB.back());
          MI.eraseFromParent();
          MadeChange = true;
        }
        break;

      case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
      case AMDGPU::SI_KILL_I1_TERMINATOR: {
        MadeChange = true;
        bool CanKill = kill(MI);

        // Check if we can add an early "if exec=0 { end shader }".
        //
        // Note that we _always_ do this if it is correct, even if the kill
        // happens fairly late in the shader, because the null export should
        // generally still be cheaper than normal export(s).
        //
        // TODO: The dominatesAllReachable check is conservative: if the
        //       dominance is only missing due to _uniform_ branches, we could
        //       in fact insert the early-exit as well.
        if (CanKill &&
            MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS &&
            dominatesAllReachable(MBB)) {
          // Mark the instruction for kill-if-dead insertion. We delay this
          // change because it modifies the CFG.
          KillInstrs.push_back(&MI);
        } else {
          MI.eraseFromParent();
        }
        break;
      }

      case AMDGPU::SI_KILL_CLEANUP:
        if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS &&
            dominatesAllReachable(MBB)) {
          KillInstrs.push_back(&MI);
        } else {
          MI.eraseFromParent();
        }
        break;

      default:
        break;
      }
    }
  }

  for (MachineInstr *Kill : KillInstrs) {
    skipIfDead(*Kill->getParent(), std::next(Kill->getIterator()),
               Kill->getDebugLoc());
    Kill->eraseFromParent();
  }
  KillInstrs.clear();
  EarlyExitBlock = nullptr;

  return MadeChange;
}