//===-- SIInsertSkips.cpp - Use predicates for control flow ---------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass inserts branches on the 0 exec mask over divergent branches
/// when it's expected that jumping over the untaken control flow will be
/// cheaper than having every workitem no-op through it.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Target/TargetMachine.h"
#include <cassert>
#include <cstdint>
#include <iterator>

using namespace llvm;

#define DEBUG_TYPE "si-insert-skips"

static cl::opt<unsigned> SkipThresholdFlag(
  "amdgpu-skip-threshold",
  cl::desc("Number of instructions before jumping over divergent control flow"),
  cl::init(12), cl::Hidden);

namespace {

class SIInsertSkips : public MachineFunctionPass {
private:
  const SIRegisterInfo *TRI = nullptr;
  const SIInstrInfo *TII = nullptr;
  unsigned SkipThreshold = 0;

  bool shouldSkip(const MachineBasicBlock &From,
                  const MachineBasicBlock &To) const;

  bool skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB);

  void kill(MachineInstr &MI);

  MachineBasicBlock *insertSkipBlock(MachineBasicBlock &MBB,
                                     MachineBasicBlock::iterator I) const;

  bool skipMaskBranch(MachineInstr &MI, MachineBasicBlock &MBB);

  bool optimizeVccBranch(MachineInstr &MI) const;

public:
  static char ID;

  SIInsertSkips() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override {
    return "SI insert s_cbranch_execz instructions";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // end anonymous namespace

char SIInsertSkips::ID = 0;

INITIALIZE_PASS(SIInsertSkips, DEBUG_TYPE,
                "SI insert s_cbranch_execz instructions", false, false)

char &llvm::SIInsertSkipsPassID = SIInsertSkips::ID;

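// Returns true if the opcode is a meta-instruction that expands to no real
// machine code, and so should not count against the skip threshold.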
static bool opcodeEmitsNoInsts(unsigned Opc) {
  switch (Opc) {
  case TargetOpcode::IMPLICIT_DEF:
  case TargetOpcode::KILL:
  case TargetOpcode::BUNDLE:
  case TargetOpcode::CFI_INSTRUCTION:
  case TargetOpcode::EH_LABEL:
  case TargetOpcode::GC_LABEL:
  case TargetOpcode::DBG_VALUE:
    return true;
  default:
    return false;
  }
}

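// Scan the blocks from From up to (but not including) To in layout order and
// decide whether a skip branch over them is worthwhile: either the region is
// long enough to exceed the threshold, or it contains an instruction that
// must not execute (or branch) when the exec mask is zero.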
bool SIInsertSkips::shouldSkip(const MachineBasicBlock &From,
                               const MachineBasicBlock &To) const {
  if (From.succ_empty())
    return false;

  unsigned NumInstr = 0;
  const MachineFunction *MF = From.getParent();

  for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
       MBBI != End && MBBI != ToI; ++MBBI) {
    const MachineBasicBlock &MBB = *MBBI;

    for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end();
         NumInstr < SkipThreshold && I != E; ++I) {
      if (opcodeEmitsNoInsts(I->getOpcode()))
        continue;

      // FIXME: Since this is required for correctness, this should be inserted
      // during SILowerControlFlow.

      // When a uniform loop is inside non-uniform control flow, the branch
      // leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken
      // when EXEC = 0. We should skip the loop lest it becomes infinite.
      if (I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ ||
          I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ)
        return true;

      if (TII->hasUnwantedEffectsWhenEXECEmpty(*I))
        return true;

      ++NumInstr;
      if (NumInstr >= SkipThreshold)
        return true;
    }
  }

  return false;
}

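// For pixel shaders, insert an early-exit block after MI's block that exports
// to the null target and terminates the wavefront, reached only when the exec
// mask is zero (all lanes killed). Returns true if the block was inserted.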
bool SIInsertSkips::skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB) {
  MachineBasicBlock &MBB = *MI.getParent();
  MachineFunction *MF = MBB.getParent();

  if (MF->getFunction().getCallingConv() != CallingConv::AMDGPU_PS ||
      !shouldSkip(MBB, MBB.getParent()->back()))
    return false;

  MachineBasicBlock *SkipBB = insertSkipBlock(MBB, MI.getIterator());

  const DebugLoc &DL = MI.getDebugLoc();

  // If the exec mask is non-zero, skip the next two instructions
  BuildMI(&MBB, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
    .addMBB(&NextBB);

  MachineBasicBlock::iterator Insert = SkipBB->begin();

  // Exec mask is zero: Export to NULL target...
  BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::EXP_DONE))
    .addImm(0x09)  // V_008DFC_SQ_EXP_NULL
    .addReg(AMDGPU::VGPR0, RegState::Undef)
    .addReg(AMDGPU::VGPR0, RegState::Undef)
    .addReg(AMDGPU::VGPR0, RegState::Undef)
    .addReg(AMDGPU::VGPR0, RegState::Undef)
    .addImm(1)  // vm
    .addImm(0)  // compr
    .addImm(0); // en

  // ... and terminate wavefront.
  BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM));

  return true;
}

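// Lower a SI_KILL_*_TERMINATOR pseudo into either a V_CMPX compare that
// updates the exec mask directly, or an S_MOV/S_AND/S_ANDN2 that clears the
// killed lanes from EXEC.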
void SIInsertSkips::kill(MachineInstr &MI) {
  MachineBasicBlock &MBB = *MI.getParent();
  DebugLoc DL = MI.getDebugLoc();

  switch (MI.getOpcode()) {
  case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR: {
    unsigned Opcode = 0;

    // The opcodes are inverted because the inline immediate has to be
    // the first operand, e.g. from "x < imm" to "imm > x"
    switch (MI.getOperand(2).getImm()) {
    case ISD::SETOEQ:
    case ISD::SETEQ:
      Opcode = AMDGPU::V_CMPX_EQ_F32_e64;
      break;
    case ISD::SETOGT:
    case ISD::SETGT:
      Opcode = AMDGPU::V_CMPX_LT_F32_e64;
      break;
    case ISD::SETOGE:
    case ISD::SETGE:
      Opcode = AMDGPU::V_CMPX_LE_F32_e64;
      break;
    case ISD::SETOLT:
    case ISD::SETLT:
      Opcode = AMDGPU::V_CMPX_GT_F32_e64;
      break;
    case ISD::SETOLE:
    case ISD::SETLE:
      Opcode = AMDGPU::V_CMPX_GE_F32_e64;
      break;
    case ISD::SETONE:
    case ISD::SETNE:
      Opcode = AMDGPU::V_CMPX_LG_F32_e64;
      break;
    case ISD::SETO:
      Opcode = AMDGPU::V_CMPX_O_F32_e64;
      break;
    case ISD::SETUO:
      Opcode = AMDGPU::V_CMPX_U_F32_e64;
      break;
    case ISD::SETUEQ:
      Opcode = AMDGPU::V_CMPX_NLG_F32_e64;
      break;
    case ISD::SETUGT:
      Opcode = AMDGPU::V_CMPX_NGE_F32_e64;
      break;
    case ISD::SETUGE:
      Opcode = AMDGPU::V_CMPX_NGT_F32_e64;
      break;
    case ISD::SETULT:
      Opcode = AMDGPU::V_CMPX_NLE_F32_e64;
      break;
    case ISD::SETULE:
      Opcode = AMDGPU::V_CMPX_NLT_F32_e64;
      break;
    case ISD::SETUNE:
      Opcode = AMDGPU::V_CMPX_NEQ_F32_e64;
      break;
    default:
      llvm_unreachable("invalid ISD:SET cond code");
    }

    assert(MI.getOperand(0).isReg());

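    // V_CMPX writes EXEC directly. The compact e32 encoding requires src1 to
    // be a VGPR, so it is only usable when the killed value is in a VGPR;
    // otherwise fall back to the e64 encoding with explicit modifiers.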
    if (TRI->isVGPR(MBB.getParent()->getRegInfo(),
                    MI.getOperand(0).getReg())) {
      Opcode = AMDGPU::getVOPe32(Opcode);
      BuildMI(MBB, &MI, DL, TII->get(Opcode))
          .add(MI.getOperand(1))
          .add(MI.getOperand(0));
    } else {
      BuildMI(MBB, &MI, DL, TII->get(Opcode))
          .addReg(AMDGPU::VCC, RegState::Define)
          .addImm(0)  // src0 modifiers
          .add(MI.getOperand(1))
          .addImm(0)  // src1 modifiers
          .add(MI.getOperand(0))
          .addImm(0); // omod
    }
    break;
  }
  case AMDGPU::SI_KILL_I1_TERMINATOR: {
    const MachineOperand &Op = MI.getOperand(0);
    int64_t KillVal = MI.getOperand(1).getImm();
    assert(KillVal == 0 || KillVal == -1);

    // Kill all threads if Op0 is an immediate and equal to the Kill value.
    if (Op.isImm()) {
      int64_t Imm = Op.getImm();
      assert(Imm == 0 || Imm == -1);

      if (Imm == KillVal)
        BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
          .addImm(0);
      break;
    }

    unsigned Opcode = KillVal ? AMDGPU::S_ANDN2_B64 : AMDGPU::S_AND_B64;
    BuildMI(MBB, &MI, DL, TII->get(Opcode), AMDGPU::EXEC)
        .addReg(AMDGPU::EXEC)
        .add(Op);
    break;
  }
  default:
    llvm_unreachable("invalid opcode, expected SI_KILL_*_TERMINATOR");
  }
}

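// Create a new, initially empty basic block immediately after MBB in layout
// order and record it as a successor of MBB.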
MachineBasicBlock *SIInsertSkips::insertSkipBlock(
    MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const {
  MachineFunction *MF = MBB.getParent();

  MachineBasicBlock *SkipBB = MF->CreateMachineBasicBlock();
  MachineFunction::iterator MBBI(MBB);
  ++MBBI;

  MF->insert(MBBI, SkipBB);
  MBB.addSuccessor(SkipBB);

  return SkipBB;
}

// Returns true if a branch over the block was inserted.
bool SIInsertSkips::skipMaskBranch(MachineInstr &MI,
                                   MachineBasicBlock &SrcMBB) {
  MachineBasicBlock *DestBB = MI.getOperand(0).getMBB();

  if (!shouldSkip(**SrcMBB.succ_begin(), *DestBB))
    return false;

  const DebugLoc &DL = MI.getDebugLoc();
  MachineBasicBlock::iterator InsPt = std::next(MI.getIterator());

  BuildMI(SrcMBB, InsPt, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
    .addMBB(DestBB);

  return true;
}

bool SIInsertSkips::optimizeVccBranch(MachineInstr &MI) const {
  // Match:
  // sreg = -1
  // vcc = S_AND_B64 exec, sreg
  // S_CBRANCH_VCC[N]Z
  // =>
  // S_CBRANCH_EXEC[N]Z
  bool Changed = false;
  MachineBasicBlock &MBB = *MI.getParent();
  const unsigned CondReg = AMDGPU::VCC;
  const unsigned ExecReg = AMDGPU::EXEC;
  const unsigned And = AMDGPU::S_AND_B64;

  MachineBasicBlock::reverse_iterator A = MI.getReverseIterator(),
                                      E = MBB.rend();
  bool ReadsCond = false;
  unsigned Threshold = 5;
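  // Walk backwards from the branch looking for the S_AND_B64 that defines
  // VCC, giving up after a few instructions, if EXEC is clobbered in between,
  // or if VCC is redefined by anything other than a full S_AND_B64 def.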
  for (++A ; A != E ; ++A) {
    if (!--Threshold)
      return false;
    if (A->modifiesRegister(ExecReg, TRI))
      return false;
    if (A->modifiesRegister(CondReg, TRI)) {
      if (!A->definesRegister(CondReg, TRI) || A->getOpcode() != And)
        return false;
      break;
    }
    ReadsCond |= A->readsRegister(CondReg, TRI);
  }
  if (A == E)
    return false;

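  // Canonicalize the AND so that EXEC is the first source operand, and bail
  // out if the first operand isn't EXEC or the second operand is an immediate
  // other than all-ones.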
  MachineOperand &Op1 = A->getOperand(1);
  MachineOperand &Op2 = A->getOperand(2);
  if (Op1.getReg() != ExecReg && Op2.isReg() && Op2.getReg() == ExecReg) {
    TII->commuteInstruction(*A);
    Changed = true;
  }
  if (Op1.getReg() != ExecReg)
    return Changed;
  if (Op2.isImm() && Op2.getImm() != -1)
    return Changed;

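  // When the second operand is a register, it must be defined by a move of
  // the -1 immediate that reaches the AND unmodified.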
  unsigned SReg = AMDGPU::NoRegister;
  if (Op2.isReg()) {
    SReg = Op2.getReg();
    auto M = std::next(A);
    bool ReadsSreg = false;
    for ( ; M != E ; ++M) {
      if (M->definesRegister(SReg, TRI))
        break;
      if (M->modifiesRegister(SReg, TRI))
        return Changed;
      ReadsSreg |= M->readsRegister(SReg, TRI);
    }
    if (M == E ||
        !M->isMoveImmediate() ||
        !M->getOperand(1).isImm() ||
        M->getOperand(1).getImm() != -1)
      return Changed;
    // If SReg is only used by the AND, fold the immediate into the AND and
    // delete the move.
    if (!ReadsSreg && Op2.isKill()) {
      A->getOperand(2).ChangeToImmediate(-1);
      M->eraseFromParent();
    }
  }

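  // The AND itself is dead if nothing in between reads VCC, its SCC def is
  // dead, and the branch kills VCC; remove it.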
  if (!ReadsCond && A->registerDefIsDead(AMDGPU::SCC) &&
      MI.killsRegister(CondReg, TRI))
    A->eraseFromParent();

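  // Rewrite the branch. When the AND source was EXEC itself, VCC mirrors
  // EXEC: a VCCZ branch is treated as never taken and a VCCNZ branch becomes
  // unconditional. Otherwise the branch simply tests EXEC directly.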
  bool IsVCCZ = MI.getOpcode() == AMDGPU::S_CBRANCH_VCCZ;
  if (SReg == ExecReg) {
    if (IsVCCZ) {
      MI.eraseFromParent();
      return true;
    }
    MI.setDesc(TII->get(AMDGPU::S_BRANCH));
  } else {
    MI.setDesc(TII->get(IsVCCZ ? AMDGPU::S_CBRANCH_EXECZ
                               : AMDGPU::S_CBRANCH_EXECNZ));
  }

  MI.RemoveOperand(MI.findRegisterUseOperandIdx(CondReg, false /*Kill*/, TRI));
  MI.addImplicitDefUseOperands(*MBB.getParent());

  return true;
}

bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();
  SkipThreshold = SkipThresholdFlag;

  bool HaveKill = false;
  bool MadeChange = false;

  // Track depth of exec mask, divergent branches.
  SmallVector<MachineBasicBlock *, 16> ExecBranchStack;

  MachineFunction::iterator NextBB;

  MachineBasicBlock *EmptyMBBAtEnd = nullptr;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
       BI != BE; BI = NextBB) {
    NextBB = std::next(BI);
    MachineBasicBlock &MBB = *BI;
    bool HaveSkipBlock = false;

    if (!ExecBranchStack.empty() && ExecBranchStack.back() == &MBB) {
      // Reached convergence point for last divergent branch.
      ExecBranchStack.pop_back();
    }

    if (HaveKill && ExecBranchStack.empty()) {
      HaveKill = false;

      // TODO: Insert skip if exec is 0?
    }

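    // Visit each instruction, lowering the mask-branch and kill pseudos and
    // cleaning up redundant branches as we go.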
    MachineBasicBlock::iterator I, Next;
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);

      MachineInstr &MI = *I;

      switch (MI.getOpcode()) {
      case AMDGPU::SI_MASK_BRANCH:
        ExecBranchStack.push_back(MI.getOperand(0).getMBB());
        MadeChange |= skipMaskBranch(MI, MBB);
        break;

      case AMDGPU::S_BRANCH:
        // Optimize out branches to the next block.
        // FIXME: Shouldn't this be handled by BranchFolding?
        if (MBB.isLayoutSuccessor(MI.getOperand(0).getMBB())) {
          MI.eraseFromParent();
        } else if (HaveSkipBlock) {
          // When a skip block has been inserted after the current block,
          // remove the unconditional branch so the S_CBRANCH_EXECNZ added by
          // skipIfDead is reachable; it jumps over the two instructions in
          // the skip block whenever the exec mask is non-zero.
          MI.eraseFromParent();
        }
        break;

      case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
      case AMDGPU::SI_KILL_I1_TERMINATOR:
        MadeChange = true;
        kill(MI);

        if (ExecBranchStack.empty()) {
          if (NextBB != BE && skipIfDead(MI, *NextBB)) {
            HaveSkipBlock = true;
            NextBB = std::next(BI);
            BE = MF.end();
          }
        } else {
          HaveKill = true;
        }

        MI.eraseFromParent();
        break;

      case AMDGPU::SI_RETURN_TO_EPILOG:
        // FIXME: Should move somewhere else
        assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());

        // Graphics shaders returning non-void shouldn't contain S_ENDPGM,
        // because external bytecode will be appended at the end.
        if (BI != --MF.end() || I != MBB.getFirstTerminator()) {
          // SI_RETURN_TO_EPILOG is not the last instruction. Add an empty
          // block at the end and jump there.
          if (!EmptyMBBAtEnd) {
            EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
            MF.insert(MF.end(), EmptyMBBAtEnd);
          }

          MBB.addSuccessor(EmptyMBBAtEnd);
          BuildMI(*BI, I, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
            .addMBB(EmptyMBBAtEnd);
          I->eraseFromParent();
        }
        break;

      case AMDGPU::S_CBRANCH_VCCZ:
      case AMDGPU::S_CBRANCH_VCCNZ:
        MadeChange |= optimizeVccBranch(MI);
        break;

      default:
        break;
      }
    }
  }

  return MadeChange;
}