//===-- SIPreEmitPeephole.cpp ---------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass performs peephole optimizations just before code emission:
/// rewriting VCC branches whose condition is a constant mask combined with
/// EXEC into EXEC branches, and removing redundant S_SET_GPR_IDX_ON
/// instructions.
///
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/Support/CommandLine.h"

using namespace llvm;

#define DEBUG_TYPE "si-pre-emit-peephole"

namespace {

class SIPreEmitPeephole : public MachineFunctionPass {
private:
  const SIInstrInfo *TII = nullptr;
  const SIRegisterInfo *TRI = nullptr;

  bool optimizeVccBranch(MachineInstr &MI) const;
  bool optimizeSetGPR(MachineInstr &First, MachineInstr &MI) const;

public:
  static char ID;

  SIPreEmitPeephole() : MachineFunctionPass(ID) {
    initializeSIPreEmitPeepholePass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;
};

} // End anonymous namespace.

INITIALIZE_PASS(SIPreEmitPeephole, DEBUG_TYPE,
                "SI peephole optimizations", false, false)

char SIPreEmitPeephole::ID = 0;

char &llvm::SIPreEmitPeepholeID = SIPreEmitPeephole::ID;

bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const {
  // Match:
  // sreg = -1 or 0
  // vcc = S_AND_B64 exec, sreg or S_ANDN2_B64 exec, sreg
  // S_CBRANCH_VCC[N]Z
  // =>
  // S_CBRANCH_EXEC[N]Z
  // We sometimes end up with this pattern after basic block placement, when
  // a block that assigns -1 or 0 to a saved mask is combined with a block
  // that consumes the saved mask and then branches on it.
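  //
  // For example (illustrative wave64 assembly; registers are arbitrary):
  //   s_mov_b64 s[2:3], -1
  //   s_and_b64 vcc, exec, s[2:3]
  //   s_cbranch_vccnz BB1
  // Here vcc = exec & -1 = exec, so the branch is equivalent to
  //   s_cbranch_execnz BB1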
  bool Changed = false;
  MachineBasicBlock &MBB = *MI.getParent();
  const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
  const bool IsWave32 = ST.isWave32();
  const unsigned CondReg = TRI->getVCC();
  const unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  const unsigned And = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
  const unsigned AndN2 = IsWave32 ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_ANDN2_B64;

  MachineBasicBlock::reverse_iterator A = MI.getReverseIterator(),
                                      E = MBB.rend();
  bool ReadsCond = false;
  unsigned Threshold = 5;
  for (++A; A != E; ++A) {
    if (!--Threshold)
      return false;
    if (A->modifiesRegister(ExecReg, TRI))
      return false;
    if (A->modifiesRegister(CondReg, TRI)) {
      if (!A->definesRegister(CondReg, TRI) ||
          (A->getOpcode() != And && A->getOpcode() != AndN2))
        return false;
      break;
    }
    ReadsCond |= A->readsRegister(CondReg, TRI);
  }
  if (A == E)
    return false;

  MachineOperand &Op1 = A->getOperand(1);
  MachineOperand &Op2 = A->getOperand(2);
  if (Op1.getReg() != ExecReg && Op2.isReg() && Op2.getReg() == ExecReg) {
    TII->commuteInstruction(*A);
    Changed = true;
  }
  if (Op1.getReg() != ExecReg)
    return Changed;
  if (Op2.isImm() && !(Op2.getImm() == -1 || Op2.getImm() == 0))
    return Changed;

  int64_t MaskValue = 0;
  Register SReg;
  if (Op2.isReg()) {
    SReg = Op2.getReg();
    auto M = std::next(A);
    bool ReadsSreg = false;
    for (; M != E; ++M) {
      if (M->definesRegister(SReg, TRI))
        break;
      if (M->modifiesRegister(SReg, TRI))
        return Changed;
      ReadsSreg |= M->readsRegister(SReg, TRI);
    }
    if (M == E || !M->isMoveImmediate() || !M->getOperand(1).isImm() ||
        (M->getOperand(1).getImm() != -1 && M->getOperand(1).getImm() != 0))
      return Changed;
    MaskValue = M->getOperand(1).getImm();
    // If SReg is only used by the AND instruction, fold the immediate into
    // the AND and erase the move.
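    // (Illustrative: "s_mov_b64 s[2:3], -1" followed by
    // "s_and_b64 vcc, exec, s[2:3]" becomes "s_and_b64 vcc, exec, -1".)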
    if (!ReadsSreg && Op2.isKill()) {
      A->getOperand(2).ChangeToImmediate(MaskValue);
      M->eraseFromParent();
    }
  } else if (Op2.isImm()) {
    MaskValue = Op2.getImm();
  } else {
    llvm_unreachable("Op2 must be register or immediate");
  }

  // Invert mask for s_andn2
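  // (S_ANDN2 computes exec & ~SReg, so a mask of 0 behaves like a mask of -1
  // fed to S_AND, and vice versa.)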
  assert(MaskValue == 0 || MaskValue == -1);
  if (A->getOpcode() == AndN2)
    MaskValue = ~MaskValue;

  if (!ReadsCond && A->registerDefIsDead(AMDGPU::SCC) &&
      MI.killsRegister(CondReg, TRI))
    A->eraseFromParent();

  bool IsVCCZ = MI.getOpcode() == AMDGPU::S_CBRANCH_VCCZ;
  if (SReg == ExecReg) {
    // EXEC is updated directly
    if (IsVCCZ) {
      MI.eraseFromParent();
      return true;
    }
    MI.setDesc(TII->get(AMDGPU::S_BRANCH));
  } else if (IsVCCZ && MaskValue == 0) {
    // Will always branch
    // Remove all successors shadowed by the new unconditional branch
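    // (Illustrative: once "s_cbranch_vccz BB1" is known to be always taken
    // and is rewritten to "s_branch BB1", any later branch terminators in
    // this block and the fall-through edge become unreachable and are
    // deleted below.)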
    MachineBasicBlock *Parent = MI.getParent();
    SmallVector<MachineInstr *, 4> ToRemove;
    bool Found = false;
    for (MachineInstr &Term : Parent->terminators()) {
      if (Found) {
        if (Term.isBranch())
          ToRemove.push_back(&Term);
      } else {
        Found = Term.isIdenticalTo(MI);
      }
    }
    assert(Found && "conditional branch is not terminator");
    for (MachineInstr *BranchMI : ToRemove) {
      MachineOperand &Dst = BranchMI->getOperand(0);
      assert(Dst.isMBB() && "destination is not basic block");
      Parent->removeSuccessor(Dst.getMBB());
      BranchMI->eraseFromParent();
    }

    if (MachineBasicBlock *Succ = Parent->getFallThrough()) {
      Parent->removeSuccessor(Succ);
    }

    // Rewrite to unconditional branch
    MI.setDesc(TII->get(AMDGPU::S_BRANCH));
  } else if (!IsVCCZ && MaskValue == 0) {
    // Will never branch
    MachineOperand &Dst = MI.getOperand(0);
    assert(Dst.isMBB() && "destination is not basic block");
    MI.getParent()->removeSuccessor(Dst.getMBB());
    MI.eraseFromParent();
    return true;
  } else if (MaskValue == -1) {
    // Depends only on EXEC
    MI.setDesc(
        TII->get(IsVCCZ ? AMDGPU::S_CBRANCH_EXECZ : AMDGPU::S_CBRANCH_EXECNZ));
  }

  MI.RemoveOperand(MI.findRegisterUseOperandIdx(CondReg, false /*Kill*/, TRI));
  MI.addImplicitDefUseOperands(*MBB.getParent());

  return true;
}

bool SIPreEmitPeephole::optimizeSetGPR(MachineInstr &First,
                                       MachineInstr &MI) const {
  MachineBasicBlock &MBB = *MI.getParent();
  const MachineFunction &MF = *MBB.getParent();
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  Register IdxReg = Idx->isReg() ? Idx->getReg() : Register();
  SmallVector<MachineInstr *, 4> ToRemove;
  bool IdxOn = true;

  if (!MI.isIdenticalTo(First))
    return false;

  // The second S_SET_GPR_IDX_ON is redundant if nothing between the two
  // instructions switches the index mode or clobbers M0 or the index
  // register, so scan the instructions in between.
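  // (Illustrative: in
  //    s_set_gpr_idx_on s0, gpr_idx(SRC0)
  //    v_mov_b32_e32 v1, v5
  //    s_set_gpr_idx_off
  //    s_set_gpr_idx_on s0, gpr_idx(SRC0)
  //    v_mov_b32_e32 v2, v6
  //  the s_set_gpr_idx_off and the second s_set_gpr_idx_on can be removed,
  //  keeping a single index mode live across both moves.)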
  for (MachineBasicBlock::iterator I = std::next(First.getIterator()),
                                   E = MI.getIterator();
       I != E; ++I) {
    switch (I->getOpcode()) {
    case AMDGPU::S_SET_GPR_IDX_MODE:
      return false;
    case AMDGPU::S_SET_GPR_IDX_OFF:
      IdxOn = false;
      ToRemove.push_back(&*I);
      break;
    default:
      if (I->modifiesRegister(AMDGPU::M0, TRI))
        return false;
      if (IdxReg && I->modifiesRegister(IdxReg, TRI))
        return false;
      if (llvm::any_of(I->operands(),
                       [&MRI, this](const MachineOperand &MO) {
                         return MO.isReg() &&
                                TRI->isVectorRegister(MRI, MO.getReg());
                       })) {
        // The only exception allowed here is another indirect vector move
        // with the same mode.
        if (!IdxOn ||
            !((I->getOpcode() == AMDGPU::V_MOV_B32_e32 &&
               I->hasRegisterImplicitUseOperand(AMDGPU::M0)) ||
              I->getOpcode() == AMDGPU::V_MOV_B32_indirect))
          return false;
      }
    }
  }

  MI.eraseFromParent();
  for (MachineInstr *RI : ToRemove)
    RI->eraseFromParent();
  return true;
}

bool SIPreEmitPeephole::runOnMachineFunction(MachineFunction &MF) {
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  TII = ST.getInstrInfo();
  TRI = &TII->getRegisterInfo();
  MachineBasicBlock *EmptyMBBAtEnd = nullptr;
  bool Changed = false;

  for (MachineBasicBlock &MBB : MF) {
    MachineBasicBlock::iterator MBBE = MBB.getFirstTerminator();
    if (MBBE != MBB.end()) {
      MachineInstr &MI = *MBBE;
      switch (MI.getOpcode()) {
      case AMDGPU::S_CBRANCH_VCCZ:
      case AMDGPU::S_CBRANCH_VCCNZ:
        Changed |= optimizeVccBranch(MI);
        continue;
      case AMDGPU::SI_RETURN_TO_EPILOG:
        // FIXME: This is not an optimization and should be
        // moved somewhere else.
        assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());

        // Graphics shaders returning non-void shouldn't contain S_ENDPGM,
        // because external bytecode will be appended at the end.
        if (&MBB != &MF.back() || &MI != &MBB.back()) {
          // SI_RETURN_TO_EPILOG is not the last instruction. Add an empty
          // block at the end and jump there.
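          // (Illustrative: the mid-function SI_RETURN_TO_EPILOG becomes an
          // "s_branch" to a shared empty block appended at the end of the
          // function.)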
          if (!EmptyMBBAtEnd) {
            EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
            MF.insert(MF.end(), EmptyMBBAtEnd);
          }

          MBB.addSuccessor(EmptyMBBAtEnd);
          BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
              .addMBB(EmptyMBBAtEnd);
          MI.eraseFromParent();
          MBBE = MBB.getFirstTerminator();
        }
        break;
      default:
        break;
      }
    }

    if (!ST.hasVGPRIndexMode())
      continue;

    MachineInstr *SetGPRMI = nullptr;
    const unsigned Threshold = 20;
    unsigned Count = 0;
    // Scan the block for pairs of S_SET_GPR_IDX_ON instructions where the
    // second one may be redundant. The expensive checks are done in
    // optimizeSetGPR(); the distance between the two is capped at Threshold
    // (20) instructions to keep compile time down.
    for (MachineBasicBlock::iterator MBBI = MBB.begin(); MBBI != MBBE;) {
      MachineInstr &MI = *MBBI;
      ++MBBI;

      if (Count == Threshold)
        SetGPRMI = nullptr;
      else
        ++Count;

      if (MI.getOpcode() != AMDGPU::S_SET_GPR_IDX_ON)
        continue;

      Count = 0;
      if (!SetGPRMI) {
        SetGPRMI = &MI;
        continue;
      }

      if (optimizeSetGPR(*SetGPRMI, MI))
        Changed = true;
      else
        SetGPRMI = &MI;
    }
  }

  return Changed;
}