//===-- SIPreEmitPeephole.cpp ------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This pass performs the peephole optimizations before code emission.
///
//===----------------------------------------------------------------------===//
13
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/Support/CommandLine.h"
21
22 using namespace llvm;
23
24 #define DEBUG_TYPE "si-pre-emit-peephole"
25
26 namespace {
27
28 class SIPreEmitPeephole : public MachineFunctionPass {
29 private:
30 const SIInstrInfo *TII = nullptr;
31 const SIRegisterInfo *TRI = nullptr;
32
33 bool optimizeVccBranch(MachineInstr &MI) const;
34 bool optimizeSetGPR(MachineInstr &First, MachineInstr &MI) const;
35
36 public:
37 static char ID;
38
SIPreEmitPeephole()39 SIPreEmitPeephole() : MachineFunctionPass(ID) {
40 initializeSIPreEmitPeepholePass(*PassRegistry::getPassRegistry());
41 }
42
43 bool runOnMachineFunction(MachineFunction &MF) override;
44 };
45
46 } // End anonymous namespace.
47
48 INITIALIZE_PASS(SIPreEmitPeephole, DEBUG_TYPE,
49 "SI peephole optimizations", false, false)
50
51 char SIPreEmitPeephole::ID = 0;
52
53 char &llvm::SIPreEmitPeepholeID = SIPreEmitPeephole::ID;
54
optimizeVccBranch(MachineInstr & MI) const55 bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const {
56 // Match:
57 // sreg = -1 or 0
58 // vcc = S_AND_B64 exec, sreg or S_ANDN2_B64 exec, sreg
59 // S_CBRANCH_VCC[N]Z
60 // =>
61 // S_CBRANCH_EXEC[N]Z
62 // We end up with this pattern sometimes after basic block placement.
63 // It happens while combining a block which assigns -1 or 0 to a saved mask
64 // and another block which consumes that saved mask and then a branch.
65 bool Changed = false;
66 MachineBasicBlock &MBB = *MI.getParent();
67 const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
68 const bool IsWave32 = ST.isWave32();
69 const unsigned CondReg = TRI->getVCC();
70 const unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
71 const unsigned And = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
72 const unsigned AndN2 = IsWave32 ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_ANDN2_B64;
73
74 MachineBasicBlock::reverse_iterator A = MI.getReverseIterator(),
75 E = MBB.rend();
76 bool ReadsCond = false;
77 unsigned Threshold = 5;
78 for (++A; A != E; ++A) {
79 if (!--Threshold)
80 return false;
81 if (A->modifiesRegister(ExecReg, TRI))
82 return false;
83 if (A->modifiesRegister(CondReg, TRI)) {
84 if (!A->definesRegister(CondReg, TRI) ||
85 (A->getOpcode() != And && A->getOpcode() != AndN2))
86 return false;
87 break;
88 }
89 ReadsCond |= A->readsRegister(CondReg, TRI);
90 }
91 if (A == E)
92 return false;
93
94 MachineOperand &Op1 = A->getOperand(1);
95 MachineOperand &Op2 = A->getOperand(2);
96 if (Op1.getReg() != ExecReg && Op2.isReg() && Op2.getReg() == ExecReg) {
97 TII->commuteInstruction(*A);
98 Changed = true;
99 }
100 if (Op1.getReg() != ExecReg)
101 return Changed;
102 if (Op2.isImm() && !(Op2.getImm() == -1 || Op2.getImm() == 0))
103 return Changed;
104
105 int64_t MaskValue = 0;
106 Register SReg;
107 if (Op2.isReg()) {
108 SReg = Op2.getReg();
109 auto M = std::next(A);
110 bool ReadsSreg = false;
111 for (; M != E; ++M) {
112 if (M->definesRegister(SReg, TRI))
113 break;
114 if (M->modifiesRegister(SReg, TRI))
115 return Changed;
116 ReadsSreg |= M->readsRegister(SReg, TRI);
117 }
118 if (M == E || !M->isMoveImmediate() || !M->getOperand(1).isImm() ||
119 (M->getOperand(1).getImm() != -1 && M->getOperand(1).getImm() != 0))
120 return Changed;
121 MaskValue = M->getOperand(1).getImm();
122 // First if sreg is only used in the AND instruction fold the immediate
123 // into into the AND.
124 if (!ReadsSreg && Op2.isKill()) {
125 A->getOperand(2).ChangeToImmediate(MaskValue);
126 M->eraseFromParent();
127 }
128 } else if (Op2.isImm()) {
129 MaskValue = Op2.getImm();
130 } else {
131 llvm_unreachable("Op2 must be register or immediate");
132 }
133
134 // Invert mask for s_andn2
135 assert(MaskValue == 0 || MaskValue == -1);
136 if (A->getOpcode() == AndN2)
137 MaskValue = ~MaskValue;
138
139 if (!ReadsCond && A->registerDefIsDead(AMDGPU::SCC) &&
140 MI.killsRegister(CondReg, TRI))
141 A->eraseFromParent();
142
143 bool IsVCCZ = MI.getOpcode() == AMDGPU::S_CBRANCH_VCCZ;
144 if (SReg == ExecReg) {
145 // EXEC is updated directly
146 if (IsVCCZ) {
147 MI.eraseFromParent();
148 return true;
149 }
150 MI.setDesc(TII->get(AMDGPU::S_BRANCH));
151 } else if (IsVCCZ && MaskValue == 0) {
152 // Will always branch
153 // Remove all succesors shadowed by new unconditional branch
154 MachineBasicBlock *Parent = MI.getParent();
155 SmallVector<MachineInstr *, 4> ToRemove;
156 bool Found = false;
157 for (MachineInstr &Term : Parent->terminators()) {
158 if (Found) {
159 if (Term.isBranch())
160 ToRemove.push_back(&Term);
161 } else {
162 Found = Term.isIdenticalTo(MI);
163 }
164 }
165 assert(Found && "conditional branch is not terminator");
166 for (auto BranchMI : ToRemove) {
167 MachineOperand &Dst = BranchMI->getOperand(0);
168 assert(Dst.isMBB() && "destination is not basic block");
169 Parent->removeSuccessor(Dst.getMBB());
170 BranchMI->eraseFromParent();
171 }
172
173 if (MachineBasicBlock *Succ = Parent->getFallThrough()) {
174 Parent->removeSuccessor(Succ);
175 }
176
177 // Rewrite to unconditional branch
178 MI.setDesc(TII->get(AMDGPU::S_BRANCH));
179 } else if (!IsVCCZ && MaskValue == 0) {
180 // Will never branch
181 MachineOperand &Dst = MI.getOperand(0);
182 assert(Dst.isMBB() && "destination is not basic block");
183 MI.getParent()->removeSuccessor(Dst.getMBB());
184 MI.eraseFromParent();
185 return true;
186 } else if (MaskValue == -1) {
187 // Depends only on EXEC
188 MI.setDesc(
189 TII->get(IsVCCZ ? AMDGPU::S_CBRANCH_EXECZ : AMDGPU::S_CBRANCH_EXECNZ));
190 }
191
192 MI.RemoveOperand(MI.findRegisterUseOperandIdx(CondReg, false /*Kill*/, TRI));
193 MI.addImplicitDefUseOperands(*MBB.getParent());
194
195 return true;
196 }
197
optimizeSetGPR(MachineInstr & First,MachineInstr & MI) const198 bool SIPreEmitPeephole::optimizeSetGPR(MachineInstr &First,
199 MachineInstr &MI) const {
200 MachineBasicBlock &MBB = *MI.getParent();
201 const MachineFunction &MF = *MBB.getParent();
202 const MachineRegisterInfo &MRI = MF.getRegInfo();
203 MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
204 Register IdxReg = Idx->isReg() ? Idx->getReg() : Register();
205 SmallVector<MachineInstr *, 4> ToRemove;
206 bool IdxOn = true;
207
208 if (!MI.isIdenticalTo(First))
209 return false;
210
211 // Scan back to find an identical S_SET_GPR_IDX_ON
212 for (MachineBasicBlock::iterator I = std::next(First.getIterator()),
213 E = MI.getIterator(); I != E; ++I) {
214 switch (I->getOpcode()) {
215 case AMDGPU::S_SET_GPR_IDX_MODE:
216 return false;
217 case AMDGPU::S_SET_GPR_IDX_OFF:
218 IdxOn = false;
219 ToRemove.push_back(&*I);
220 break;
221 default:
222 if (I->modifiesRegister(AMDGPU::M0, TRI))
223 return false;
224 if (IdxReg && I->modifiesRegister(IdxReg, TRI))
225 return false;
226 if (llvm::any_of(I->operands(),
227 [&MRI, this](const MachineOperand &MO) {
228 return MO.isReg() &&
229 TRI->isVectorRegister(MRI, MO.getReg());
230 })) {
231 // The only exception allowed here is another indirect vector move
232 // with the same mode.
233 if (!IdxOn ||
234 !((I->getOpcode() == AMDGPU::V_MOV_B32_e32 &&
235 I->hasRegisterImplicitUseOperand(AMDGPU::M0)) ||
236 I->getOpcode() == AMDGPU::V_MOV_B32_indirect))
237 return false;
238 }
239 }
240 }
241
242 MI.eraseFromParent();
243 for (MachineInstr *RI : ToRemove)
244 RI->eraseFromParent();
245 return true;
246 }
247
runOnMachineFunction(MachineFunction & MF)248 bool SIPreEmitPeephole::runOnMachineFunction(MachineFunction &MF) {
249 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
250 TII = ST.getInstrInfo();
251 TRI = &TII->getRegisterInfo();
252 MachineBasicBlock *EmptyMBBAtEnd = nullptr;
253 bool Changed = false;
254
255 for (MachineBasicBlock &MBB : MF) {
256 MachineBasicBlock::iterator MBBE = MBB.getFirstTerminator();
257 if (MBBE != MBB.end()) {
258 MachineInstr &MI = *MBBE;
259 switch (MI.getOpcode()) {
260 case AMDGPU::S_CBRANCH_VCCZ:
261 case AMDGPU::S_CBRANCH_VCCNZ:
262 Changed |= optimizeVccBranch(MI);
263 continue;
264 case AMDGPU::SI_RETURN_TO_EPILOG:
265 // FIXME: This is not an optimization and should be
266 // moved somewhere else.
267 assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());
268
269 // Graphics shaders returning non-void shouldn't contain S_ENDPGM,
270 // because external bytecode will be appended at the end.
271 if (&MBB != &MF.back() || &MI != &MBB.back()) {
272 // SI_RETURN_TO_EPILOG is not the last instruction. Add an empty block
273 // at the end and jump there.
274 if (!EmptyMBBAtEnd) {
275 EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
276 MF.insert(MF.end(), EmptyMBBAtEnd);
277 }
278
279 MBB.addSuccessor(EmptyMBBAtEnd);
280 BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
281 .addMBB(EmptyMBBAtEnd);
282 MI.eraseFromParent();
283 MBBE = MBB.getFirstTerminator();
284 }
285 break;
286 default:
287 break;
288 }
289 }
290
291 if (!ST.hasVGPRIndexMode())
292 continue;
293
294 MachineInstr *SetGPRMI = nullptr;
295 const unsigned Threshold = 20;
296 unsigned Count = 0;
297 // Scan the block for two S_SET_GPR_IDX_ON instructions to see if a
298 // second is not needed. Do expensive checks in the optimizeSetGPR()
299 // and limit the distance to 20 instructions for compile time purposes.
300 for (MachineBasicBlock::iterator MBBI = MBB.begin(); MBBI != MBBE; ) {
301 MachineInstr &MI = *MBBI;
302 ++MBBI;
303
304 if (Count == Threshold)
305 SetGPRMI = nullptr;
306 else
307 ++Count;
308
309 if (MI.getOpcode() != AMDGPU::S_SET_GPR_IDX_ON)
310 continue;
311
312 Count = 0;
313 if (!SetGPRMI) {
314 SetGPRMI = &MI;
315 continue;
316 }
317
318 if (optimizeSetGPR(*SetGPRMI, MI))
319 Changed = true;
320 else
321 SetGPRMI = &MI;
322 }
323 }
324
325 return Changed;
326 }
327