1 //=======- GCNDPPCombine.cpp - optimization for DPP instructions ---==========//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 // The pass combines V_MOV_B32_dpp instruction with its VALU uses as a DPP src0
// operand. If any of the use instructions cannot be combined with the mov, the
10 // whole sequence is reverted.
11 //
12 // $old = ...
13 // $dpp_value = V_MOV_B32_dpp $old, $vgpr_to_be_read_from_other_lane,
14 //                            dpp_controls..., $row_mask, $bank_mask, $bound_ctrl
15 // $res = VALU $dpp_value [, src1]
16 //
17 // to
18 //
19 // $res = VALU_DPP $combined_old, $vgpr_to_be_read_from_other_lane, [src1,]
20 //                 dpp_controls..., $row_mask, $bank_mask, $combined_bound_ctrl
21 //
22 // Combining rules :
23 //
24 // if $row_mask and $bank_mask are fully enabled (0xF) and
25 //    $bound_ctrl==DPP_BOUND_ZERO or $old==0
26 // -> $combined_old = undef,
27 //    $combined_bound_ctrl = DPP_BOUND_ZERO
28 //
29 // if the VALU op is binary and
30 //    $bound_ctrl==DPP_BOUND_OFF and
31 //    $old==identity value (immediate) for the VALU op
32 // -> $combined_old = src1,
33 //    $combined_bound_ctrl = DPP_BOUND_OFF
34 //
35 // Otherwise cancel.
36 //
37 // The mov_dpp instruction should reside in the same BB as all its uses
38 //===----------------------------------------------------------------------===//
39 
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/Pass.h"
#include <cassert>
#include <limits>
56 
using namespace llvm;

#define DEBUG_TYPE "gcn-dpp-combine"

// Counts V_MOV_B32_dpp instructions successfully folded into their users
// (reported with -stats).
STATISTIC(NumDPPMovsCombined, "Number of DPP moves combined.");
62 
namespace {

// Machine pass that folds V_MOV_B32_dpp instructions into their VALU users
// by rewriting each user to the DPP form of its opcode. See the file header
// for the exact combining rules. Requires SSA machine IR.
class GCNDPPCombine : public MachineFunctionPass {
  MachineRegisterInfo *MRI; // set per function in runOnMachineFunction
  const SIInstrInfo *TII;   // set per function in runOnMachineFunction

  using RegSubRegPair = TargetInstrInfo::RegSubRegPair;

  // Resolves the value of a mov's 'old' operand: the initializing immediate,
  // nullptr for undef, or the operand itself otherwise.
  MachineOperand *getOldOpndValue(MachineOperand &OldOpnd) const;

  // Builds the DPP form of OrigMI after validating the combining rules that
  // depend on the resolved old value (OldOpnd); returns nullptr on failure.
  MachineInstr *createDPPInst(MachineInstr &OrigMI,
                              MachineInstr &MovMI,
                              RegSubRegPair CombOldVGPR,
                              MachineOperand *OldOpnd,
                              bool CombBCZ) const;

  // Builds the DPP form of OrigMI with the given combined old register and
  // bound_ctrl; returns nullptr on failure.
  MachineInstr *createDPPInst(MachineInstr &OrigMI,
                              MachineInstr &MovMI,
                              RegSubRegPair CombOldVGPR,
                              bool CombBCZ) const;

  // True if MI lacks the named immediate operand or its (masked) value
  // equals Value.
  bool hasNoImmOrEqual(MachineInstr &MI,
                       unsigned OpndName,
                       int64_t Value,
                       int64_t Mask = -1) const;

  // Attempts the all-or-nothing combine of one V_MOV_B32_dpp with all of
  // its uses; returns true iff the mov was folded away.
  bool combineDPPMov(MachineInstr &MI) const;

public:
  static char ID;

  GCNDPPCombine() : MachineFunctionPass(ID) {
    initializeGCNDPPCombinePass(*PassRegistry::getPassRegistry());
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "GCN DPP Combine"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // end anonymous namespace
109 
INITIALIZE_PASS(GCNDPPCombine, DEBUG_TYPE, "GCN DPP Combine", false, false)

char GCNDPPCombine::ID = 0;

// Exposed pass ID so other code can identify/schedule this pass.
char &llvm::GCNDPPCombineID = GCNDPPCombine::ID;

// Factory used by the AMDGPU target when building its pass pipeline.
FunctionPass *llvm::createGCNDPPCombinePass() {
  return new GCNDPPCombine();
}
119 
120 static int getDPPOp(unsigned Op) {
121   auto DPP32 = AMDGPU::getDPPOp32(Op);
122   if (DPP32 != -1)
123     return DPP32;
124 
125   auto E32 = AMDGPU::getVOPe32(Op);
126   return E32 != -1 ? AMDGPU::getDPPOp32(E32) : -1;
127 }
128 
129 // tracks the register operand definition and returns:
130 //   1. immediate operand used to initialize the register if found
131 //   2. nullptr if the register operand is undef
132 //   3. the operand itself otherwise
133 MachineOperand *GCNDPPCombine::getOldOpndValue(MachineOperand &OldOpnd) const {
134   auto *Def = getVRegSubRegDef(getRegSubRegPair(OldOpnd), *MRI);
135   if (!Def)
136     return nullptr;
137 
138   switch(Def->getOpcode()) {
139   default: break;
140   case AMDGPU::IMPLICIT_DEF:
141     return nullptr;
142   case AMDGPU::COPY:
143   case AMDGPU::V_MOV_B32_e32: {
144     auto &Op1 = Def->getOperand(1);
145     if (Op1.isImm())
146       return &Op1;
147     break;
148   }
149   }
150   return &OldOpnd;
151 }
152 
// Builds the DPP variant of OrigMI in place of OrigMI, replacing its src0
// (which is MovMI's result) with MovMI's DPP source, and appending MovMI's
// DPP control operands. CombOldVGPR supplies the combined 'old' operand and
// CombBCZ the combined bound_ctrl. Returns the new instruction, or nullptr
// on failure (any half-built instruction is erased).
MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
                                           MachineInstr &MovMI,
                                           RegSubRegPair CombOldVGPR,
                                           bool CombBCZ) const {
  assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp);
  assert(TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst)->getReg() ==
         TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0)->getReg());

  auto OrigOp = OrigMI.getOpcode();
  auto DPPOp = getDPPOp(OrigOp);
  if (DPPOp == -1) {
    LLVM_DEBUG(dbgs() << "  failed: no DPP opcode\n");
    return nullptr;
  }

  auto DPPInst = BuildMI(*OrigMI.getParent(), OrigMI,
                         OrigMI.getDebugLoc(), TII->get(DPPOp));
  bool Fail = false;
  // do/while(false) lets every failure path 'break' to the shared cleanup
  // below instead of duplicating the erase-and-return code.
  do {
    auto *Dst = TII->getNamedOperand(OrigMI, AMDGPU::OpName::vdst);
    assert(Dst);
    DPPInst.add(*Dst);
    // Index of the next operand to be appended; asserted against the named
    // operand indices of DPPOp as we go.
    int NumOperands = 1;

    const int OldIdx = AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::old);
    if (OldIdx != -1) {
      assert(OldIdx == NumOperands);
      assert(isOfRegClass(CombOldVGPR, AMDGPU::VGPR_32RegClass, *MRI));
      DPPInst.addReg(CombOldVGPR.Reg, 0, CombOldVGPR.SubReg);
      ++NumOperands;
    } else {
      // TODO: this discards MAC/FMA instructions for now, let's add it later
      LLVM_DEBUG(dbgs() << "  failed: no old operand in DPP instruction,"
                           " TBD\n");
      Fail = true;
      break;
    }

    // Carry over OrigMI's src0 modifiers (only abs/neg are expected here;
    // other modifiers were rejected by the caller).
    if (auto *Mod0 = TII->getNamedOperand(OrigMI,
                                          AMDGPU::OpName::src0_modifiers)) {
      assert(NumOperands == AMDGPU::getNamedOperandIdx(DPPOp,
                                          AMDGPU::OpName::src0_modifiers));
      assert(0LL == (Mod0->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG)));
      DPPInst.addImm(Mod0->getImm());
      ++NumOperands;
    }
    // src0 of the DPP instruction comes from the mov, not from OrigMI: it is
    // the register read from the other lane.
    auto *Src0 = TII->getNamedOperand(MovMI, AMDGPU::OpName::src0);
    assert(Src0);
    if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src0)) {
      LLVM_DEBUG(dbgs() << "  failed: src0 is illegal\n");
      Fail = true;
      break;
    }
    DPPInst.add(*Src0);
    // Other uses may still be combined from the same mov; never mark this
    // copy of the operand as a kill.
    DPPInst->getOperand(NumOperands).setIsKill(false);
    ++NumOperands;

    // Carry over OrigMI's src1 modifiers, if present (abs/neg only).
    if (auto *Mod1 = TII->getNamedOperand(OrigMI,
                                          AMDGPU::OpName::src1_modifiers)) {
      assert(NumOperands == AMDGPU::getNamedOperandIdx(DPPOp,
                                          AMDGPU::OpName::src1_modifiers));
      assert(0LL == (Mod1->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG)));
      DPPInst.addImm(Mod1->getImm());
      ++NumOperands;
    }
    if (auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1)) {
      if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src1)) {
        LLVM_DEBUG(dbgs() << "  failed: src1 is illegal\n");
        Fail = true;
        break;
      }
      DPPInst.add(*Src1);
      ++NumOperands;
    }

    if (auto *Src2 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2)) {
      if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src2)) {
        LLVM_DEBUG(dbgs() << "  failed: src2 is illegal\n");
        Fail = true;
        break;
      }
      DPPInst.add(*Src2);
    }

    // Finally append the mov's DPP control operands and the combined
    // bound_ctrl computed by the caller.
    DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl));
    DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask));
    DPPInst.add(*TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask));
    DPPInst.addImm(CombBCZ ? 1 : 0);
  } while (false);

  if (Fail) {
    DPPInst.getInstr()->eraseFromParent();
    return nullptr;
  }
  LLVM_DEBUG(dbgs() << "  combined:  " << *DPPInst.getInstr());
  return DPPInst.getInstr();
}
250 
251 static bool isIdentityValue(unsigned OrigMIOp, MachineOperand *OldOpnd) {
252   assert(OldOpnd->isImm());
253   switch (OrigMIOp) {
254   default: break;
255   case AMDGPU::V_ADD_U32_e32:
256   case AMDGPU::V_ADD_U32_e64:
257   case AMDGPU::V_ADD_I32_e32:
258   case AMDGPU::V_ADD_I32_e64:
259   case AMDGPU::V_OR_B32_e32:
260   case AMDGPU::V_OR_B32_e64:
261   case AMDGPU::V_SUBREV_U32_e32:
262   case AMDGPU::V_SUBREV_U32_e64:
263   case AMDGPU::V_SUBREV_I32_e32:
264   case AMDGPU::V_SUBREV_I32_e64:
265   case AMDGPU::V_MAX_U32_e32:
266   case AMDGPU::V_MAX_U32_e64:
267   case AMDGPU::V_XOR_B32_e32:
268   case AMDGPU::V_XOR_B32_e64:
269     if (OldOpnd->getImm() == 0)
270       return true;
271     break;
272   case AMDGPU::V_AND_B32_e32:
273   case AMDGPU::V_AND_B32_e64:
274   case AMDGPU::V_MIN_U32_e32:
275   case AMDGPU::V_MIN_U32_e64:
276     if (static_cast<uint32_t>(OldOpnd->getImm()) ==
277         std::numeric_limits<uint32_t>::max())
278       return true;
279     break;
280   case AMDGPU::V_MIN_I32_e32:
281   case AMDGPU::V_MIN_I32_e64:
282     if (static_cast<int32_t>(OldOpnd->getImm()) ==
283         std::numeric_limits<int32_t>::max())
284       return true;
285     break;
286   case AMDGPU::V_MAX_I32_e32:
287   case AMDGPU::V_MAX_I32_e64:
288     if (static_cast<int32_t>(OldOpnd->getImm()) ==
289         std::numeric_limits<int32_t>::min())
290       return true;
291     break;
292   case AMDGPU::V_MUL_I32_I24_e32:
293   case AMDGPU::V_MUL_I32_I24_e64:
294   case AMDGPU::V_MUL_U32_U24_e32:
295   case AMDGPU::V_MUL_U32_U24_e64:
296     if (OldOpnd->getImm() == 1)
297       return true;
298     break;
299   }
300   return false;
301 }
302 
303 MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
304                                            MachineInstr &MovMI,
305                                            RegSubRegPair CombOldVGPR,
306                                            MachineOperand *OldOpndValue,
307                                            bool CombBCZ) const {
308   assert(CombOldVGPR.Reg);
309   if (!CombBCZ && OldOpndValue && OldOpndValue->isImm()) {
310     auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1);
311     if (!Src1 || !Src1->isReg()) {
312       LLVM_DEBUG(dbgs() << "  failed: no src1 or it isn't a register\n");
313       return nullptr;
314     }
315     if (!isIdentityValue(OrigMI.getOpcode(), OldOpndValue)) {
316       LLVM_DEBUG(dbgs() << "  failed: old immediate isn't an identity\n");
317       return nullptr;
318     }
319     CombOldVGPR = getRegSubRegPair(*Src1);
320     if (!isOfRegClass(CombOldVGPR, AMDGPU::VGPR_32RegClass, *MRI)) {
321       LLVM_DEBUG(dbgs() << "  failed: src1 isn't a VGPR32 register\n");
322       return nullptr;
323     }
324   }
325   return createDPPInst(OrigMI, MovMI, CombOldVGPR, CombBCZ);
326 }
327 
328 // returns true if MI doesn't have OpndName immediate operand or the
329 // operand has Value
330 bool GCNDPPCombine::hasNoImmOrEqual(MachineInstr &MI, unsigned OpndName,
331                                     int64_t Value, int64_t Mask) const {
332   auto *Imm = TII->getNamedOperand(MI, OpndName);
333   if (!Imm)
334     return true;
335 
336   assert(Imm->isImm());
337   return (Imm->getImm() & Mask) == Value;
338 }
339 
// Attempts to fold MovMI (a V_MOV_B32_dpp) into every one of its uses.
// All-or-nothing: returns true and erases the mov plus all original users
// when every use combines; otherwise erases all DPP instructions created so
// far and returns false, leaving the original code intact.
bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
  assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp);
  LLVM_DEBUG(dbgs() << "\nDPP combine: " << MovMI);

  auto *DstOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst);
  assert(DstOpnd && DstOpnd->isReg());
  auto DPPMovReg = DstOpnd->getReg();
  // The DPP lane shuffle is evaluated under the EXEC mask at the mov; moving
  // it into a user is only valid if EXEC is unchanged at every use.
  if (execMayBeModifiedBeforeAnyUse(*MRI, DPPMovReg, MovMI)) {
    LLVM_DEBUG(dbgs() << "  failed: EXEC mask should remain the same"
                         " for all uses\n");
    return false;
  }

  auto *RowMaskOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::row_mask);
  assert(RowMaskOpnd && RowMaskOpnd->isImm());
  auto *BankMaskOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::bank_mask);
  assert(BankMaskOpnd && BankMaskOpnd->isImm());
  // 0xF/0xF means no lane is masked off by row/bank masks.
  const bool MaskAllLanes = RowMaskOpnd->getImm() == 0xF &&
                            BankMaskOpnd->getImm() == 0xF;

  auto *BCZOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::bound_ctrl);
  assert(BCZOpnd && BCZOpnd->isImm());
  bool BoundCtrlZero = BCZOpnd->getImm();

  auto *OldOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::old);
  assert(OldOpnd && OldOpnd->isReg());

  auto * const OldOpndValue = getOldOpndValue(*OldOpnd);
  // OldOpndValue is either undef (IMPLICIT_DEF) or immediate or something else
  // We could use: assert(!OldOpndValue || OldOpndValue->isImm())
  // but the third option is used to distinguish undef from non-immediate
  // to reuse IMPLICIT_DEF instruction later
  assert(!OldOpndValue || OldOpndValue->isImm() || OldOpndValue == OldOpnd);

  bool CombBCZ = false;

  // Decide the combined bound_ctrl per the rules in the file header.
  if (MaskAllLanes && BoundCtrlZero) { // [1]
    CombBCZ = true;
  } else {
    // Below this point the old value must be a known immediate.
    if (!OldOpndValue || !OldOpndValue->isImm()) {
      LLVM_DEBUG(dbgs() << "  failed: the DPP mov isn't combinable\n");
      return false;
    }

    if (OldOpndValue->getParent()->getParent() != MovMI.getParent()) {
      LLVM_DEBUG(dbgs() <<
        "  failed: old reg def and mov should be in the same BB\n");
      return false;
    }

    if (OldOpndValue->getImm() == 0) {
      // old==0 with full masks behaves exactly like bound_ctrl:0.
      if (MaskAllLanes) {
        assert(!BoundCtrlZero); // by check [1]
        CombBCZ = true;
      }
    } else if (BoundCtrlZero) {
      assert(!MaskAllLanes); // by check [1]
      LLVM_DEBUG(dbgs() <<
        "  failed: old!=0 and bctrl:0 and not all lanes isn't combinable\n");
      return false;
    }
  }

  LLVM_DEBUG(dbgs() << "  old=";
    if (!OldOpndValue)
      dbgs() << "undef";
    else
      dbgs() << *OldOpndValue;
    dbgs() << ", bound_ctrl=" << CombBCZ << '\n');

  // OrigMIs collects instructions to erase on success; DPPMIs those to erase
  // on rollback.
  SmallVector<MachineInstr*, 4> OrigMIs, DPPMIs;
  auto CombOldVGPR = getRegSubRegPair(*OldOpnd);
  // try to reuse previous old reg if its undefined (IMPLICIT_DEF)
  if (CombBCZ && OldOpndValue) { // CombOldVGPR should be undef
    CombOldVGPR = RegSubRegPair(
      MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass));
    auto UndefInst = BuildMI(*MovMI.getParent(), MovMI, MovMI.getDebugLoc(),
                             TII->get(AMDGPU::IMPLICIT_DEF), CombOldVGPR.Reg);
    DPPMIs.push_back(UndefInst.getInstr());
  }

  OrigMIs.push_back(&MovMI);
  bool Rollback = true;
  // NOTE(review): CloneMachineInstr/eraseFromParent below add and remove
  // uses of DPPMovReg while this range over its use list is live — looks
  // fragile; confirm the use-list iterator tolerates these updates.
  for (auto &Use : MRI->use_nodbg_operands(DPPMovReg)) {
    Rollback = true;

    auto &OrigMI = *Use.getParent();
    LLVM_DEBUG(dbgs() << "  try: " << OrigMI);

    auto OrigOp = OrigMI.getOpcode();
    if (TII->isVOP3(OrigOp)) {
      if (!TII->hasVALU32BitEncoding(OrigOp)) {
        LLVM_DEBUG(dbgs() << "  failed: VOP3 hasn't e32 equivalent\n");
        break;
      }
      // check if other than abs|neg modifiers are set (opsel for example)
      const int64_t Mask = ~(SISrcMods::ABS | SISrcMods::NEG);
      if (!hasNoImmOrEqual(OrigMI, AMDGPU::OpName::src0_modifiers, 0, Mask) ||
          !hasNoImmOrEqual(OrigMI, AMDGPU::OpName::src1_modifiers, 0, Mask) ||
          !hasNoImmOrEqual(OrigMI, AMDGPU::OpName::clamp, 0) ||
          !hasNoImmOrEqual(OrigMI, AMDGPU::OpName::omod, 0)) {
        LLVM_DEBUG(dbgs() << "  failed: VOP3 has non-default modifiers\n");
        break;
      }
    } else if (!TII->isVOP1(OrigOp) && !TII->isVOP2(OrigOp)) {
      LLVM_DEBUG(dbgs() << "  failed: not VOP1/2/3\n");
      break;
    }

    LLVM_DEBUG(dbgs() << "  combining: " << OrigMI);
    if (&Use == TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0)) {
      if (auto *DPPInst = createDPPInst(OrigMI, MovMI, CombOldVGPR,
                                        OldOpndValue, CombBCZ)) {
        DPPMIs.push_back(DPPInst);
        Rollback = false;
      }
    } else if (OrigMI.isCommutable() &&
               &Use == TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1)) {
      // The mov feeds src1: commute a clone so the DPP value lands in src0,
      // then try to combine the commuted form. The clone is erased either
      // way; on success the created DPP instruction replaces OrigMI.
      auto *BB = OrigMI.getParent();
      auto *NewMI = BB->getParent()->CloneMachineInstr(&OrigMI);
      BB->insert(OrigMI, NewMI);
      if (TII->commuteInstruction(*NewMI)) {
        LLVM_DEBUG(dbgs() << "  commuted:  " << *NewMI);
        if (auto *DPPInst = createDPPInst(*NewMI, MovMI, CombOldVGPR,
                                          OldOpndValue, CombBCZ)) {
          DPPMIs.push_back(DPPInst);
          Rollback = false;
        }
      } else
        LLVM_DEBUG(dbgs() << "  failed: cannot be commuted\n");
      NewMI->eraseFromParent();
    } else
      LLVM_DEBUG(dbgs() << "  failed: no suitable operands\n");
    if (Rollback)
      break;
    OrigMIs.push_back(&OrigMI);
  }

  // Commit or roll back: erase the losing side's instructions.
  for (auto *MI : *(Rollback? &DPPMIs : &OrigMIs))
    MI->eraseFromParent();

  return !Rollback;
}
483 
// Pass entry point: scans every block for V_MOV_B32_dpp instructions and
// tries to combine each with all of its uses. Returns true if any mov was
// folded away.
bool GCNDPPCombine::runOnMachineFunction(MachineFunction &MF) {
  auto &ST = MF.getSubtarget<GCNSubtarget>();
  // Nothing to do on subtargets without DPP, or when optimization is skipped
  // for this function (e.g. optnone).
  if (!ST.hasDPP() || skipFunction(MF.getFunction()))
    return false;

  MRI = &MF.getRegInfo();
  TII = ST.getInstrInfo();

  assert(MRI->isSSA() && "Must be run on SSA");

  bool Changed = false;
  for (auto &MBB : MF) {
    // Walk bottom-up; the iterator is advanced *before* combineDPPMov may
    // erase MI (and its already-visited users), keeping traversal valid.
    for (auto I = MBB.rbegin(), E = MBB.rend(); I != E;) {
      auto &MI = *I++;
      if (MI.getOpcode() == AMDGPU::V_MOV_B32_dpp && combineDPPMov(MI)) {
        Changed = true;
        ++NumDPPMovsCombined;
      }
    }
  }
  return Changed;
}
506