//===-- SIShrinkInstructions.cpp - Shrink Instructions --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
/// The pass tries to use the 32-bit encoding for instructions when possible.
//===----------------------------------------------------------------------===//
//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"

#define DEBUG_TYPE "si-shrink-instructions"

STATISTIC(NumInstructionsShrunk,
          "Number of 64-bit instructions reduced to 32-bit.");
STATISTIC(NumLiteralConstantsFolded,
          "Number of literal constants folded into 32-bit instructions.");

using namespace llvm;

namespace {

class SIShrinkInstructions : public MachineFunctionPass {
  MachineRegisterInfo *MRI;
  const GCNSubtarget *ST;
  const SIInstrInfo *TII;
  const SIRegisterInfo *TRI;

public:
  static char ID;

  SIShrinkInstructions() : MachineFunctionPass(ID) {}

  bool foldImmediates(MachineInstr &MI, bool TryToCommute = true) const;
  bool isKImmOperand(const MachineOperand &Src) const;
  bool isKUImmOperand(const MachineOperand &Src) const;
  bool isKImmOrKUImmOperand(const MachineOperand &Src, bool &IsUnsigned) const;
  bool isReverseInlineImm(const MachineOperand &Src, int32_t &ReverseImm) const;
  void copyExtraImplicitOps(MachineInstr &NewMI, MachineInstr &MI) const;
  void shrinkScalarCompare(MachineInstr &MI) const;
  void shrinkMIMG(MachineInstr &MI) const;
  void shrinkMadFma(MachineInstr &MI) const;
  bool shrinkScalarLogicOp(MachineInstr &MI) const;
  bool tryReplaceDeadSDST(MachineInstr &MI) const;
  bool instAccessReg(iterator_range<MachineInstr::const_mop_iterator> &&R,
                     Register Reg, unsigned SubReg) const;
  bool instReadsReg(const MachineInstr *MI, unsigned Reg,
                    unsigned SubReg) const;
  bool instModifiesReg(const MachineInstr *MI, unsigned Reg,
                       unsigned SubReg) const;
  TargetInstrInfo::RegSubRegPair getSubRegForIndex(Register Reg, unsigned Sub,
                                                   unsigned I) const;
  void dropInstructionKeepingImpDefs(MachineInstr &MI) const;
  MachineInstr *matchSwap(MachineInstr &MovT) const;

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Shrink Instructions"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS(SIShrinkInstructions, DEBUG_TYPE,
                "SI Shrink Instructions", false, false)

char SIShrinkInstructions::ID = 0;

FunctionPass *llvm::createSIShrinkInstructionsPass() {
  return new SIShrinkInstructions();
}

/// This function checks \p MI for operands defined by a move immediate
/// instruction and then folds the literal constant into the instruction if it
/// can. This function assumes that \p MI is a VOP1, VOP2, or VOPC instruction.
bool SIShrinkInstructions::foldImmediates(MachineInstr &MI,
                                          bool TryToCommute) const {
  assert(TII->isVOP1(MI) || TII->isVOP2(MI) || TII->isVOPC(MI));

  int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);

  // Try to fold Src0.
  MachineOperand &Src0 = MI.getOperand(Src0Idx);
  if (Src0.isReg()) {
    Register Reg = Src0.getReg();
    if (Reg.isVirtual()) {
      MachineInstr *Def = MRI->getUniqueVRegDef(Reg);
      if (Def && Def->isMoveImmediate()) {
        MachineOperand &MovSrc = Def->getOperand(1);
        bool ConstantFolded = false;

        if (TII->isOperandLegal(MI, Src0Idx, &MovSrc)) {
          if (MovSrc.isImm() &&
              (isInt<32>(MovSrc.getImm()) || isUInt<32>(MovSrc.getImm()))) {
            Src0.ChangeToImmediate(MovSrc.getImm());
            ConstantFolded = true;
          } else if (MovSrc.isFI()) {
            Src0.ChangeToFrameIndex(MovSrc.getIndex());
            ConstantFolded = true;
          } else if (MovSrc.isGlobal()) {
            Src0.ChangeToGA(MovSrc.getGlobal(), MovSrc.getOffset(),
                            MovSrc.getTargetFlags());
            ConstantFolded = true;
          }
        }

        if (ConstantFolded) {
          if (MRI->use_nodbg_empty(Reg))
            Def->eraseFromParent();
          ++NumLiteralConstantsFolded;
          return true;
        }
      }
    }
  }

  // We have failed to fold src0, so commute the instruction and try again.
  if (TryToCommute && MI.isCommutable()) {
    if (TII->commuteInstruction(MI)) {
      if (foldImmediates(MI, false))
        return true;

      // Commute back.
      TII->commuteInstruction(MI);
    }
  }

  return false;
}

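/// \returns true if \p Src is a literal that fits in a signed 16-bit field and
/// is not already an inline constant, making it a candidate for the SOPK-style
/// 16-bit immediate forms.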
bool SIShrinkInstructions::isKImmOperand(const MachineOperand &Src) const {
  return isInt<16>(Src.getImm()) &&
    !TII->isInlineConstant(*Src.getParent(),
                           Src.getParent()->getOperandNo(&Src));
}

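/// \returns true if \p Src is a literal that fits in an unsigned 16-bit field
/// and is not already an inline constant.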
bool SIShrinkInstructions::isKUImmOperand(const MachineOperand &Src) const {
  return isUInt<16>(Src.getImm()) &&
    !TII->isInlineConstant(*Src.getParent(),
                           Src.getParent()->getOperandNo(&Src));
}

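/// \returns true if \p Src fits in 16 bits as either a signed or an unsigned
/// literal (reported through \p IsUnsigned) and is not an inline constant.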
bool SIShrinkInstructions::isKImmOrKUImmOperand(const MachineOperand &Src,
                                                bool &IsUnsigned) const {
  if (isInt<16>(Src.getImm())) {
    IsUnsigned = false;
    return !TII->isInlineConstant(Src);
  }

  if (isUInt<16>(Src.getImm())) {
    IsUnsigned = true;
    return !TII->isInlineConstant(Src);
  }

  return false;
}

/// \returns true if the constant in \p Src should be replaced with a bitreverse
/// of an inline immediate.
bool SIShrinkInstructions::isReverseInlineImm(const MachineOperand &Src,
                                              int32_t &ReverseImm) const {
  if (!isInt<32>(Src.getImm()) || TII->isInlineConstant(Src))
    return false;

  ReverseImm = reverseBits<int32_t>(static_cast<int32_t>(Src.getImm()));
  return ReverseImm >= -16 && ReverseImm <= 64;
}

/// Copy implicit register operands that are not part of the instruction
/// definition from \p MI to \p NewMI.
void SIShrinkInstructions::copyExtraImplicitOps(MachineInstr &NewMI,
                                                MachineInstr &MI) const {
  MachineFunction &MF = *MI.getMF();
  for (unsigned i = MI.getDesc().getNumOperands() +
         MI.getDesc().getNumImplicitUses() +
         MI.getDesc().getNumImplicitDefs(), e = MI.getNumOperands();
       i != e; ++i) {
    const MachineOperand &MO = MI.getOperand(i);
    if ((MO.isReg() && MO.isImplicit()) || MO.isRegMask())
      NewMI.addOperand(MF, MO);
  }
}

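/// Try to rewrite an SOPC scalar compare against a 16-bit literal into the
/// corresponding S_CMPK_* form, which encodes the immediate directly and saves
/// the 32-bit literal dword.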
void SIShrinkInstructions::shrinkScalarCompare(MachineInstr &MI) const {
  // cmpk instructions do scc = dst <cc op> imm16, so commute the instruction to
  // get constants on the RHS.
  if (!MI.getOperand(0).isReg())
    TII->commuteInstruction(MI, false, 0, 1);

  // cmpk requires src0 to be a register.
  const MachineOperand &Src0 = MI.getOperand(0);
  if (!Src0.isReg())
    return;

  const MachineOperand &Src1 = MI.getOperand(1);
  if (!Src1.isImm())
    return;

  int SOPKOpc = AMDGPU::getSOPKOp(MI.getOpcode());
  if (SOPKOpc == -1)
    return;

  // eq/ne is special because the imm16 can be treated as signed or unsigned,
  // and the compare is initially selected as the unsigned version.
  if (SOPKOpc == AMDGPU::S_CMPK_EQ_U32 || SOPKOpc == AMDGPU::S_CMPK_LG_U32) {
    bool HasUImm;
    if (isKImmOrKUImmOperand(Src1, HasUImm)) {
      if (!HasUImm) {
        SOPKOpc = (SOPKOpc == AMDGPU::S_CMPK_EQ_U32) ?
          AMDGPU::S_CMPK_EQ_I32 : AMDGPU::S_CMPK_LG_I32;
      }

      MI.setDesc(TII->get(SOPKOpc));
    }

    return;
  }

  const MCInstrDesc &NewDesc = TII->get(SOPKOpc);

  if ((TII->sopkIsZext(SOPKOpc) && isKUImmOperand(Src1)) ||
      (!TII->sopkIsZext(SOPKOpc) && isKImmOperand(Src1))) {
    MI.setDesc(NewDesc);
  }
}

// Shrink NSA encoded instructions with contiguous VGPRs to non-NSA encoding.
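// NSA (non-sequential address) forms let the address operands live in
// arbitrary VGPRs at the cost of a longer encoding; when the registers happen
// to be allocated contiguously, the shorter default encoding with a single
// base address register can be used instead.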
void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) const {
  const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
  if (!Info)
    return;

  uint8_t NewEncoding;
  switch (Info->MIMGEncoding) {
  case AMDGPU::MIMGEncGfx10NSA:
    NewEncoding = AMDGPU::MIMGEncGfx10Default;
    break;
  case AMDGPU::MIMGEncGfx11NSA:
    NewEncoding = AMDGPU::MIMGEncGfx11Default;
    break;
  default:
    return;
  }

  int VAddr0Idx =
      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);
  unsigned NewAddrDwords = Info->VAddrDwords;
  const TargetRegisterClass *RC;

  if (Info->VAddrDwords == 2) {
    RC = &AMDGPU::VReg_64RegClass;
  } else if (Info->VAddrDwords == 3) {
    RC = &AMDGPU::VReg_96RegClass;
  } else if (Info->VAddrDwords == 4) {
    RC = &AMDGPU::VReg_128RegClass;
  } else if (Info->VAddrDwords == 5) {
    RC = &AMDGPU::VReg_160RegClass;
  } else if (Info->VAddrDwords == 6) {
    RC = &AMDGPU::VReg_192RegClass;
  } else if (Info->VAddrDwords == 7) {
    RC = &AMDGPU::VReg_224RegClass;
  } else if (Info->VAddrDwords == 8) {
    RC = &AMDGPU::VReg_256RegClass;
  } else {
    RC = &AMDGPU::VReg_512RegClass;
    NewAddrDwords = 16;
  }

  unsigned VgprBase = 0;
  unsigned NextVgpr = 0;
  bool IsUndef = true;
  bool IsKill = NewAddrDwords == Info->VAddrDwords;
  for (unsigned Idx = 0; Idx < Info->VAddrOperands; ++Idx) {
    const MachineOperand &Op = MI.getOperand(VAddr0Idx + Idx);
    unsigned Vgpr = TRI->getHWRegIndex(Op.getReg());
    unsigned Dwords = TRI->getRegSizeInBits(Op.getReg(), *MRI) / 32;
    assert(Dwords > 0 && "Un-implemented for less than 32 bit regs");

    if (Idx == 0) {
      VgprBase = Vgpr;
      NextVgpr = Vgpr + Dwords;
    } else if (Vgpr == NextVgpr) {
      NextVgpr = Vgpr + Dwords;
    } else {
      return;
    }

    if (!Op.isUndef())
      IsUndef = false;
    if (!Op.isKill())
      IsKill = false;
  }

  if (VgprBase + NewAddrDwords > 256)
    return;

  // Further check for implicit tied operands - these may be present if TFE is
  // enabled.
  int TFEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::tfe);
  int LWEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::lwe);
  unsigned TFEVal = (TFEIdx == -1) ? 0 : MI.getOperand(TFEIdx).getImm();
  unsigned LWEVal = (LWEIdx == -1) ? 0 : MI.getOperand(LWEIdx).getImm();
  int ToUntie = -1;
  if (TFEVal || LWEVal) {
    // TFE/LWE is enabled so we need to deal with an implicit tied operand.
    for (unsigned i = LWEIdx + 1, e = MI.getNumOperands(); i != e; ++i) {
      if (MI.getOperand(i).isReg() && MI.getOperand(i).isTied() &&
          MI.getOperand(i).isImplicit()) {
        // This is the tied operand.
        assert(
            ToUntie == -1 &&
            "found more than one tied implicit operand when expecting only 1");
        ToUntie = i;
        MI.untieRegOperand(ToUntie);
      }
    }
  }

  unsigned NewOpcode = AMDGPU::getMIMGOpcode(Info->BaseOpcode, NewEncoding,
                                             Info->VDataDwords, NewAddrDwords);
  MI.setDesc(TII->get(NewOpcode));
  MI.getOperand(VAddr0Idx).setReg(RC->getRegister(VgprBase));
  MI.getOperand(VAddr0Idx).setIsUndef(IsUndef);
  MI.getOperand(VAddr0Idx).setIsKill(IsKill);

  for (int i = 1; i < Info->VAddrOperands; ++i)
    MI.removeOperand(VAddr0Idx + 1);

  if (ToUntie >= 0) {
    MI.tieOperands(
        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata),
        ToUntie - (Info->VAddrOperands - 1));
  }
}

// Shrink MAD to MADAK/MADMK and FMA to FMAAK/FMAMK.
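// The AK forms (V_MADAK_*/V_FMAAK_*) take the literal as the addend
// (dst = src0 * src1 + K), while the MK forms (V_MADMK_*/V_FMAMK_*) take it as
// the multiplier (dst = src0 * K + src1), so a VOP3 with a literal operand can
// be folded into a shorter VOP2-style encoding.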
void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const {
  if (!ST->hasVOP3Literal())
    return;

  if (TII->hasAnyModifiersSet(MI))
    return;

  const unsigned Opcode = MI.getOpcode();
  MachineOperand &Src0 = *TII->getNamedOperand(MI, AMDGPU::OpName::src0);
  MachineOperand &Src1 = *TII->getNamedOperand(MI, AMDGPU::OpName::src1);
  MachineOperand &Src2 = *TII->getNamedOperand(MI, AMDGPU::OpName::src2);
  unsigned NewOpcode = AMDGPU::INSTRUCTION_LIST_END;

  bool Swap;

  // Detect "Dst = VSrc * VGPR + Imm" and convert to AK form.
  if (Src2.isImm() && !TII->isInlineConstant(Src2)) {
    if (Src1.isReg() && TRI->isVGPR(*MRI, Src1.getReg()))
      Swap = false;
    else if (Src0.isReg() && TRI->isVGPR(*MRI, Src0.getReg()))
      Swap = true;
    else
      return;

    switch (Opcode) {
    default:
      llvm_unreachable("Unexpected mad/fma opcode!");
    case AMDGPU::V_MAD_F32_e64:
      NewOpcode = AMDGPU::V_MADAK_F32;
      break;
    case AMDGPU::V_FMA_F32_e64:
      NewOpcode = AMDGPU::V_FMAAK_F32;
      break;
    case AMDGPU::V_MAD_F16_e64:
      NewOpcode = AMDGPU::V_MADAK_F16;
      break;
    case AMDGPU::V_FMA_F16_e64:
      NewOpcode = AMDGPU::V_FMAAK_F16;
      break;
    }
  }

  // Detect "Dst = VSrc * Imm + VGPR" and convert to MK form.
  if (Src2.isReg() && TRI->isVGPR(*MRI, Src2.getReg())) {
    if (Src1.isImm() && !TII->isInlineConstant(Src1))
      Swap = false;
    else if (Src0.isImm() && !TII->isInlineConstant(Src0))
      Swap = true;
    else
      return;

    switch (Opcode) {
    default:
      llvm_unreachable("Unexpected mad/fma opcode!");
    case AMDGPU::V_MAD_F32_e64:
      NewOpcode = AMDGPU::V_MADMK_F32;
      break;
    case AMDGPU::V_FMA_F32_e64:
      NewOpcode = AMDGPU::V_FMAMK_F32;
      break;
    case AMDGPU::V_MAD_F16_e64:
      NewOpcode = AMDGPU::V_MADMK_F16;
      break;
    case AMDGPU::V_FMA_F16_e64:
      NewOpcode = AMDGPU::V_FMAMK_F16;
      break;
    }
  }

  if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END)
    return;

  if (Swap) {
    // Swap Src0 and Src1 by building a new instruction.
    BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(NewOpcode),
            MI.getOperand(0).getReg())
        .add(Src1)
        .add(Src0)
        .add(Src2)
        .setMIFlags(MI.getFlags());
    MI.eraseFromParent();
  } else {
    TII->removeModOperands(MI);
    MI.setDesc(TII->get(NewOpcode));
  }
}

/// Attempt to shrink AND/OR/XOR operations requiring non-inlineable literals.
/// For AND or OR, try using S_BITSET{0,1} to clear or set bits.
/// If the inverse of the immediate is legal, use ANDN2, ORN2 or
/// XNOR (as a ^ b == ~(a ^ ~b)).
/// \returns true if the caller should continue the machine function iterator
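/// For example, "s_and_b32 s0, s0, 0xffffdfff" (clearing bit 13) can become
/// "s_bitset0_b32 s0, 13", and "s_or_b32 s0, s0, 0xffffffc0" can become
/// "s_orn2_b32 s0, s0, 63", avoiding the 32-bit literal in both cases.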
bool SIShrinkInstructions::shrinkScalarLogicOp(MachineInstr &MI) const {
  unsigned Opc = MI.getOpcode();
  const MachineOperand *Dest = &MI.getOperand(0);
  MachineOperand *Src0 = &MI.getOperand(1);
  MachineOperand *Src1 = &MI.getOperand(2);
  MachineOperand *SrcReg = Src0;
  MachineOperand *SrcImm = Src1;

  if (!SrcImm->isImm() ||
      AMDGPU::isInlinableLiteral32(SrcImm->getImm(), ST->hasInv2PiInlineImm()))
    return false;

  uint32_t Imm = static_cast<uint32_t>(SrcImm->getImm());
  uint32_t NewImm = 0;

  if (Opc == AMDGPU::S_AND_B32) {
    if (isPowerOf2_32(~Imm)) {
      NewImm = countTrailingOnes(Imm);
      Opc = AMDGPU::S_BITSET0_B32;
    } else if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) {
      NewImm = ~Imm;
      Opc = AMDGPU::S_ANDN2_B32;
    }
  } else if (Opc == AMDGPU::S_OR_B32) {
    if (isPowerOf2_32(Imm)) {
      NewImm = countTrailingZeros(Imm);
      Opc = AMDGPU::S_BITSET1_B32;
    } else if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) {
      NewImm = ~Imm;
      Opc = AMDGPU::S_ORN2_B32;
    }
  } else if (Opc == AMDGPU::S_XOR_B32) {
    if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) {
      NewImm = ~Imm;
      Opc = AMDGPU::S_XNOR_B32;
    }
  } else {
    llvm_unreachable("unexpected opcode");
  }

  if (NewImm != 0) {
    if (Dest->getReg().isVirtual() && SrcReg->isReg()) {
      MRI->setRegAllocationHint(Dest->getReg(), 0, SrcReg->getReg());
      MRI->setRegAllocationHint(SrcReg->getReg(), 0, Dest->getReg());
      return true;
    }

    if (SrcReg->isReg() && SrcReg->getReg() == Dest->getReg()) {
      const bool IsUndef = SrcReg->isUndef();
      const bool IsKill = SrcReg->isKill();
      MI.setDesc(TII->get(Opc));
      if (Opc == AMDGPU::S_BITSET0_B32 ||
          Opc == AMDGPU::S_BITSET1_B32) {
        Src0->ChangeToImmediate(NewImm);
        // Remove the immediate and add the tied input.
        MI.getOperand(2).ChangeToRegister(Dest->getReg(), /*IsDef*/ false,
                                          /*isImp*/ false, IsKill,
                                          /*isDead*/ false, IsUndef);
        MI.tieOperands(0, 2);
      } else {
        SrcImm->setImm(NewImm);
      }
    }
  }

  return false;
}

// This is the same as MachineInstr::readsRegister/modifiesRegister except
// it takes subregs into account.
bool SIShrinkInstructions::instAccessReg(
    iterator_range<MachineInstr::const_mop_iterator> &&R, Register Reg,
    unsigned SubReg) const {
  for (const MachineOperand &MO : R) {
    if (!MO.isReg())
      continue;

    if (Reg.isPhysical() && MO.getReg().isPhysical()) {
      if (TRI->regsOverlap(Reg, MO.getReg()))
        return true;
    } else if (MO.getReg() == Reg && Reg.isVirtual()) {
      LaneBitmask Overlap = TRI->getSubRegIndexLaneMask(SubReg) &
                            TRI->getSubRegIndexLaneMask(MO.getSubReg());
      if (Overlap.any())
        return true;
    }
  }
  return false;
}

bool SIShrinkInstructions::instReadsReg(const MachineInstr *MI, unsigned Reg,
                                        unsigned SubReg) const {
  return instAccessReg(MI->uses(), Reg, SubReg);
}

bool SIShrinkInstructions::instModifiesReg(const MachineInstr *MI, unsigned Reg,
                                           unsigned SubReg) const {
  return instAccessReg(MI->defs(), Reg, SubReg);
}

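/// \returns the 32-bit register/subregister pair that covers 32-bit lane \p I
/// of the (possibly wider) register \p Reg with subregister \p Sub.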
TargetInstrInfo::RegSubRegPair
SIShrinkInstructions::getSubRegForIndex(Register Reg, unsigned Sub,
                                        unsigned I) const {
  if (TRI->getRegSizeInBits(Reg, *MRI) != 32) {
    if (Reg.isPhysical()) {
      Reg = TRI->getSubReg(Reg, TRI->getSubRegFromChannel(I));
    } else {
      Sub = TRI->getSubRegFromChannel(I + TRI->getChannelFromSubReg(Sub));
    }
  }
  return TargetInstrInfo::RegSubRegPair(Reg, Sub);
}

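/// Erase \p MI, but first materialize any registers it implicitly defined
/// beyond its static operand list with IMPLICIT_DEF so later uses stay valid.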
void SIShrinkInstructions::dropInstructionKeepingImpDefs(
    MachineInstr &MI) const {
  for (unsigned i = MI.getDesc().getNumOperands() +
         MI.getDesc().getNumImplicitUses() +
         MI.getDesc().getNumImplicitDefs(), e = MI.getNumOperands();
       i != e; ++i) {
    const MachineOperand &Op = MI.getOperand(i);
    if (!Op.isDef())
      continue;
    BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
            TII->get(AMDGPU::IMPLICIT_DEF), Op.getReg());
  }

  MI.eraseFromParent();
}

// Match:
// mov t, x
// mov x, y
// mov y, t
//
// =>
//
// mov t, x (t is potentially dead and move eliminated)
// v_swap_b32 x, y
//
// Returns the next valid instruction pointer if it was able to create
// v_swap_b32.
//
// This should not be done too early, so as not to prevent possible folding
// that may remove the matched moves. It should preferably run before RA to
// release saved registers, and also possibly after RA, which can insert copies
// too.
//
// This is really just a generic peephole rather than canonical shrinking,
// although the requirements match the pass placement and it reduces code size
// too.
MachineInstr *SIShrinkInstructions::matchSwap(MachineInstr &MovT) const {
  assert(MovT.getOpcode() == AMDGPU::V_MOV_B32_e32 ||
         MovT.getOpcode() == AMDGPU::COPY);

  Register T = MovT.getOperand(0).getReg();
  unsigned Tsub = MovT.getOperand(0).getSubReg();
  MachineOperand &Xop = MovT.getOperand(1);

  if (!Xop.isReg())
    return nullptr;
  Register X = Xop.getReg();
  unsigned Xsub = Xop.getSubReg();

  unsigned Size = TII->getOpSize(MovT, 0) / 4;

  if (!TRI->isVGPR(*MRI, X))
    return nullptr;

  if (MovT.hasRegisterImplicitUseOperand(AMDGPU::M0))
    return nullptr;

  const unsigned SearchLimit = 16;
  unsigned Count = 0;
  bool KilledT = false;
  for (auto Iter = std::next(MovT.getIterator()),
            E = MovT.getParent()->instr_end();
       Iter != E && Count < SearchLimit && !KilledT; ++Iter, ++Count) {

    MachineInstr *MovY = &*Iter;
    KilledT = MovY->killsRegister(T, TRI);

    if ((MovY->getOpcode() != AMDGPU::V_MOV_B32_e32 &&
         MovY->getOpcode() != AMDGPU::COPY) ||
        !MovY->getOperand(1).isReg()        ||
        MovY->getOperand(1).getReg() != T   ||
        MovY->getOperand(1).getSubReg() != Tsub ||
        MovY->hasRegisterImplicitUseOperand(AMDGPU::M0))
      continue;

    Register Y = MovY->getOperand(0).getReg();
    unsigned Ysub = MovY->getOperand(0).getSubReg();

    if (!TRI->isVGPR(*MRI, Y))
      continue;

    MachineInstr *MovX = nullptr;
    for (auto IY = MovY->getIterator(), I = std::next(MovT.getIterator());
         I != IY; ++I) {
      if (instReadsReg(&*I, X, Xsub) || instModifiesReg(&*I, Y, Ysub) ||
          instModifiesReg(&*I, T, Tsub) ||
          (MovX && instModifiesReg(&*I, X, Xsub))) {
        MovX = nullptr;
        break;
      }
      if (!instReadsReg(&*I, Y, Ysub)) {
        if (!MovX && instModifiesReg(&*I, X, Xsub)) {
          MovX = nullptr;
          break;
        }
        continue;
      }
      if (MovX ||
          (I->getOpcode() != AMDGPU::V_MOV_B32_e32 &&
           I->getOpcode() != AMDGPU::COPY) ||
          I->getOperand(0).getReg() != X ||
          I->getOperand(0).getSubReg() != Xsub) {
        MovX = nullptr;
        break;
      }
      // Implicit use of M0 is an indirect move.
      if (I->hasRegisterImplicitUseOperand(AMDGPU::M0))
        continue;

      if (Size > 1 && (I->getNumImplicitOperands() > (I->isCopy() ? 0U : 1U)))
        continue;

      MovX = &*I;
    }

    if (!MovX)
      continue;

    LLVM_DEBUG(dbgs() << "Matched v_swap_b32:\n" << MovT << *MovX << *MovY);

    for (unsigned I = 0; I < Size; ++I) {
      TargetInstrInfo::RegSubRegPair X1, Y1;
      X1 = getSubRegForIndex(X, Xsub, I);
      Y1 = getSubRegForIndex(Y, Ysub, I);
      MachineBasicBlock &MBB = *MovT.getParent();
      auto MIB = BuildMI(MBB, MovX->getIterator(), MovT.getDebugLoc(),
                         TII->get(AMDGPU::V_SWAP_B32))
        .addDef(X1.Reg, 0, X1.SubReg)
        .addDef(Y1.Reg, 0, Y1.SubReg)
        .addReg(Y1.Reg, 0, Y1.SubReg)
        .addReg(X1.Reg, 0, X1.SubReg).getInstr();
      if (MovX->hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
        // Drop implicit EXEC.
        MIB->removeOperand(MIB->getNumExplicitOperands());
        MIB->copyImplicitOps(*MBB.getParent(), *MovX);
      }
    }
    MovX->eraseFromParent();
    dropInstructionKeepingImpDefs(*MovY);
    MachineInstr *Next = &*std::next(MovT.getIterator());

    if (T.isVirtual() && MRI->use_nodbg_empty(T)) {
      dropInstructionKeepingImpDefs(MovT);
    } else {
      Xop.setIsKill(false);
      for (int I = MovT.getNumImplicitOperands() - 1; I >= 0; --I) {
        unsigned OpNo = MovT.getNumExplicitOperands() + I;
        const MachineOperand &Op = MovT.getOperand(OpNo);
        if (Op.isKill() && TRI->regsOverlap(X, Op.getReg()))
          MovT.removeOperand(OpNo);
      }
    }

    return Next;
  }

  return nullptr;
}

// If an instruction has a dead sdst, replace it with the NULL register on
// gfx1030+.
bool SIShrinkInstructions::tryReplaceDeadSDST(MachineInstr &MI) const {
  if (!ST->hasGFX10_3Insts())
    return false;

  MachineOperand *Op = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
  if (!Op)
    return false;
  Register SDstReg = Op->getReg();
  if (SDstReg.isPhysical() || !MRI->use_nodbg_empty(SDstReg))
    return false;

  Op->setReg(ST->isWave32() ? AMDGPU::SGPR_NULL : AMDGPU::SGPR_NULL64);
  return true;
}

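// Walk every instruction in the function and apply the literal folding,
// peephole, and 64-bit-to-32-bit shrinking transforms defined above.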
bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  MRI = &MF.getRegInfo();
  ST = &MF.getSubtarget<GCNSubtarget>();
  TII = ST->getInstrInfo();
  TRI = &TII->getRegisterInfo();

  unsigned VCCReg = ST->isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;

  std::vector<unsigned> I1Defs;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
                                                  BI != BE; ++BI) {

    MachineBasicBlock &MBB = *BI;
    MachineBasicBlock::iterator I, Next;
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);
      MachineInstr &MI = *I;

      if (MI.getOpcode() == AMDGPU::V_MOV_B32_e32) {
        // If this has a literal constant source that is the same as the
        // reversed bits of an inline immediate, replace with a bitreverse of
        // that constant. This saves 4 bytes in the common case of materializing
        // sign bits.
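        // For example, materializing 0x80000000 takes a 32-bit literal with
        // V_MOV_B32, but V_BFREV_B32 of the inline immediate 1 produces the
        // same value without one.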

        // Test if we are after regalloc. We only want to do this after any
        // optimizations happen because this will confuse them.
        // XXX - not exactly a check for post-regalloc run.
        MachineOperand &Src = MI.getOperand(1);
        if (Src.isImm() && MI.getOperand(0).getReg().isPhysical()) {
          int32_t ReverseImm;
          if (isReverseInlineImm(Src, ReverseImm)) {
            MI.setDesc(TII->get(AMDGPU::V_BFREV_B32_e32));
            Src.setImm(ReverseImm);
            continue;
          }
        }
      }

      if (ST->hasSwap() && (MI.getOpcode() == AMDGPU::V_MOV_B32_e32 ||
                            MI.getOpcode() == AMDGPU::COPY)) {
        if (auto *NextMI = matchSwap(MI)) {
          Next = NextMI->getIterator();
          continue;
        }
      }

      // Try to use S_ADDK_I32 and S_MULK_I32.
      if (MI.getOpcode() == AMDGPU::S_ADD_I32 ||
          MI.getOpcode() == AMDGPU::S_MUL_I32) {
        const MachineOperand *Dest = &MI.getOperand(0);
        MachineOperand *Src0 = &MI.getOperand(1);
        MachineOperand *Src1 = &MI.getOperand(2);

        if (!Src0->isReg() && Src1->isReg()) {
          if (TII->commuteInstruction(MI, false, 1, 2))
            std::swap(Src0, Src1);
        }

        // FIXME: This could work better if hints worked with subregisters. If
        // we have a vector add of a constant, we usually don't get the correct
        // allocation due to the subregister usage.
        if (Dest->getReg().isVirtual() && Src0->isReg()) {
          MRI->setRegAllocationHint(Dest->getReg(), 0, Src0->getReg());
          MRI->setRegAllocationHint(Src0->getReg(), 0, Dest->getReg());
          continue;
        }

        if (Src0->isReg() && Src0->getReg() == Dest->getReg()) {
          if (Src1->isImm() && isKImmOperand(*Src1)) {
            unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_I32) ?
              AMDGPU::S_ADDK_I32 : AMDGPU::S_MULK_I32;

            MI.setDesc(TII->get(Opc));
            MI.tieOperands(0, 1);
          }
        }
      }

      // Try to use s_cmpk_*.
      if (MI.isCompare() && TII->isSOPC(MI)) {
        shrinkScalarCompare(MI);
        continue;
      }

      // Try to use S_MOVK_I32, which will save 4 bytes for small immediates.
      if (MI.getOpcode() == AMDGPU::S_MOV_B32) {
        const MachineOperand &Dst = MI.getOperand(0);
        MachineOperand &Src = MI.getOperand(1);

        if (Src.isImm() && Dst.getReg().isPhysical()) {
          int32_t ReverseImm;
          if (isKImmOperand(Src))
            MI.setDesc(TII->get(AMDGPU::S_MOVK_I32));
          else if (isReverseInlineImm(Src, ReverseImm)) {
            MI.setDesc(TII->get(AMDGPU::S_BREV_B32));
            Src.setImm(ReverseImm);
          }
        }

        continue;
      }

      // Shrink scalar logic operations.
      if (MI.getOpcode() == AMDGPU::S_AND_B32 ||
          MI.getOpcode() == AMDGPU::S_OR_B32 ||
          MI.getOpcode() == AMDGPU::S_XOR_B32) {
        if (shrinkScalarLogicOp(MI))
          continue;
      }

      if (TII->isMIMG(MI.getOpcode()) &&
          ST->getGeneration() >= AMDGPUSubtarget::GFX10 &&
          MF.getProperties().hasProperty(
              MachineFunctionProperties::Property::NoVRegs)) {
        shrinkMIMG(MI);
        continue;
      }

      if (!TII->isVOP3(MI))
        continue;

      if (MI.getOpcode() == AMDGPU::V_MAD_F32_e64 ||
          MI.getOpcode() == AMDGPU::V_FMA_F32_e64 ||
          MI.getOpcode() == AMDGPU::V_MAD_F16_e64 ||
          MI.getOpcode() == AMDGPU::V_FMA_F16_e64) {
        shrinkMadFma(MI);
        continue;
      }

      if (!TII->hasVALU32BitEncoding(MI.getOpcode())) {
        // If there is no chance we will shrink it and use VCC as sdst to get
        // a 32-bit form, try to replace a dead sdst with NULL.
        tryReplaceDeadSDST(MI);
        continue;
      }

      if (!TII->canShrink(MI, *MRI)) {
        // Try commuting the instruction and see if that enables us to shrink
        // it.
        if (!MI.isCommutable() || !TII->commuteInstruction(MI) ||
            !TII->canShrink(MI, *MRI)) {
          tryReplaceDeadSDST(MI);
          continue;
        }
      }

      int Op32 = AMDGPU::getVOPe32(MI.getOpcode());

      if (TII->isVOPC(Op32)) {
        MachineOperand &Op0 = MI.getOperand(0);
        if (Op0.isReg()) {
          // Exclude VOPCX instructions as these don't explicitly write a
          // dst.
          Register DstReg = Op0.getReg();
          if (DstReg.isVirtual()) {
            // VOPC instructions can only write to the VCC register. We can't
            // force them to use VCC here, because this is only one register and
            // cannot deal with sequences which would require multiple copies of
            // VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...)
            //
            // So, instead of forcing the instruction to write to VCC, we
            // provide a hint to the register allocator to use VCC and then we
            // will run this pass again after RA and shrink it if it outputs to
            // VCC.
            MRI->setRegAllocationHint(DstReg, 0, VCCReg);
            continue;
          }
          if (DstReg != VCCReg)
            continue;
        }
      }

      if (Op32 == AMDGPU::V_CNDMASK_B32_e32) {
        // We shrink V_CNDMASK_B32_e64 using regalloc hints like we do for VOPC
        // instructions.
        const MachineOperand *Src2 =
            TII->getNamedOperand(MI, AMDGPU::OpName::src2);
        if (!Src2->isReg())
          continue;
        Register SReg = Src2->getReg();
        if (SReg.isVirtual()) {
          MRI->setRegAllocationHint(SReg, 0, VCCReg);
          continue;
        }
        if (SReg != VCCReg)
          continue;
      }

      // Check for the bool flag output for instructions like V_ADD_I32_e64.
      const MachineOperand *SDst = TII->getNamedOperand(MI,
                                                        AMDGPU::OpName::sdst);

      if (SDst) {
        bool Next = false;

        if (SDst->getReg() != VCCReg) {
          if (SDst->getReg().isVirtual())
            MRI->setRegAllocationHint(SDst->getReg(), 0, VCCReg);
          Next = true;
        }

        // All of the instructions with carry outs also have an SGPR input in
        // src2.
        const MachineOperand *Src2 = TII->getNamedOperand(MI,
                                                          AMDGPU::OpName::src2);
        if (Src2 && Src2->getReg() != VCCReg) {
          if (Src2->getReg().isVirtual())
            MRI->setRegAllocationHint(Src2->getReg(), 0, VCCReg);
          Next = true;
        }

        if (Next)
          continue;
      }

      // We can shrink this instruction.
      LLVM_DEBUG(dbgs() << "Shrinking " << MI);

      MachineInstr *Inst32 = TII->buildShrunkInst(MI, Op32);
      ++NumInstructionsShrunk;

      // Copy extra operands not present in the instruction definition.
      copyExtraImplicitOps(*Inst32, MI);

      // Copy deadness from the old explicit vcc def to the new implicit def.
      if (SDst && SDst->isDead())
        Inst32->findRegisterDefOperand(VCCReg)->setIsDead();

      MI.eraseFromParent();
      foldImmediates(*Inst32);

      LLVM_DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n');
    }
  }
  return false;
}