//===-- SIShrinkInstructions.cpp - Shrink Instructions --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
/// The pass tries to use the 32-bit encoding for instructions when possible.
//===----------------------------------------------------------------------===//
//

#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"

#define DEBUG_TYPE "si-shrink-instructions"

STATISTIC(NumInstructionsShrunk,
          "Number of 64-bit instructions reduced to 32-bit.");
STATISTIC(NumLiteralConstantsFolded,
          "Number of literal constants folded into 32-bit instructions.");

using namespace llvm;

namespace {

class SIShrinkInstructions : public MachineFunctionPass {
public:
  static char ID;

  void shrinkMIMG(MachineInstr &MI);

  SIShrinkInstructions() : MachineFunctionPass(ID) {}

  bool runOnMachineFunction(MachineFunction &MF) override;

  StringRef getPassName() const override { return "SI Shrink Instructions"; }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesCFG();
    MachineFunctionPass::getAnalysisUsage(AU);
  }
};

} // End anonymous namespace.

INITIALIZE_PASS(SIShrinkInstructions, DEBUG_TYPE,
                "SI Shrink Instructions", false, false)

char SIShrinkInstructions::ID = 0;

FunctionPass *llvm::createSIShrinkInstructionsPass() {
  return new SIShrinkInstructions();
}

/// This function checks \p MI for operands defined by a move immediate
/// instruction and then folds the literal constant into the instruction if it
/// can. This function assumes that \p MI is a VOP1, VOP2, or VOPC instruction.
static bool foldImmediates(MachineInstr &MI, const SIInstrInfo *TII,
                           MachineRegisterInfo &MRI, bool TryToCommute = true) {
  assert(TII->isVOP1(MI) || TII->isVOP2(MI) || TII->isVOPC(MI));

  int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);

  // Try to fold Src0
  MachineOperand &Src0 = MI.getOperand(Src0Idx);
  if (Src0.isReg()) {
    Register Reg = Src0.getReg();
    if (Reg.isVirtual() && MRI.hasOneUse(Reg)) {
      MachineInstr *Def = MRI.getUniqueVRegDef(Reg);
      if (Def && Def->isMoveImmediate()) {
        MachineOperand &MovSrc = Def->getOperand(1);
        bool ConstantFolded = false;

        if (TII->isOperandLegal(MI, Src0Idx, &MovSrc)) {
          if (MovSrc.isImm() &&
              (isInt<32>(MovSrc.getImm()) || isUInt<32>(MovSrc.getImm()))) {
            Src0.ChangeToImmediate(MovSrc.getImm());
            ConstantFolded = true;
          } else if (MovSrc.isFI()) {
            Src0.ChangeToFrameIndex(MovSrc.getIndex());
            ConstantFolded = true;
          } else if (MovSrc.isGlobal()) {
            Src0.ChangeToGA(MovSrc.getGlobal(), MovSrc.getOffset(),
                            MovSrc.getTargetFlags());
            ConstantFolded = true;
          }
        }

        if (ConstantFolded) {
          assert(MRI.use_empty(Reg));
          Def->eraseFromParent();
          ++NumLiteralConstantsFolded;
          return true;
        }
      }
    }
  }

  // We have failed to fold src0, so commute the instruction and try again.
  if (TryToCommute && MI.isCommutable()) {
    if (TII->commuteInstruction(MI)) {
      if (foldImmediates(MI, TII, MRI, false))
        return true;

      // Commute back.
      TII->commuteInstruction(MI);
    }
  }

  return false;
}

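// The following helpers check whether an immediate fits the 16-bit field of a
// SOPK-format instruction and is not already free as an inline constant (in
// which case shrinking would not save the 32-bit literal).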
static bool isKImmOperand(const SIInstrInfo *TII, const MachineOperand &Src) {
  return isInt<16>(Src.getImm()) &&
    !TII->isInlineConstant(*Src.getParent(),
                           Src.getParent()->getOperandNo(&Src));
}

static bool isKUImmOperand(const SIInstrInfo *TII, const MachineOperand &Src) {
  return isUInt<16>(Src.getImm()) &&
    !TII->isInlineConstant(*Src.getParent(),
                           Src.getParent()->getOperandNo(&Src));
}

static bool isKImmOrKUImmOperand(const SIInstrInfo *TII,
                                 const MachineOperand &Src,
                                 bool &IsUnsigned) {
  if (isInt<16>(Src.getImm())) {
    IsUnsigned = false;
    return !TII->isInlineConstant(Src);
  }

  if (isUInt<16>(Src.getImm())) {
    IsUnsigned = true;
    return !TII->isInlineConstant(Src);
  }

  return false;
}

/// \returns true if the constant in \p Src should be replaced with a bitreverse
/// of an inline immediate.
static bool isReverseInlineImm(const SIInstrInfo *TII,
                               const MachineOperand &Src,
                               int32_t &ReverseImm) {
  if (!isInt<32>(Src.getImm()) || TII->isInlineConstant(Src))
    return false;

  ReverseImm = reverseBits<int32_t>(static_cast<int32_t>(Src.getImm()));
  return ReverseImm >= -16 && ReverseImm <= 64;
}

/// Copy implicit register operands from \p MI to \p NewMI that are not part of
/// the instruction definition.
static void copyExtraImplicitOps(MachineInstr &NewMI, MachineFunction &MF,
                                 const MachineInstr &MI) {
  for (unsigned i = MI.getDesc().getNumOperands() +
         MI.getDesc().getNumImplicitUses() +
         MI.getDesc().getNumImplicitDefs(), e = MI.getNumOperands();
       i != e; ++i) {
    const MachineOperand &MO = MI.getOperand(i);
    if ((MO.isReg() && MO.isImplicit()) || MO.isRegMask())
      NewMI.addOperand(MF, MO);
  }
}

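// Try to shrink a 32-bit scalar compare against a literal into its SOPK form
// (e.g. s_cmp_eq_i32 -> s_cmpk_eq_i32), which encodes the constant as a 16-bit
// immediate instead of a separate literal dword.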
static void shrinkScalarCompare(const SIInstrInfo *TII, MachineInstr &MI) {
  // cmpk instructions do scc = dst <cc op> imm16, so commute the instruction to
  // get constants on the RHS.
  if (!MI.getOperand(0).isReg())
    TII->commuteInstruction(MI, false, 0, 1);

  // cmpk requires src0 to be a register
  const MachineOperand &Src0 = MI.getOperand(0);
  if (!Src0.isReg())
    return;

  const MachineOperand &Src1 = MI.getOperand(1);
  if (!Src1.isImm())
    return;

  int SOPKOpc = AMDGPU::getSOPKOp(MI.getOpcode());
  if (SOPKOpc == -1)
    return;

  // eq/ne is special because the imm16 can be treated as signed or unsigned,
  // and is initially selected to the unsigned versions.
  if (SOPKOpc == AMDGPU::S_CMPK_EQ_U32 || SOPKOpc == AMDGPU::S_CMPK_LG_U32) {
    bool HasUImm;
    if (isKImmOrKUImmOperand(TII, Src1, HasUImm)) {
      if (!HasUImm) {
        SOPKOpc = (SOPKOpc == AMDGPU::S_CMPK_EQ_U32) ?
          AMDGPU::S_CMPK_EQ_I32 : AMDGPU::S_CMPK_LG_I32;
      }

      MI.setDesc(TII->get(SOPKOpc));
    }

    return;
  }

  const MCInstrDesc &NewDesc = TII->get(SOPKOpc);

  if ((TII->sopkIsZext(SOPKOpc) && isKUImmOperand(TII, Src1)) ||
      (!TII->sopkIsZext(SOPKOpc) && isKImmOperand(TII, Src1))) {
    MI.setDesc(NewDesc);
  }
}

// Shrink NSA encoded instructions with contiguous VGPRs to non-NSA encoding.
void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) {
  const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
  if (!Info || Info->MIMGEncoding != AMDGPU::MIMGEncGfx10NSA)
    return;

  MachineFunction *MF = MI.getParent()->getParent();
  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  int VAddr0Idx =
      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);
  unsigned NewAddrDwords = Info->VAddrDwords;
  const TargetRegisterClass *RC;

  if (Info->VAddrDwords == 2) {
    RC = &AMDGPU::VReg_64RegClass;
  } else if (Info->VAddrDwords == 3) {
    RC = &AMDGPU::VReg_96RegClass;
  } else if (Info->VAddrDwords == 4) {
    RC = &AMDGPU::VReg_128RegClass;
  } else if (Info->VAddrDwords <= 8) {
    RC = &AMDGPU::VReg_256RegClass;
    NewAddrDwords = 8;
  } else {
    RC = &AMDGPU::VReg_512RegClass;
    NewAddrDwords = 16;
  }

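  // The non-NSA encoding takes a single contiguous register tuple, so all
  // address VGPRs must form one run starting at vaddr0; otherwise bail out.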
  unsigned VgprBase = 0;
  bool IsUndef = true;
  bool IsKill = NewAddrDwords == Info->VAddrDwords;
  for (unsigned i = 0; i < Info->VAddrDwords; ++i) {
    const MachineOperand &Op = MI.getOperand(VAddr0Idx + i);
    unsigned Vgpr = TRI.getHWRegIndex(Op.getReg());

    if (i == 0) {
      VgprBase = Vgpr;
    } else if (VgprBase + i != Vgpr)
      return;

    if (!Op.isUndef())
      IsUndef = false;
    if (!Op.isKill())
      IsKill = false;
  }

  if (VgprBase + NewAddrDwords > 256)
    return;

  // Further check for an implicit tied operand - one may be present if TFE or
  // LWE is enabled.
  int TFEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::tfe);
  int LWEIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::lwe);
  unsigned TFEVal = (TFEIdx == -1) ? 0 : MI.getOperand(TFEIdx).getImm();
  unsigned LWEVal = (LWEIdx == -1) ? 0 : MI.getOperand(LWEIdx).getImm();
  int ToUntie = -1;
  if (TFEVal || LWEVal) {
    // TFE/LWE is enabled so we need to deal with an implicit tied operand
    for (unsigned i = LWEIdx + 1, e = MI.getNumOperands(); i != e; ++i) {
      if (MI.getOperand(i).isReg() && MI.getOperand(i).isTied() &&
          MI.getOperand(i).isImplicit()) {
        // This is the tied operand
        assert(
            ToUntie == -1 &&
            "found more than one tied implicit operand when expecting only 1");
        ToUntie = i;
        MI.untieRegOperand(ToUntie);
      }
    }
  }

  unsigned NewOpcode =
      AMDGPU::getMIMGOpcode(Info->BaseOpcode, AMDGPU::MIMGEncGfx10Default,
                            Info->VDataDwords, NewAddrDwords);
  MI.setDesc(TII->get(NewOpcode));
  MI.getOperand(VAddr0Idx).setReg(RC->getRegister(VgprBase));
  MI.getOperand(VAddr0Idx).setIsUndef(IsUndef);
  MI.getOperand(VAddr0Idx).setIsKill(IsKill);

  for (unsigned i = 1; i < Info->VAddrDwords; ++i)
    MI.RemoveOperand(VAddr0Idx + 1);

  if (ToUntie >= 0) {
    MI.tieOperands(
        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata),
        ToUntie - (Info->VAddrDwords - 1));
  }
}

/// Attempt to shrink AND/OR/XOR operations requiring non-inlineable literals.
/// For AND or OR, try using S_BITSET{0,1} to clear or set bits.
/// If the inverse of the immediate is legal, use ANDN2, ORN2 or
/// XNOR (as a ^ b == ~(a ^ ~b)).
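/// For example, after register allocation when the source and destination have
/// been assigned the same register:
///   s_and_b32 s4, s4, 0xffffffdf  ->  s_bitset0_b32 s4, 5
///   s_or_b32  s4, s4, 0xffffffc0  ->  s_orn2_b32 s4, s4, 63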
/// \returns true if the caller should continue the machine function iterator
static bool shrinkScalarLogicOp(const GCNSubtarget &ST,
                                MachineRegisterInfo &MRI,
                                const SIInstrInfo *TII,
                                MachineInstr &MI) {
  unsigned Opc = MI.getOpcode();
  const MachineOperand *Dest = &MI.getOperand(0);
  MachineOperand *Src0 = &MI.getOperand(1);
  MachineOperand *Src1 = &MI.getOperand(2);
  MachineOperand *SrcReg = Src0;
  MachineOperand *SrcImm = Src1;

  if (!SrcImm->isImm() ||
      AMDGPU::isInlinableLiteral32(SrcImm->getImm(), ST.hasInv2PiInlineImm()))
    return false;

  uint32_t Imm = static_cast<uint32_t>(SrcImm->getImm());
  uint32_t NewImm = 0;

  if (Opc == AMDGPU::S_AND_B32) {
    if (isPowerOf2_32(~Imm)) {
      NewImm = countTrailingOnes(Imm);
      Opc = AMDGPU::S_BITSET0_B32;
    } else if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) {
      NewImm = ~Imm;
      Opc = AMDGPU::S_ANDN2_B32;
    }
  } else if (Opc == AMDGPU::S_OR_B32) {
    if (isPowerOf2_32(Imm)) {
      NewImm = countTrailingZeros(Imm);
      Opc = AMDGPU::S_BITSET1_B32;
    } else if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) {
      NewImm = ~Imm;
      Opc = AMDGPU::S_ORN2_B32;
    }
  } else if (Opc == AMDGPU::S_XOR_B32) {
    if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) {
      NewImm = ~Imm;
      Opc = AMDGPU::S_XNOR_B32;
    }
  } else {
    llvm_unreachable("unexpected opcode");
  }

  if ((Opc == AMDGPU::S_ANDN2_B32 || Opc == AMDGPU::S_ORN2_B32) &&
      SrcImm == Src0) {
    if (!TII->commuteInstruction(MI, false, 1, 2))
      NewImm = 0;
  }

  if (NewImm != 0) {
    if (Dest->getReg().isVirtual() && SrcReg->isReg()) {
      MRI.setRegAllocationHint(Dest->getReg(), 0, SrcReg->getReg());
      MRI.setRegAllocationHint(SrcReg->getReg(), 0, Dest->getReg());
      return true;
    }

    if (SrcReg->isReg() && SrcReg->getReg() == Dest->getReg()) {
      const bool IsUndef = SrcReg->isUndef();
      const bool IsKill = SrcReg->isKill();
      MI.setDesc(TII->get(Opc));
      if (Opc == AMDGPU::S_BITSET0_B32 ||
          Opc == AMDGPU::S_BITSET1_B32) {
        Src0->ChangeToImmediate(NewImm);
        // Remove the immediate and add the tied input.
        MI.getOperand(2).ChangeToRegister(Dest->getReg(), /*IsDef*/ false,
                                          /*isImp*/ false, IsKill,
                                          /*isDead*/ false, IsUndef);
        MI.tieOperands(0, 2);
      } else {
        SrcImm->setImm(NewImm);
      }
    }
  }

  return false;
}


// This is the same as MachineInstr::readsRegister/modifiesRegister except
// it takes subregs into account.
static bool instAccessReg(iterator_range<MachineInstr::const_mop_iterator> &&R,
                          Register Reg, unsigned SubReg,
                          const SIRegisterInfo &TRI) {
  for (const MachineOperand &MO : R) {
    if (!MO.isReg())
      continue;

    if (Reg.isPhysical() && MO.getReg().isPhysical()) {
      if (TRI.regsOverlap(Reg, MO.getReg()))
        return true;
    } else if (MO.getReg() == Reg && Reg.isVirtual()) {
      LaneBitmask Overlap = TRI.getSubRegIndexLaneMask(SubReg) &
                            TRI.getSubRegIndexLaneMask(MO.getSubReg());
      if (Overlap.any())
        return true;
    }
  }
  return false;
}

static bool instReadsReg(const MachineInstr *MI,
                         unsigned Reg, unsigned SubReg,
                         const SIRegisterInfo &TRI) {
  return instAccessReg(MI->uses(), Reg, SubReg, TRI);
}

static bool instModifiesReg(const MachineInstr *MI,
                            unsigned Reg, unsigned SubReg,
                            const SIRegisterInfo &TRI) {
  return instAccessReg(MI->defs(), Reg, SubReg, TRI);
}

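// Return the register/subregister pair addressing the I-th 32-bit channel of
// Reg:Sub - the corresponding physical subregister for physical registers, or
// an adjusted subregister index for virtual registers.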
static TargetInstrInfo::RegSubRegPair
getSubRegForIndex(Register Reg, unsigned Sub, unsigned I,
                  const SIRegisterInfo &TRI, const MachineRegisterInfo &MRI) {
  if (TRI.getRegSizeInBits(Reg, MRI) != 32) {
    if (Reg.isPhysical()) {
      Reg = TRI.getSubReg(Reg, TRI.getSubRegFromChannel(I));
    } else {
      Sub = TRI.getSubRegFromChannel(I + TRI.getChannelFromSubReg(Sub));
    }
  }
  return TargetInstrInfo::RegSubRegPair(Reg, Sub);
}

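// Erase MI, but first re-create any extra implicit defs it carries as
// standalone IMPLICIT_DEFs at the same point.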
static void dropInstructionKeepingImpDefs(MachineInstr &MI,
                                          const SIInstrInfo *TII) {
  for (unsigned i = MI.getDesc().getNumOperands() +
         MI.getDesc().getNumImplicitUses() +
         MI.getDesc().getNumImplicitDefs(), e = MI.getNumOperands();
       i != e; ++i) {
    const MachineOperand &Op = MI.getOperand(i);
    if (!Op.isDef())
      continue;
    BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
            TII->get(AMDGPU::IMPLICIT_DEF), Op.getReg());
  }

  MI.eraseFromParent();
}

// Match:
// mov t, x
// mov x, y
// mov y, t
//
// =>
//
// mov t, x (t is potentially dead and the move can be eliminated)
// v_swap_b32 x, y
//
// Returns a pointer to the next valid instruction if it was able to create
// v_swap_b32.
//
// This should not be done too early, so that possible folding which may remove
// the matched moves is not prevented. It should preferably be done before RA
// to release saved registers, but possibly also after RA, which can insert
// copies too.
//
// This is really just a generic peephole rather than a canonical shrinking,
// although the requirements match the pass placement and it reduces code size
// too.
static MachineInstr* matchSwap(MachineInstr &MovT, MachineRegisterInfo &MRI,
                               const SIInstrInfo *TII) {
  assert(MovT.getOpcode() == AMDGPU::V_MOV_B32_e32 ||
         MovT.getOpcode() == AMDGPU::COPY);

  Register T = MovT.getOperand(0).getReg();
  unsigned Tsub = MovT.getOperand(0).getSubReg();
  MachineOperand &Xop = MovT.getOperand(1);

  if (!Xop.isReg())
    return nullptr;
  Register X = Xop.getReg();
  unsigned Xsub = Xop.getSubReg();

  unsigned Size = TII->getOpSize(MovT, 0) / 4;

  const SIRegisterInfo &TRI = TII->getRegisterInfo();
  if (!TRI.isVGPR(MRI, X))
    return nullptr;

  if (MovT.hasRegisterImplicitUseOperand(AMDGPU::M0))
    return nullptr;

  const unsigned SearchLimit = 16;
  unsigned Count = 0;
  bool KilledT = false;
  for (auto Iter = std::next(MovT.getIterator()),
            E = MovT.getParent()->instr_end();
       Iter != E && Count < SearchLimit && !KilledT; ++Iter, ++Count) {

    MachineInstr *MovY = &*Iter;
    KilledT = MovY->killsRegister(T, &TRI);

    if ((MovY->getOpcode() != AMDGPU::V_MOV_B32_e32 &&
         MovY->getOpcode() != AMDGPU::COPY) ||
        !MovY->getOperand(1).isReg()        ||
        MovY->getOperand(1).getReg() != T   ||
        MovY->getOperand(1).getSubReg() != Tsub ||
        MovY->hasRegisterImplicitUseOperand(AMDGPU::M0))
      continue;

    Register Y = MovY->getOperand(0).getReg();
    unsigned Ysub = MovY->getOperand(0).getSubReg();

    if (!TRI.isVGPR(MRI, Y))
      continue;

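    // Look for the single "mov x, y" (MovX) between MovT and MovY, giving up
    // if any other instruction in that range reads X or modifies X, Y or T.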
    MachineInstr *MovX = nullptr;
    for (auto IY = MovY->getIterator(), I = std::next(MovT.getIterator());
         I != IY; ++I) {
      if (instReadsReg(&*I, X, Xsub, TRI)    ||
          instModifiesReg(&*I, Y, Ysub, TRI) ||
          instModifiesReg(&*I, T, Tsub, TRI) ||
          (MovX && instModifiesReg(&*I, X, Xsub, TRI))) {
        MovX = nullptr;
        break;
      }
      if (!instReadsReg(&*I, Y, Ysub, TRI)) {
        if (!MovX && instModifiesReg(&*I, X, Xsub, TRI)) {
          MovX = nullptr;
          break;
        }
        continue;
      }
      if (MovX ||
          (I->getOpcode() != AMDGPU::V_MOV_B32_e32 &&
           I->getOpcode() != AMDGPU::COPY) ||
          I->getOperand(0).getReg() != X ||
          I->getOperand(0).getSubReg() != Xsub) {
        MovX = nullptr;
        break;
      }
      // Implicit use of M0 is an indirect move.
      if (I->hasRegisterImplicitUseOperand(AMDGPU::M0))
        continue;

      if (Size > 1 && (I->getNumImplicitOperands() > (I->isCopy() ? 0U : 1U)))
        continue;

      MovX = &*I;
    }

    if (!MovX)
      continue;

    LLVM_DEBUG(dbgs() << "Matched v_swap_b32:\n" << MovT << *MovX << *MovY);

    for (unsigned I = 0; I < Size; ++I) {
      TargetInstrInfo::RegSubRegPair X1, Y1;
      X1 = getSubRegForIndex(X, Xsub, I, TRI, MRI);
      Y1 = getSubRegForIndex(Y, Ysub, I, TRI, MRI);
      MachineBasicBlock &MBB = *MovT.getParent();
      auto MIB = BuildMI(MBB, MovX->getIterator(), MovT.getDebugLoc(),
                         TII->get(AMDGPU::V_SWAP_B32))
        .addDef(X1.Reg, 0, X1.SubReg)
        .addDef(Y1.Reg, 0, Y1.SubReg)
        .addReg(Y1.Reg, 0, Y1.SubReg)
        .addReg(X1.Reg, 0, X1.SubReg).getInstr();
      if (MovX->hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
        // Drop implicit EXEC.
        MIB->RemoveOperand(MIB->getNumExplicitOperands());
        MIB->copyImplicitOps(*MBB.getParent(), *MovX);
      }
    }
    MovX->eraseFromParent();
    dropInstructionKeepingImpDefs(*MovY, TII);
    MachineInstr *Next = &*std::next(MovT.getIterator());

    if (MRI.use_nodbg_empty(T)) {
      dropInstructionKeepingImpDefs(MovT, TII);
    } else {
      Xop.setIsKill(false);
      for (int I = MovT.getNumImplicitOperands() - 1; I >= 0; --I ) {
        unsigned OpNo = MovT.getNumExplicitOperands() + I;
        const MachineOperand &Op = MovT.getOperand(OpNo);
        if (Op.isKill() && TRI.regsOverlap(X, Op.getReg()))
          MovT.RemoveOperand(OpNo);
      }
    }

    return Next;
  }

  return nullptr;
}

bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;

  MachineRegisterInfo &MRI = MF.getRegInfo();
  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  unsigned VCCReg = ST.isWave32() ? AMDGPU::VCC_LO : AMDGPU::VCC;

  std::vector<unsigned> I1Defs;

  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
                                                  BI != BE; ++BI) {

    MachineBasicBlock &MBB = *BI;
    MachineBasicBlock::iterator I, Next;
    for (I = MBB.begin(); I != MBB.end(); I = Next) {
      Next = std::next(I);
      MachineInstr &MI = *I;

      if (MI.getOpcode() == AMDGPU::V_MOV_B32_e32) {
        // If this has a literal constant source that is the same as the
        // reversed bits of an inline immediate, replace with a bitreverse of
        // that constant. This saves 4 bytes in the common case of materializing
        // sign bits.
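        // For example, v_mov_b32_e32 v0, 0x80000000 (a 32-bit literal) becomes
        // v_bfrev_b32_e32 v0, 1, dropping the extra literal dword.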

        // Test if we are after regalloc. We only want to do this after any
        // optimizations happen because this will confuse them.
        // XXX - not exactly a check for post-regalloc run.
        MachineOperand &Src = MI.getOperand(1);
        if (Src.isImm() && MI.getOperand(0).getReg().isPhysical()) {
          int32_t ReverseImm;
          if (isReverseInlineImm(TII, Src, ReverseImm)) {
            MI.setDesc(TII->get(AMDGPU::V_BFREV_B32_e32));
            Src.setImm(ReverseImm);
            continue;
          }
        }
      }

      if (ST.hasSwap() && (MI.getOpcode() == AMDGPU::V_MOV_B32_e32 ||
                           MI.getOpcode() == AMDGPU::COPY)) {
        if (auto *NextMI = matchSwap(MI, MRI, TII)) {
          Next = NextMI->getIterator();
          continue;
        }
      }

      // FIXME: We also need to consider movs of constant operands since
      // immediate operands are not folded if they have more than one use, and
      // the operand folding pass is unaware if the immediate will be free since
      // it won't know if the src == dest constraint will end up being
      // satisfied.
      if (MI.getOpcode() == AMDGPU::S_ADD_I32 ||
          MI.getOpcode() == AMDGPU::S_MUL_I32) {
        const MachineOperand *Dest = &MI.getOperand(0);
        MachineOperand *Src0 = &MI.getOperand(1);
        MachineOperand *Src1 = &MI.getOperand(2);

        if (!Src0->isReg() && Src1->isReg()) {
          if (TII->commuteInstruction(MI, false, 1, 2))
            std::swap(Src0, Src1);
        }

        // FIXME: This could work better if hints worked with subregisters. If
        // we have a vector add of a constant, we usually don't get the correct
        // allocation due to the subregister usage.
        if (Dest->getReg().isVirtual() && Src0->isReg()) {
          MRI.setRegAllocationHint(Dest->getReg(), 0, Src0->getReg());
          MRI.setRegAllocationHint(Src0->getReg(), 0, Dest->getReg());
          continue;
        }

        if (Src0->isReg() && Src0->getReg() == Dest->getReg()) {
          if (Src1->isImm() && isKImmOperand(TII, *Src1)) {
            unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_I32) ?
              AMDGPU::S_ADDK_I32 : AMDGPU::S_MULK_I32;

            MI.setDesc(TII->get(Opc));
            MI.tieOperands(0, 1);
          }
        }
      }

      // Try to use s_cmpk_*
      if (MI.isCompare() && TII->isSOPC(MI)) {
        shrinkScalarCompare(TII, MI);
        continue;
      }

      // Try to use S_MOVK_I32, which will save 4 bytes for small immediates.
      if (MI.getOpcode() == AMDGPU::S_MOV_B32) {
        const MachineOperand &Dst = MI.getOperand(0);
        MachineOperand &Src = MI.getOperand(1);

        if (Src.isImm() && Dst.getReg().isPhysical()) {
          int32_t ReverseImm;
          if (isKImmOperand(TII, Src))
            MI.setDesc(TII->get(AMDGPU::S_MOVK_I32));
          else if (isReverseInlineImm(TII, Src, ReverseImm)) {
            MI.setDesc(TII->get(AMDGPU::S_BREV_B32));
            Src.setImm(ReverseImm);
          }
        }

        continue;
      }

      // Shrink scalar logic operations.
      if (MI.getOpcode() == AMDGPU::S_AND_B32 ||
          MI.getOpcode() == AMDGPU::S_OR_B32 ||
          MI.getOpcode() == AMDGPU::S_XOR_B32) {
        if (shrinkScalarLogicOp(ST, MRI, TII, MI))
          continue;
      }

      if (TII->isMIMG(MI.getOpcode()) &&
          ST.getGeneration() >= AMDGPUSubtarget::GFX10 &&
          MF.getProperties().hasProperty(
              MachineFunctionProperties::Property::NoVRegs)) {
        shrinkMIMG(MI);
        continue;
      }

      if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
        continue;

      if (!TII->canShrink(MI, MRI)) {
        // Try commuting the instruction and see if that enables us to shrink
        // it.
        if (!MI.isCommutable() || !TII->commuteInstruction(MI) ||
            !TII->canShrink(MI, MRI))
          continue;
      }

      // getVOPe32 could be -1 here if we started with an instruction that had
      // a 32-bit encoding and then commuted it to an instruction that did not.
      if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
        continue;

      int Op32 = AMDGPU::getVOPe32(MI.getOpcode());

      if (TII->isVOPC(Op32)) {
        Register DstReg = MI.getOperand(0).getReg();
        if (DstReg.isVirtual()) {
          // VOPC instructions can only write to the VCC register. We can't
          // force them to use VCC here, because this is only one register and
          // cannot deal with sequences which would require multiple copies of
          // VCC, e.g. S_AND_B64 (vcc = V_CMP_...), (vcc = V_CMP_...)
          //
          // So, instead of forcing the instruction to write to VCC, we provide
          // a hint to the register allocator to use VCC and then we will run
          // this pass again after RA and shrink it if it outputs to VCC.
          MRI.setRegAllocationHint(MI.getOperand(0).getReg(), 0, VCCReg);
          continue;
        }
        if (DstReg != VCCReg)
          continue;
      }

      if (Op32 == AMDGPU::V_CNDMASK_B32_e32) {
        // We shrink V_CNDMASK_B32_e64 using regalloc hints like we do for VOPC
        // instructions.
        const MachineOperand *Src2 =
            TII->getNamedOperand(MI, AMDGPU::OpName::src2);
        if (!Src2->isReg())
          continue;
        Register SReg = Src2->getReg();
        if (SReg.isVirtual()) {
          MRI.setRegAllocationHint(SReg, 0, VCCReg);
          continue;
        }
        if (SReg != VCCReg)
          continue;
      }

      // Check for the bool flag output for instructions like V_ADD_I32_e64.
      const MachineOperand *SDst = TII->getNamedOperand(MI,
                                                        AMDGPU::OpName::sdst);

      // Check the carry-in operand for v_addc_u32_e64.
      const MachineOperand *Src2 = TII->getNamedOperand(MI,
                                                        AMDGPU::OpName::src2);

      if (SDst) {
        bool Next = false;

        if (SDst->getReg() != VCCReg) {
          if (SDst->getReg().isVirtual())
            MRI.setRegAllocationHint(SDst->getReg(), 0, VCCReg);
          Next = true;
        }

        // All of the instructions with carry outs also have an SGPR input in
        // src2.
        if (Src2 && Src2->getReg() != VCCReg) {
          if (Src2->getReg().isVirtual())
            MRI.setRegAllocationHint(Src2->getReg(), 0, VCCReg);
          Next = true;
        }

        if (Next)
          continue;
      }

      // We can shrink this instruction.
      LLVM_DEBUG(dbgs() << "Shrinking " << MI);

      MachineInstr *Inst32 = TII->buildShrunkInst(MI, Op32);
      ++NumInstructionsShrunk;

      // Copy extra operands not present in the instruction definition.
      copyExtraImplicitOps(*Inst32, MF, MI);

      MI.eraseFromParent();
      foldImmediates(*Inst32, TII, MRI);

      LLVM_DEBUG(dbgs() << "e32 MI = " << *Inst32 << '\n');
    }
  }
  return false;
}