//=== AMDGPUPostLegalizerCombiner.cpp -------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass does combining of machine instructions at the generic MI level,
// after the legalizer.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUCombinerHelper.h"
#include "AMDGPULegalizerInfo.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Target/TargetMachine.h"

#define GET_GICOMBINER_DEPS
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef GET_GICOMBINER_DEPS

#define DEBUG_TYPE "amdgpu-postlegalizer-combiner"

using namespace llvm;
using namespace MIPatternMatch;

namespace {
#define GET_GICOMBINER_TYPES
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef GET_GICOMBINER_TYPES

class AMDGPUPostLegalizerCombinerImpl : public Combiner {
protected:
  const AMDGPUPostLegalizerCombinerImplRuleConfig &RuleConfig;
  const GCNSubtarget &STI;
  const SIInstrInfo &TII;
  // TODO: Make CombinerHelper methods const.
  mutable AMDGPUCombinerHelper Helper;

public:
  AMDGPUPostLegalizerCombinerImpl(
      MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC,
      GISelKnownBits &KB, GISelCSEInfo *CSEInfo,
      const AMDGPUPostLegalizerCombinerImplRuleConfig &RuleConfig,
      const GCNSubtarget &STI, MachineDominatorTree *MDT,
      const LegalizerInfo *LI);

  static const char *getName() { return "AMDGPUPostLegalizerCombinerImpl"; }

  bool tryCombineAllImpl(MachineInstr &I) const;
  bool tryCombineAll(MachineInstr &I) const override;

  struct FMinFMaxLegacyInfo {
    Register LHS;
    Register RHS;
    Register True;
    Register False;
    CmpInst::Predicate Pred;
  };

  // TODO: Make sure fmin_legacy/fmax_legacy don't canonicalize
  bool matchFMinFMaxLegacy(MachineInstr &MI, FMinFMaxLegacyInfo &Info) const;
  void applySelectFCmpToFMinToFMaxLegacy(MachineInstr &MI,
                                         const FMinFMaxLegacyInfo &Info) const;

  bool matchUCharToFloat(MachineInstr &MI) const;
  void applyUCharToFloat(MachineInstr &MI) const;

  bool
  matchRcpSqrtToRsq(MachineInstr &MI,
                    std::function<void(MachineIRBuilder &)> &MatchInfo) const;

  // FIXME: Should be able to have 2 separate matchdatas rather than custom
  // struct boilerplate.
  struct CvtF32UByteMatchInfo {
    Register CvtVal;
    unsigned ShiftOffset;
  };

  bool matchCvtF32UByteN(MachineInstr &MI,
                         CvtF32UByteMatchInfo &MatchInfo) const;
  void applyCvtF32UByteN(MachineInstr &MI,
                         const CvtF32UByteMatchInfo &MatchInfo) const;

  bool matchRemoveFcanonicalize(MachineInstr &MI, Register &Reg) const;

  // Combine an unsigned buffer load and a sign extension instruction to
  // generate a signed buffer load instruction.
  bool matchCombineSignExtendInReg(MachineInstr &MI,
                                   MachineInstr *&MatchInfo) const;
  void applyCombineSignExtendInReg(MachineInstr &MI,
                                   MachineInstr *&MatchInfo) const;

private:
#define GET_GICOMBINER_CLASS_MEMBERS
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef GET_GICOMBINER_CLASS_MEMBERS
#undef AMDGPUSubtarget
};

#define GET_GICOMBINER_IMPL
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef AMDGPUSubtarget
#undef GET_GICOMBINER_IMPL

AMDGPUPostLegalizerCombinerImpl::AMDGPUPostLegalizerCombinerImpl(
    MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC,
    GISelKnownBits &KB, GISelCSEInfo *CSEInfo,
    const AMDGPUPostLegalizerCombinerImplRuleConfig &RuleConfig,
    const GCNSubtarget &STI, MachineDominatorTree *MDT, const LegalizerInfo *LI)
    : Combiner(MF, CInfo, TPC, &KB, CSEInfo), RuleConfig(RuleConfig), STI(STI),
      TII(*STI.getInstrInfo()),
      Helper(Observer, B, /*IsPreLegalize*/ false, &KB, MDT, LI),
#define GET_GICOMBINER_CONSTRUCTOR_INITS
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef GET_GICOMBINER_CONSTRUCTOR_INITS
{
}

bool AMDGPUPostLegalizerCombinerImpl::tryCombineAll(MachineInstr &MI) const {
  if (tryCombineAllImpl(MI))
    return true;

  switch (MI.getOpcode()) {
  case TargetOpcode::G_SHL:
  case TargetOpcode::G_LSHR:
  case TargetOpcode::G_ASHR:
    // On some subtargets, 64-bit shift is a quarter rate instruction. In the
    // common case, splitting this into a move and a 32-bit shift is faster and
    // the same code size.
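    // For example (informal sketch, register names invented; the exact form
    // depends on the CombinerHelper), a 64-bit logical shift right by a
    // constant of 40 can be rewritten roughly as:
    //   %lo:_(s32), %hi:_(s32) = G_UNMERGE_VALUES %x:_(s64)
    //   %c8:_(s32) = G_CONSTANT i32 8
    //   %shift:_(s32) = G_LSHR %hi, %c8
    //   %zero:_(s32) = G_CONSTANT i32 0
    //   %res:_(s64) = G_MERGE_VALUES %shift, %zero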
    return Helper.tryCombineShiftToUnmerge(MI, 32);
  }

  return false;
}

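// Match a G_SELECT whose condition is a single-use G_FCMP on the same two
// values being selected. Illustrative MIR sketch (register names invented):
//   %c:_(s1) = G_FCMP floatpred(olt), %a:_(s32), %b:_(s32)
//   %d:_(s32) = G_SELECT %c, %a, %b
// which the apply step below turns into
//   %d:_(s32) = G_AMDGPU_FMIN_LEGACY %a, %b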
bool AMDGPUPostLegalizerCombinerImpl::matchFMinFMaxLegacy(
    MachineInstr &MI, FMinFMaxLegacyInfo &Info) const {
  // FIXME: Type predicate on pattern
  if (MRI.getType(MI.getOperand(0).getReg()) != LLT::scalar(32))
    return false;

  Register Cond = MI.getOperand(1).getReg();
  if (!MRI.hasOneNonDBGUse(Cond) ||
      !mi_match(Cond, MRI,
                m_GFCmp(m_Pred(Info.Pred), m_Reg(Info.LHS), m_Reg(Info.RHS))))
    return false;

  Info.True = MI.getOperand(2).getReg();
  Info.False = MI.getOperand(3).getReg();

  // TODO: Handle the case where the selected value is an fneg and the compared
  // constant is the negation of the selected value.
  if (!(Info.LHS == Info.True && Info.RHS == Info.False) &&
      !(Info.LHS == Info.False && Info.RHS == Info.True))
    return false;

  switch (Info.Pred) {
  case CmpInst::FCMP_FALSE:
  case CmpInst::FCMP_OEQ:
  case CmpInst::FCMP_ONE:
  case CmpInst::FCMP_ORD:
  case CmpInst::FCMP_UNO:
  case CmpInst::FCMP_UEQ:
  case CmpInst::FCMP_UNE:
  case CmpInst::FCMP_TRUE:
    return false;
  default:
    return true;
  }
}

void AMDGPUPostLegalizerCombinerImpl::applySelectFCmpToFMinToFMaxLegacy(
    MachineInstr &MI, const FMinFMaxLegacyInfo &Info) const {
  B.setInstrAndDebugLoc(MI);
  auto buildNewInst = [&MI, this](unsigned Opc, Register X, Register Y) {
    B.buildInstr(Opc, {MI.getOperand(0)}, {X, Y}, MI.getFlags());
  };

  switch (Info.Pred) {
  case CmpInst::FCMP_ULT:
  case CmpInst::FCMP_ULE:
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
    break;
  case CmpInst::FCMP_OLE:
  case CmpInst::FCMP_OLT: {
    // We need to permute the operands to get the correct NaN behavior. The
    // selected operand is the second one based on the failing compare with NaN,
    // so permute it based on the compare type the hardware uses.
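    // For instance (informal sketch): for select (olt %a, %b), %a, %b the
    // result must be %b when either input is NaN, and G_AMDGPU_FMIN_LEGACY
    // %a, %b gives exactly that, since the legacy compare fails on NaN and
    // the second operand is returned.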
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
    break;
  }
  case CmpInst::FCMP_UGE:
  case CmpInst::FCMP_UGT: {
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
    break;
  }
  case CmpInst::FCMP_OGT:
  case CmpInst::FCMP_OGE: {
    if (Info.LHS == Info.True)
      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
    else
      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
    break;
  }
  default:
    llvm_unreachable("predicate should not have matched");
  }

  MI.eraseFromParent();
}

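// Match a G_UITOFP/G_SITOFP whose source is known to have no bits set above
// the low byte, so the conversion can use the byte-extracting hardware
// conversion. Illustrative sketch (register names invented):
//   %c:_(s32) = G_CONSTANT i32 255
//   %b:_(s32) = G_AND %x, %c
//   %f:_(s32) = G_UITOFP %b
// is rewritten by the apply step into
//   %f:_(s32) = G_AMDGPU_CVT_F32_UBYTE0 %b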
bool AMDGPUPostLegalizerCombinerImpl::matchUCharToFloat(
    MachineInstr &MI) const {
  Register DstReg = MI.getOperand(0).getReg();

  // TODO: We could try to match extracting the higher bytes, which would be
  // easier if i8 vectors weren't promoted to i32 vectors, particularly after
  // types are legalized. v4i8 -> v4f32 is probably the only case to worry
  // about in practice.
  LLT Ty = MRI.getType(DstReg);
  if (Ty == LLT::scalar(32) || Ty == LLT::scalar(16)) {
    Register SrcReg = MI.getOperand(1).getReg();
    unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
    assert(SrcSize == 16 || SrcSize == 32 || SrcSize == 64);
    const APInt Mask = APInt::getHighBitsSet(SrcSize, SrcSize - 8);
    return Helper.getKnownBits()->maskedValueIsZero(SrcReg, Mask);
  }

  return false;
}

void AMDGPUPostLegalizerCombinerImpl::applyUCharToFloat(
    MachineInstr &MI) const {
  B.setInstrAndDebugLoc(MI);

  const LLT S32 = LLT::scalar(32);

  Register DstReg = MI.getOperand(0).getReg();
  Register SrcReg = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(DstReg);
  LLT SrcTy = MRI.getType(SrcReg);
  if (SrcTy != S32)
    SrcReg = B.buildAnyExtOrTrunc(S32, SrcReg).getReg(0);

  if (Ty == S32) {
    B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {DstReg}, {SrcReg},
                 MI.getFlags());
  } else {
    auto Cvt0 = B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {S32}, {SrcReg},
                             MI.getFlags());
    B.buildFPTrunc(DstReg, Cvt0, MI.getFlags());
  }

  MI.eraseFromParent();
}

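// Fold a reciprocal of a square root (or a square root of a reciprocal) into
// a single rsq intrinsic when both operations carry the contract flag.
// Illustrative sketch (register names invented):
//   %s:_(s32) = contract G_FSQRT %x
//   %r:_(s32) = contract G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), %s
// becomes
//   %r:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rsq), %x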
bool AMDGPUPostLegalizerCombinerImpl::matchRcpSqrtToRsq(
    MachineInstr &MI,
    std::function<void(MachineIRBuilder &)> &MatchInfo) const {
  auto getRcpSrc = [=](const MachineInstr &MI) -> MachineInstr * {
    if (!MI.getFlag(MachineInstr::FmContract))
      return nullptr;

    if (auto *GI = dyn_cast<GIntrinsic>(&MI)) {
      if (GI->is(Intrinsic::amdgcn_rcp))
        return MRI.getVRegDef(MI.getOperand(2).getReg());
    }
    return nullptr;
  };

  auto getSqrtSrc = [=](const MachineInstr &MI) -> MachineInstr * {
    if (!MI.getFlag(MachineInstr::FmContract))
      return nullptr;
    MachineInstr *SqrtSrcMI = nullptr;
    auto Match =
        mi_match(MI.getOperand(0).getReg(), MRI, m_GFSqrt(m_MInstr(SqrtSrcMI)));
    (void)Match;
    return SqrtSrcMI;
  };

  MachineInstr *RcpSrcMI = nullptr, *SqrtSrcMI = nullptr;
  // rcp(sqrt(x))
  if ((RcpSrcMI = getRcpSrc(MI)) && (SqrtSrcMI = getSqrtSrc(*RcpSrcMI))) {
    MatchInfo = [SqrtSrcMI, &MI](MachineIRBuilder &B) {
      B.buildIntrinsic(Intrinsic::amdgcn_rsq, {MI.getOperand(0)})
          .addUse(SqrtSrcMI->getOperand(0).getReg())
          .setMIFlags(MI.getFlags());
    };
    return true;
  }

  // sqrt(rcp(x))
  if ((SqrtSrcMI = getSqrtSrc(MI)) && (RcpSrcMI = getRcpSrc(*SqrtSrcMI))) {
    MatchInfo = [RcpSrcMI, &MI](MachineIRBuilder &B) {
      B.buildIntrinsic(Intrinsic::amdgcn_rsq, {MI.getOperand(0)})
          .addUse(RcpSrcMI->getOperand(0).getReg())
          .setMIFlags(MI.getFlags());
    };
    return true;
  }
  return false;
}

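// Fold a constant shift feeding one of the byte-conversion opcodes into a
// different byte index. Illustrative sketch (register names invented):
//   %c:_(s32) = G_CONSTANT i32 16
//   %s:_(s32) = G_LSHR %x, %c
//   %f:_(s32) = G_AMDGPU_CVT_F32_UBYTE0 %s
// becomes
//   %f:_(s32) = G_AMDGPU_CVT_F32_UBYTE2 %x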
bool AMDGPUPostLegalizerCombinerImpl::matchCvtF32UByteN(
    MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo) const {
  Register SrcReg = MI.getOperand(1).getReg();

  // Look through G_ZEXT.
  bool IsShr = mi_match(SrcReg, MRI, m_GZExt(m_Reg(SrcReg)));

  Register Src0;
  int64_t ShiftAmt;
  IsShr = mi_match(SrcReg, MRI, m_GLShr(m_Reg(Src0), m_ICst(ShiftAmt)));
  if (IsShr || mi_match(SrcReg, MRI, m_GShl(m_Reg(Src0), m_ICst(ShiftAmt)))) {
    const unsigned Offset = MI.getOpcode() - AMDGPU::G_AMDGPU_CVT_F32_UBYTE0;

    unsigned ShiftOffset = 8 * Offset;
    if (IsShr)
      ShiftOffset += ShiftAmt;
    else
      ShiftOffset -= ShiftAmt;

    MatchInfo.CvtVal = Src0;
    MatchInfo.ShiftOffset = ShiftOffset;
    return ShiftOffset < 32 && ShiftOffset >= 8 && (ShiftOffset % 8) == 0;
  }

  // TODO: Simplify demanded bits.
  return false;
}

void AMDGPUPostLegalizerCombinerImpl::applyCvtF32UByteN(
    MachineInstr &MI, const CvtF32UByteMatchInfo &MatchInfo) const {
  B.setInstrAndDebugLoc(MI);
  unsigned NewOpc = AMDGPU::G_AMDGPU_CVT_F32_UBYTE0 + MatchInfo.ShiftOffset / 8;

  const LLT S32 = LLT::scalar(32);
  Register CvtSrc = MatchInfo.CvtVal;
  LLT SrcTy = MRI.getType(MatchInfo.CvtVal);
  if (SrcTy != S32) {
    assert(SrcTy.isScalar() && SrcTy.getSizeInBits() >= 8);
    CvtSrc = B.buildAnyExt(S32, CvtSrc).getReg(0);
  }

  assert(MI.getOpcode() != NewOpc);
  B.buildInstr(NewOpc, {MI.getOperand(0)}, {CvtSrc}, MI.getFlags());
  MI.eraseFromParent();
}

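// Drop a G_FCANONICALIZE whose source is already canonical. For example
// (informal), %y:_(s32) = G_FCANONICALIZE %x can be replaced by %x when %x is
// produced by an instruction whose result is already canonical, such as a
// floating-point arithmetic operation.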
bool AMDGPUPostLegalizerCombinerImpl::matchRemoveFcanonicalize(
    MachineInstr &MI, Register &Reg) const {
  const SITargetLowering *TLI = static_cast<const SITargetLowering *>(
      MF.getSubtarget().getTargetLowering());
  Reg = MI.getOperand(1).getReg();
  return TLI->isCanonicalized(Reg, MF);
}

// The buffer_load_{i8, i16} intrinsics are initially lowered as
// buffer_load_{u8, u16} instructions. Here, the buffer_load_{u8, u16}
// instructions are combined with sign extension instructions in order to
// generate buffer_load_{i8, i16} instructions.
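// For example (illustrative MIR sketch, operands elided):
//   %w:_(s32) = G_AMDGPU_BUFFER_LOAD_UBYTE ...
//   %s:_(s32) = G_SEXT_INREG %w, 8
// becomes
//   %s:_(s32) = G_AMDGPU_BUFFER_LOAD_SBYTE ...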

// Identify buffer_load_{u8, u16}.
bool AMDGPUPostLegalizerCombinerImpl::matchCombineSignExtendInReg(
    MachineInstr &MI, MachineInstr *&SubwordBufferLoad) const {
  Register Op0Reg = MI.getOperand(1).getReg();
  SubwordBufferLoad = MRI.getVRegDef(Op0Reg);

  if (!MRI.hasOneNonDBGUse(Op0Reg))
    return false;

  // Check if the first operand of the sign extension is a subword buffer load
  // instruction.
  return SubwordBufferLoad->getOpcode() == AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE ||
         SubwordBufferLoad->getOpcode() == AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
}

// Combine buffer_load_{u8, u16} and the sign extension instruction to generate
// buffer_load_{i8, i16}.
void AMDGPUPostLegalizerCombinerImpl::applyCombineSignExtendInReg(
    MachineInstr &MI, MachineInstr *&SubwordBufferLoad) const {
  // Modify the opcode and the destination of buffer_load_{u8, u16}:
  // Replace the opcode.
  unsigned Opc =
      SubwordBufferLoad->getOpcode() == AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE
          ? AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE
          : AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT;
  SubwordBufferLoad->setDesc(TII.get(Opc));
  // Update the destination register of SubwordBufferLoad with the destination
  // register of the sign extension.
  Register SignExtendInsnDst = MI.getOperand(0).getReg();
  SubwordBufferLoad->getOperand(0).setReg(SignExtendInsnDst);
  // Remove the sign extension.
  MI.eraseFromParent();
}

// Pass boilerplate
// ================

class AMDGPUPostLegalizerCombiner : public MachineFunctionPass {
public:
  static char ID;

  AMDGPUPostLegalizerCombiner(bool IsOptNone = false);

  StringRef getPassName() const override {
    return "AMDGPUPostLegalizerCombiner";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override;

private:
  bool IsOptNone;
  AMDGPUPostLegalizerCombinerImplRuleConfig RuleConfig;
};
} // end anonymous namespace

void AMDGPUPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetPassConfig>();
  AU.setPreservesCFG();
  getSelectionDAGFallbackAnalysisUsage(AU);
  AU.addRequired<GISelKnownBitsAnalysis>();
  AU.addPreserved<GISelKnownBitsAnalysis>();
  if (!IsOptNone) {
    AU.addRequired<MachineDominatorTree>();
    AU.addPreserved<MachineDominatorTree>();
  }
  MachineFunctionPass::getAnalysisUsage(AU);
}

AMDGPUPostLegalizerCombiner::AMDGPUPostLegalizerCombiner(bool IsOptNone)
    : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
  initializeAMDGPUPostLegalizerCombinerPass(*PassRegistry::getPassRegistry());

  if (!RuleConfig.parseCommandLineOption())
    report_fatal_error("Invalid rule identifier");
}

bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
  if (MF.getProperties().hasProperty(
          MachineFunctionProperties::Property::FailedISel))
    return false;
  auto *TPC = &getAnalysis<TargetPassConfig>();
  const Function &F = MF.getFunction();
  bool EnableOpt =
      MF.getTarget().getOptLevel() != CodeGenOptLevel::None && !skipFunction(F);

  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
  const AMDGPULegalizerInfo *LI =
      static_cast<const AMDGPULegalizerInfo *>(ST.getLegalizerInfo());

  GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
  MachineDominatorTree *MDT =
      IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();

  CombinerInfo CInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true,
                     LI, EnableOpt, F.hasOptSize(), F.hasMinSize());

  AMDGPUPostLegalizerCombinerImpl Impl(MF, CInfo, TPC, *KB, /*CSEInfo*/ nullptr,
                                       RuleConfig, ST, MDT, LI);
  return Impl.combineMachineInstrs();
}

char AMDGPUPostLegalizerCombiner::ID = 0;
INITIALIZE_PASS_BEGIN(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
                      "Combine AMDGPU machine instrs after legalization", false,
                      false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
INITIALIZE_PASS_END(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
                    "Combine AMDGPU machine instrs after legalization", false,
                    false)

namespace llvm {
FunctionPass *createAMDGPUPostLegalizeCombiner(bool IsOptNone) {
  return new AMDGPUPostLegalizerCombiner(IsOptNone);
}
} // end namespace llvm