1 //=== lib/CodeGen/GlobalISel/AMDGPUPostLegalizerCombiner.cpp ---------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass does combining of machine instructions at the generic MI level,
10 // after the legalizer.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPU.h"
15 #include "AMDGPUCombinerHelper.h"
16 #include "AMDGPULegalizerInfo.h"
17 #include "GCNSubtarget.h"
18 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
19 #include "llvm/CodeGen/GlobalISel/Combiner.h"
20 #include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
21 #include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
22 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
23 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
24 #include "llvm/CodeGen/MachineDominators.h"
25 #include "llvm/CodeGen/TargetPassConfig.h"
26 #include "llvm/IR/IntrinsicsAMDGPU.h"
27 #include "llvm/Target/TargetMachine.h"
28 
29 #define DEBUG_TYPE "amdgpu-postlegalizer-combiner"
30 
31 using namespace llvm;
32 using namespace MIPatternMatch;
33 
34 class AMDGPUPostLegalizerCombinerHelper {
35 protected:
36   MachineIRBuilder &B;
37   MachineFunction &MF;
38   MachineRegisterInfo &MRI;
39   AMDGPUCombinerHelper &Helper;
40 
41 public:
42   AMDGPUPostLegalizerCombinerHelper(MachineIRBuilder &B,
43                                     AMDGPUCombinerHelper &Helper)
44       : B(B), MF(B.getMF()), MRI(*B.getMRI()), Helper(Helper){};
45 
46   struct FMinFMaxLegacyInfo {
47     Register LHS;
48     Register RHS;
49     Register True;
50     Register False;
51     CmpInst::Predicate Pred;
52   };
53 
54   // TODO: Make sure fmin_legacy/fmax_legacy don't canonicalize
55   bool matchFMinFMaxLegacy(MachineInstr &MI, FMinFMaxLegacyInfo &Info);
56   void applySelectFCmpToFMinToFMaxLegacy(MachineInstr &MI,
57                                          const FMinFMaxLegacyInfo &Info);
58 
59   bool matchUCharToFloat(MachineInstr &MI);
60   void applyUCharToFloat(MachineInstr &MI);
61 
62   bool matchRcpSqrtToRsq(MachineInstr &MI,
63                          std::function<void(MachineIRBuilder &)> &MatchInfo);
64 
65   // FIXME: Should be able to have 2 separate matchdatas rather than custom
66   // struct boilerplate.
67   struct CvtF32UByteMatchInfo {
68     Register CvtVal;
69     unsigned ShiftOffset;
70   };
71 
72   bool matchCvtF32UByteN(MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo);
73   void applyCvtF32UByteN(MachineInstr &MI,
74                          const CvtF32UByteMatchInfo &MatchInfo);
75 
76   bool matchRemoveFcanonicalize(MachineInstr &MI, Register &Reg);
77 };
78 
79 bool AMDGPUPostLegalizerCombinerHelper::matchFMinFMaxLegacy(
80     MachineInstr &MI, FMinFMaxLegacyInfo &Info) {
81   // FIXME: Type predicate on pattern
82   if (MRI.getType(MI.getOperand(0).getReg()) != LLT::scalar(32))
83     return false;
84 
85   Register Cond = MI.getOperand(1).getReg();
86   if (!MRI.hasOneNonDBGUse(Cond) ||
87       !mi_match(Cond, MRI,
88                 m_GFCmp(m_Pred(Info.Pred), m_Reg(Info.LHS), m_Reg(Info.RHS))))
89     return false;
90 
91   Info.True = MI.getOperand(2).getReg();
92   Info.False = MI.getOperand(3).getReg();
93 
94   if (!(Info.LHS == Info.True && Info.RHS == Info.False) &&
95       !(Info.LHS == Info.False && Info.RHS == Info.True))
96     return false;
97 
98   switch (Info.Pred) {
99   case CmpInst::FCMP_FALSE:
100   case CmpInst::FCMP_OEQ:
101   case CmpInst::FCMP_ONE:
102   case CmpInst::FCMP_ORD:
103   case CmpInst::FCMP_UNO:
104   case CmpInst::FCMP_UEQ:
105   case CmpInst::FCMP_UNE:
106   case CmpInst::FCMP_TRUE:
107     return false;
108   default:
109     return true;
110   }
111 }
112 
113 void AMDGPUPostLegalizerCombinerHelper::applySelectFCmpToFMinToFMaxLegacy(
114     MachineInstr &MI, const FMinFMaxLegacyInfo &Info) {
115   B.setInstrAndDebugLoc(MI);
116   auto buildNewInst = [&MI, this](unsigned Opc, Register X, Register Y) {
117     B.buildInstr(Opc, {MI.getOperand(0)}, {X, Y}, MI.getFlags());
118   };
119 
120   switch (Info.Pred) {
121   case CmpInst::FCMP_ULT:
122   case CmpInst::FCMP_ULE:
123     if (Info.LHS == Info.True)
124       buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
125     else
126       buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
127     break;
128   case CmpInst::FCMP_OLE:
129   case CmpInst::FCMP_OLT: {
130     // We need to permute the operands to get the correct NaN behavior. The
131     // selected operand is the second one based on the failing compare with NaN,
132     // so permute it based on the compare type the hardware uses.
133     if (Info.LHS == Info.True)
134       buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
135     else
136       buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
137     break;
138   }
139   case CmpInst::FCMP_UGE:
140   case CmpInst::FCMP_UGT: {
141     if (Info.LHS == Info.True)
142       buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
143     else
144       buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
145     break;
146   }
147   case CmpInst::FCMP_OGT:
148   case CmpInst::FCMP_OGE: {
149     if (Info.LHS == Info.True)
150       buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
151     else
152       buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
153     break;
154   }
155   default:
156     llvm_unreachable("predicate should not have matched");
157   }
158 
159   MI.eraseFromParent();
160 }
161 
162 bool AMDGPUPostLegalizerCombinerHelper::matchUCharToFloat(MachineInstr &MI) {
163   Register DstReg = MI.getOperand(0).getReg();
164 
165   // TODO: We could try to match extracting the higher bytes, which would be
166   // easier if i8 vectors weren't promoted to i32 vectors, particularly after
167   // types are legalized. v4i8 -> v4f32 is probably the only case to worry
168   // about in practice.
169   LLT Ty = MRI.getType(DstReg);
170   if (Ty == LLT::scalar(32) || Ty == LLT::scalar(16)) {
171     Register SrcReg = MI.getOperand(1).getReg();
172     unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
173     assert(SrcSize == 16 || SrcSize == 32 || SrcSize == 64);
174     const APInt Mask = APInt::getHighBitsSet(SrcSize, SrcSize - 8);
175     return Helper.getKnownBits()->maskedValueIsZero(SrcReg, Mask);
176   }
177 
178   return false;
179 }
180 
181 void AMDGPUPostLegalizerCombinerHelper::applyUCharToFloat(MachineInstr &MI) {
182   B.setInstrAndDebugLoc(MI);
183 
184   const LLT S32 = LLT::scalar(32);
185 
186   Register DstReg = MI.getOperand(0).getReg();
187   Register SrcReg = MI.getOperand(1).getReg();
188   LLT Ty = MRI.getType(DstReg);
189   LLT SrcTy = MRI.getType(SrcReg);
190   if (SrcTy != S32)
191     SrcReg = B.buildAnyExtOrTrunc(S32, SrcReg).getReg(0);
192 
193   if (Ty == S32) {
194     B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {DstReg},
195                    {SrcReg}, MI.getFlags());
196   } else {
197     auto Cvt0 = B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {S32},
198                              {SrcReg}, MI.getFlags());
199     B.buildFPTrunc(DstReg, Cvt0, MI.getFlags());
200   }
201 
202   MI.eraseFromParent();
203 }
204 
205 bool AMDGPUPostLegalizerCombinerHelper::matchRcpSqrtToRsq(
206     MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) {
207 
208   auto getRcpSrc = [=](const MachineInstr &MI) {
209     MachineInstr *ResMI = nullptr;
210     if (MI.getOpcode() == TargetOpcode::G_INTRINSIC &&
211         MI.getIntrinsicID() == Intrinsic::amdgcn_rcp)
212       ResMI = MRI.getVRegDef(MI.getOperand(2).getReg());
213 
214     return ResMI;
215   };
216 
217   auto getSqrtSrc = [=](const MachineInstr &MI) {
218     MachineInstr *SqrtSrcMI = nullptr;
219     auto Match =
220         mi_match(MI.getOperand(0).getReg(), MRI, m_GFSqrt(m_MInstr(SqrtSrcMI)));
221     (void)Match;
222     return SqrtSrcMI;
223   };
224 
225   MachineInstr *RcpSrcMI = nullptr, *SqrtSrcMI = nullptr;
226   // rcp(sqrt(x))
227   if ((RcpSrcMI = getRcpSrc(MI)) && (SqrtSrcMI = getSqrtSrc(*RcpSrcMI))) {
228     MatchInfo = [SqrtSrcMI, &MI](MachineIRBuilder &B) {
229       B.buildIntrinsic(Intrinsic::amdgcn_rsq, {MI.getOperand(0)}, false)
230           .addUse(SqrtSrcMI->getOperand(0).getReg())
231           .setMIFlags(MI.getFlags());
232     };
233     return true;
234   }
235 
236   // sqrt(rcp(x))
237   if ((SqrtSrcMI = getSqrtSrc(MI)) && (RcpSrcMI = getRcpSrc(*SqrtSrcMI))) {
238     MatchInfo = [RcpSrcMI, &MI](MachineIRBuilder &B) {
239       B.buildIntrinsic(Intrinsic::amdgcn_rsq, {MI.getOperand(0)}, false)
240           .addUse(RcpSrcMI->getOperand(0).getReg())
241           .setMIFlags(MI.getFlags());
242     };
243     return true;
244   }
245 
246   return false;
247 }
248 
249 bool AMDGPUPostLegalizerCombinerHelper::matchCvtF32UByteN(
250     MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo) {
251   Register SrcReg = MI.getOperand(1).getReg();
252 
253   // Look through G_ZEXT.
254   bool IsShr = mi_match(SrcReg, MRI, m_GZExt(m_Reg(SrcReg)));
255 
256   Register Src0;
257   int64_t ShiftAmt;
258   IsShr = mi_match(SrcReg, MRI, m_GLShr(m_Reg(Src0), m_ICst(ShiftAmt)));
259   if (IsShr || mi_match(SrcReg, MRI, m_GShl(m_Reg(Src0), m_ICst(ShiftAmt)))) {
260     const unsigned Offset = MI.getOpcode() - AMDGPU::G_AMDGPU_CVT_F32_UBYTE0;
261 
262     unsigned ShiftOffset = 8 * Offset;
263     if (IsShr)
264       ShiftOffset += ShiftAmt;
265     else
266       ShiftOffset -= ShiftAmt;
267 
268     MatchInfo.CvtVal = Src0;
269     MatchInfo.ShiftOffset = ShiftOffset;
270     return ShiftOffset < 32 && ShiftOffset >= 8 && (ShiftOffset % 8) == 0;
271   }
272 
273   // TODO: Simplify demanded bits.
274   return false;
275 }
276 
277 void AMDGPUPostLegalizerCombinerHelper::applyCvtF32UByteN(
278     MachineInstr &MI, const CvtF32UByteMatchInfo &MatchInfo) {
279   B.setInstrAndDebugLoc(MI);
280   unsigned NewOpc = AMDGPU::G_AMDGPU_CVT_F32_UBYTE0 + MatchInfo.ShiftOffset / 8;
281 
282   const LLT S32 = LLT::scalar(32);
283   Register CvtSrc = MatchInfo.CvtVal;
284   LLT SrcTy = MRI.getType(MatchInfo.CvtVal);
285   if (SrcTy != S32) {
286     assert(SrcTy.isScalar() && SrcTy.getSizeInBits() >= 8);
287     CvtSrc = B.buildAnyExt(S32, CvtSrc).getReg(0);
288   }
289 
290   assert(MI.getOpcode() != NewOpc);
291   B.buildInstr(NewOpc, {MI.getOperand(0)}, {CvtSrc}, MI.getFlags());
292   MI.eraseFromParent();
293 }
294 
295 bool AMDGPUPostLegalizerCombinerHelper::matchRemoveFcanonicalize(
296     MachineInstr &MI, Register &Reg) {
297   const SITargetLowering *TLI = static_cast<const SITargetLowering *>(
298       MF.getSubtarget().getTargetLowering());
299   Reg = MI.getOperand(1).getReg();
300   return TLI->isCanonicalized(Reg, MF);
301 }
302 
303 class AMDGPUPostLegalizerCombinerHelperState {
304 protected:
305   AMDGPUCombinerHelper &Helper;
306   AMDGPUPostLegalizerCombinerHelper &PostLegalizerHelper;
307 
308   // Note: pointer is necessary because Target Predicates use
309   //   "Subtarget->"
310   const GCNSubtarget *Subtarget;
311 
312 public:
313   AMDGPUPostLegalizerCombinerHelperState(
314       AMDGPUCombinerHelper &Helper,
315       AMDGPUPostLegalizerCombinerHelper &PostLegalizerHelper,
316       const GCNSubtarget &Subtarget)
317       : Helper(Helper), PostLegalizerHelper(PostLegalizerHelper),
318         Subtarget(&Subtarget) {}
319 };
320 
321 #define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
322 #include "AMDGPUGenPostLegalizeGICombiner.inc"
323 #undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
324 
325 namespace {
326 #define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
327 #include "AMDGPUGenPostLegalizeGICombiner.inc"
328 #undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
329 
330 class AMDGPUPostLegalizerCombinerInfo final : public CombinerInfo {
331   GISelKnownBits *KB;
332   MachineDominatorTree *MDT;
333   const GCNSubtarget &Subtarget;
334 
335 public:
336   AMDGPUGenPostLegalizerCombinerHelperRuleConfig GeneratedRuleCfg;
337 
338   AMDGPUPostLegalizerCombinerInfo(const GCNSubtarget &Subtarget, bool EnableOpt,
339                                   bool OptSize, bool MinSize,
340                                   const AMDGPULegalizerInfo *LI,
341                                   GISelKnownBits *KB, MachineDominatorTree *MDT)
342       : CombinerInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true,
343                      /*LegalizerInfo*/ LI, EnableOpt, OptSize, MinSize),
344         KB(KB), MDT(MDT), Subtarget(Subtarget) {
345     if (!GeneratedRuleCfg.parseCommandLineOption())
346       report_fatal_error("Invalid rule identifier");
347   }
348 
349   bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
350                MachineIRBuilder &B) const override;
351 };
352 
353 bool AMDGPUPostLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
354                                               MachineInstr &MI,
355                                               MachineIRBuilder &B) const {
356   AMDGPUCombinerHelper Helper(Observer, B, /*IsPreLegalize*/ false, KB, MDT,
357                               LInfo);
358   AMDGPUPostLegalizerCombinerHelper PostLegalizerHelper(B, Helper);
359   AMDGPUGenPostLegalizerCombinerHelper Generated(
360       GeneratedRuleCfg, Helper, PostLegalizerHelper, Subtarget);
361 
362   if (Generated.tryCombineAll(Observer, MI, B))
363     return true;
364 
365   switch (MI.getOpcode()) {
366   case TargetOpcode::G_SHL:
367   case TargetOpcode::G_LSHR:
368   case TargetOpcode::G_ASHR:
369     // On some subtargets, 64-bit shift is a quarter rate instruction. In the
370     // common case, splitting this into a move and a 32-bit shift is faster and
371     // the same code size.
372     return Helper.tryCombineShiftToUnmerge(MI, 32);
373   }
374 
375   return false;
376 }
377 
378 #define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
379 #include "AMDGPUGenPostLegalizeGICombiner.inc"
380 #undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
381 
382 // Pass boilerplate
383 // ================
384 
385 class AMDGPUPostLegalizerCombiner : public MachineFunctionPass {
386 public:
387   static char ID;
388 
389   AMDGPUPostLegalizerCombiner(bool IsOptNone = false);
390 
391   StringRef getPassName() const override {
392     return "AMDGPUPostLegalizerCombiner";
393   }
394 
395   bool runOnMachineFunction(MachineFunction &MF) override;
396 
397   void getAnalysisUsage(AnalysisUsage &AU) const override;
398 private:
399   bool IsOptNone;
400 };
401 } // end anonymous namespace
402 
403 void AMDGPUPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
404   AU.addRequired<TargetPassConfig>();
405   AU.setPreservesCFG();
406   getSelectionDAGFallbackAnalysisUsage(AU);
407   AU.addRequired<GISelKnownBitsAnalysis>();
408   AU.addPreserved<GISelKnownBitsAnalysis>();
409   if (!IsOptNone) {
410     AU.addRequired<MachineDominatorTree>();
411     AU.addPreserved<MachineDominatorTree>();
412   }
413   MachineFunctionPass::getAnalysisUsage(AU);
414 }
415 
416 AMDGPUPostLegalizerCombiner::AMDGPUPostLegalizerCombiner(bool IsOptNone)
417   : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
418   initializeAMDGPUPostLegalizerCombinerPass(*PassRegistry::getPassRegistry());
419 }
420 
421 bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
422   if (MF.getProperties().hasProperty(
423           MachineFunctionProperties::Property::FailedISel))
424     return false;
425   auto *TPC = &getAnalysis<TargetPassConfig>();
426   const Function &F = MF.getFunction();
427   bool EnableOpt =
428       MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);
429 
430   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
431   const AMDGPULegalizerInfo *LI
432     = static_cast<const AMDGPULegalizerInfo *>(ST.getLegalizerInfo());
433 
434   GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
435   MachineDominatorTree *MDT =
436       IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
437   AMDGPUPostLegalizerCombinerInfo PCInfo(ST, EnableOpt, F.hasOptSize(),
438                                          F.hasMinSize(), LI, KB, MDT);
439   Combiner C(PCInfo, TPC);
440   return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr);
441 }
442 
443 char AMDGPUPostLegalizerCombiner::ID = 0;
444 INITIALIZE_PASS_BEGIN(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
445                       "Combine AMDGPU machine instrs after legalization",
446                       false, false)
447 INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
448 INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
449 INITIALIZE_PASS_END(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
450                     "Combine AMDGPU machine instrs after legalization", false,
451                     false)
452 
453 namespace llvm {
454 FunctionPass *createAMDGPUPostLegalizeCombiner(bool IsOptNone) {
455   return new AMDGPUPostLegalizerCombiner(IsOptNone);
456 }
457 } // end namespace llvm
458