1 //=== lib/CodeGen/GlobalISel/AMDGPUPreLegalizerCombiner.cpp ---------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass does combining of machine instructions at the generic MI level,
10 // before the legalizer.
11 //
12 //===----------------------------------------------------------------------===//
13 
#include "AMDGPU.h"
#include "AMDGPULegalizerInfo.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Target/TargetMachine.h"
#include <algorithm>
#include <limits>
26 
27 #define DEBUG_TYPE "amdgpu-prelegalizer-combiner"
28 
29 using namespace llvm;
30 using namespace MIPatternMatch;
31 
32 class AMDGPUPreLegalizerCombinerHelper {
33 protected:
34   MachineIRBuilder &B;
35   MachineFunction &MF;
36   MachineRegisterInfo &MRI;
37   CombinerHelper &Helper;
38 
39 public:
40   AMDGPUPreLegalizerCombinerHelper(MachineIRBuilder &B, CombinerHelper &Helper)
41       : B(B), MF(B.getMF()), MRI(*B.getMRI()), Helper(Helper){};
42 
43   struct ClampI64ToI16MatchInfo {
44     int64_t Cmp1 = 0;
45     int64_t Cmp2 = 0;
46     Register Origin;
47   };
48 
49   bool matchClampI64ToI16(MachineInstr &MI, MachineRegisterInfo &MRI,
50                           MachineFunction &MF,
51                           ClampI64ToI16MatchInfo &MatchInfo);
52 
53   void applyClampI64ToI16(MachineInstr &MI,
54                           const ClampI64ToI16MatchInfo &MatchInfo);
55 };
56 
57 bool AMDGPUPreLegalizerCombinerHelper::matchClampI64ToI16(
58     MachineInstr &MI, MachineRegisterInfo &MRI, MachineFunction &MF,
59     ClampI64ToI16MatchInfo &MatchInfo) {
60   assert(MI.getOpcode() == TargetOpcode::G_TRUNC && "Invalid instruction!");
61 
62   // Try to find a pattern where an i64 value should get clamped to short.
63   const LLT SrcType = MRI.getType(MI.getOperand(1).getReg());
64   if (SrcType != LLT::scalar(64))
65     return false;
66 
67   const LLT DstType = MRI.getType(MI.getOperand(0).getReg());
68   if (DstType != LLT::scalar(16))
69     return false;
70 
71   Register Base;
72 
73   auto IsApplicableForCombine = [&MatchInfo]() -> bool {
74     const auto Cmp1 = MatchInfo.Cmp1;
75     const auto Cmp2 = MatchInfo.Cmp2;
76     const auto Diff = std::abs(Cmp2 - Cmp1);
77 
78     // If the difference between both comparison values is 0 or 1, there is no
79     // need to clamp.
80     if (Diff == 0 || Diff == 1)
81       return false;
82 
83     const int64_t Min = std::numeric_limits<int16_t>::min();
84     const int64_t Max = std::numeric_limits<int16_t>::max();
85 
86     // Check if the comparison values are between SHORT_MIN and SHORT_MAX.
87     return ((Cmp2 >= Cmp1 && Cmp1 >= Min && Cmp2 <= Max) ||
88             (Cmp1 >= Cmp2 && Cmp1 <= Max && Cmp2 >= Min));
89   };
90 
91   // Try to match a combination of min / max MIR opcodes.
92   if (mi_match(MI.getOperand(1).getReg(), MRI,
93                m_GSMin(m_Reg(Base), m_ICst(MatchInfo.Cmp1)))) {
94     if (mi_match(Base, MRI,
95                  m_GSMax(m_Reg(MatchInfo.Origin), m_ICst(MatchInfo.Cmp2)))) {
96       return IsApplicableForCombine();
97     }
98   }
99 
100   if (mi_match(MI.getOperand(1).getReg(), MRI,
101                m_GSMax(m_Reg(Base), m_ICst(MatchInfo.Cmp1)))) {
102     if (mi_match(Base, MRI,
103                  m_GSMin(m_Reg(MatchInfo.Origin), m_ICst(MatchInfo.Cmp2)))) {
104       return IsApplicableForCombine();
105     }
106   }
107 
108   return false;
109 }
110 
111 // We want to find a combination of instructions that
112 // gets generated when an i64 gets clamped to i16.
113 // The corresponding pattern is:
114 // G_MAX / G_MAX for i16 <= G_TRUNC i64.
115 // This can be efficiently written as following:
116 // v_cvt_pk_i16_i32 v0, v0, v1
117 // v_med3_i32 v0, Clamp_Min, v0, Clamp_Max
118 void AMDGPUPreLegalizerCombinerHelper::applyClampI64ToI16(
119     MachineInstr &MI, const ClampI64ToI16MatchInfo &MatchInfo) {
120 
121   Register Src = MatchInfo.Origin;
122   assert(MI.getParent()->getParent()->getRegInfo().getType(Src) ==
123          LLT::scalar(64));
124   const LLT S32 = LLT::scalar(32);
125 
126   B.setMBB(*MI.getParent());
127   B.setInstrAndDebugLoc(MI);
128 
129   auto Unmerge = B.buildUnmerge(S32, Src);
130 
131   assert(MI.getOpcode() != AMDGPU::G_AMDGPU_CVT_PK_I16_I32);
132 
133   const LLT V2S16 = LLT::fixed_vector(2, 16);
134   auto CvtPk =
135       B.buildInstr(AMDGPU::G_AMDGPU_CVT_PK_I16_I32, {V2S16},
136                    {Unmerge.getReg(0), Unmerge.getReg(1)}, MI.getFlags());
137 
138   auto MinBoundary = std::min(MatchInfo.Cmp1, MatchInfo.Cmp2);
139   auto MaxBoundary = std::max(MatchInfo.Cmp1, MatchInfo.Cmp2);
140   auto MinBoundaryDst = B.buildConstant(S32, MinBoundary);
141   auto MaxBoundaryDst = B.buildConstant(S32, MaxBoundary);
142 
143   auto Bitcast = B.buildBitcast({S32}, CvtPk);
144 
145   auto Med3 = B.buildInstr(
146       AMDGPU::G_AMDGPU_SMED3, {S32},
147       {MinBoundaryDst.getReg(0), Bitcast.getReg(0), MaxBoundaryDst.getReg(0)},
148       MI.getFlags());
149 
150   B.buildTrunc(MI.getOperand(0).getReg(), Med3);
151 
152   MI.eraseFromParent();
153 }
154 
// State made available to the tablegen-generated combine rules (included
// below from AMDGPUGenPreLegalizeGICombiner.inc) — presumably the generated
// helper class derives from this so rule bodies can refer to `Helper` and
// `PreLegalizerHelper` by name; confirm against the generated .inc.
class AMDGPUPreLegalizerCombinerHelperState {
protected:
  CombinerHelper &Helper;
  AMDGPUPreLegalizerCombinerHelper &PreLegalizerHelper;

public:
  AMDGPUPreLegalizerCombinerHelperState(
      CombinerHelper &Helper,
      AMDGPUPreLegalizerCombinerHelper &PreLegalizerHelper)
      : Helper(Helper), PreLegalizerHelper(PreLegalizerHelper) {}
};
166 
167 #define AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
168 #include "AMDGPUGenPreLegalizeGICombiner.inc"
169 #undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
170 
171 namespace {
172 #define AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
173 #include "AMDGPUGenPreLegalizeGICombiner.inc"
174 #undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
175 
176 class AMDGPUPreLegalizerCombinerInfo final : public CombinerInfo {
177   GISelKnownBits *KB;
178   MachineDominatorTree *MDT;
179 
180 public:
181   AMDGPUGenPreLegalizerCombinerHelperRuleConfig GeneratedRuleCfg;
182 
183   AMDGPUPreLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize,
184                                   GISelKnownBits *KB, MachineDominatorTree *MDT)
185       : CombinerInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false,
186                      /*LegalizerInfo*/ nullptr, EnableOpt, OptSize, MinSize),
187         KB(KB), MDT(MDT) {
188     if (!GeneratedRuleCfg.parseCommandLineOption())
189       report_fatal_error("Invalid rule identifier");
190   }
191 
192   virtual bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
193                        MachineIRBuilder &B) const override;
194 };
195 
196 bool AMDGPUPreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
197                                               MachineInstr &MI,
198                                               MachineIRBuilder &B) const {
199   CombinerHelper Helper(Observer, B, KB, MDT);
200   AMDGPUPreLegalizerCombinerHelper PreLegalizerHelper(B, Helper);
201   AMDGPUGenPreLegalizerCombinerHelper Generated(GeneratedRuleCfg, Helper,
202                                                 PreLegalizerHelper);
203 
204   if (Generated.tryCombineAll(Observer, MI, B, Helper))
205     return true;
206 
207   switch (MI.getOpcode()) {
208   case TargetOpcode::G_MEMCPY_INLINE:
209     return Helper.tryEmitMemcpyInline(MI);
210   case TargetOpcode::G_CONCAT_VECTORS:
211     return Helper.tryCombineConcatVectors(MI);
212   case TargetOpcode::G_SHUFFLE_VECTOR:
213     return Helper.tryCombineShuffleVector(MI);
214   }
215 
216   return false;
217 }
218 
219 #define AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
220 #include "AMDGPUGenPreLegalizeGICombiner.inc"
221 #undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
222 
223 // Pass boilerplate
224 // ================
225 
// MachineFunctionPass wrapper that runs the pre-legalizer combiner over a
// whole machine function.
class AMDGPUPreLegalizerCombiner : public MachineFunctionPass {
public:
  static char ID; // Pass identification.

  AMDGPUPreLegalizerCombiner(bool IsOptNone = false);

  StringRef getPassName() const override {
    return "AMDGPUPreLegalizerCombiner";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override;
private:
  // True at -O0; when set, MachineDominatorTree is neither required nor used.
  bool IsOptNone;
};
242 } // end anonymous namespace
243 
// Declare the analyses this pass requires and preserves. KnownBits and CSE
// are always needed; the dominator tree only when optimizing.
void AMDGPUPreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetPassConfig>();
  AU.setPreservesCFG();
  getSelectionDAGFallbackAnalysisUsage(AU);
  AU.addRequired<GISelKnownBitsAnalysis>();
  AU.addPreserved<GISelKnownBitsAnalysis>();
  // At -O0 the dominator tree is skipped (see runOnMachineFunction, which
  // passes a null MDT in that case).
  if (!IsOptNone) {
    AU.addRequired<MachineDominatorTree>();
    AU.addPreserved<MachineDominatorTree>();
  }

  AU.addRequired<GISelCSEAnalysisWrapperPass>();
  AU.addPreserved<GISelCSEAnalysisWrapperPass>();
  MachineFunctionPass::getAnalysisUsage(AU);
}
259 
// Constructor: records the -O0 flag and registers the pass with the global
// PassRegistry.
AMDGPUPreLegalizerCombiner::AMDGPUPreLegalizerCombiner(bool IsOptNone)
  : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
  initializeAMDGPUPreLegalizerCombinerPass(*PassRegistry::getPassRegistry());
}
264 
265 bool AMDGPUPreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
266   if (MF.getProperties().hasProperty(
267           MachineFunctionProperties::Property::FailedISel))
268     return false;
269   auto *TPC = &getAnalysis<TargetPassConfig>();
270   const Function &F = MF.getFunction();
271   bool EnableOpt =
272       MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);
273   GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
274   MachineDominatorTree *MDT =
275       IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
276   AMDGPUPreLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(),
277                                         F.hasMinSize(), KB, MDT);
278   // Enable CSE.
279   GISelCSEAnalysisWrapper &Wrapper =
280       getAnalysis<GISelCSEAnalysisWrapperPass>().getCSEWrapper();
281   auto *CSEInfo = &Wrapper.get(TPC->getCSEConfig());
282 
283   Combiner C(PCInfo, TPC);
284   return C.combineMachineInstrs(MF, CSEInfo);
285 }
286 
char AMDGPUPreLegalizerCombiner::ID = 0;
// Pass registration boilerplate; DEBUG_TYPE doubles as the command-line name.
INITIALIZE_PASS_BEGIN(AMDGPUPreLegalizerCombiner, DEBUG_TYPE,
                      "Combine AMDGPU machine instrs before legalization",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
INITIALIZE_PASS_END(AMDGPUPreLegalizerCombiner, DEBUG_TYPE,
                    "Combine AMDGPU machine instrs before legalization", false,
                    false)

namespace llvm {
// Factory used by the AMDGPU target to add this pass to the GISel pipeline.
FunctionPass *createAMDGPUPreLegalizeCombiner(bool IsOptNone) {
  return new AMDGPUPreLegalizerCombiner(IsOptNone);
}
} // end namespace llvm
302