//=== lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp --------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass does combining of machine instructions at the generic MI level,
10 // before the legalizer.
11 //
12 //===----------------------------------------------------------------------===//
13 
#include "AMDGPU.h"
#include "AMDGPUCombinerHelper.h"
#include "AMDGPULegalizerInfo.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/GlobalISel/CSEInfo.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutor.h"
#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Target/TargetMachine.h"
#include <cstdlib>
#include <limits>
30 
31 #define GET_GICOMBINER_DEPS
32 #include "AMDGPUGenPreLegalizeGICombiner.inc"
33 #undef GET_GICOMBINER_DEPS
34 
35 #define DEBUG_TYPE "amdgpu-prelegalizer-combiner"
36 
37 using namespace llvm;
38 using namespace MIPatternMatch;
39 namespace {
40 
41 #define GET_GICOMBINER_TYPES
42 #include "AMDGPUGenPreLegalizeGICombiner.inc"
43 #undef GET_GICOMBINER_TYPES
44 
// Executes the generated (tablegen) pre-legalizer combine rules plus the
// hand-written clamp-i64-to-i16 combine below. One instance is created per
// combined instruction (see TODO in AMDGPUPreLegalizerCombinerInfo::combine).
class AMDGPUPreLegalizerCombinerImpl : public GIMatchTableExecutor {
protected:
  // Which generated combine rules are enabled (parsed from the command line).
  const AMDGPUPreLegalizerCombinerImplRuleConfig &RuleConfig;
  const GCNSubtarget &STI;

  GISelChangeObserver &Observer;
  MachineIRBuilder &B;
  MachineFunction &MF;
  MachineRegisterInfo &MRI;
  AMDGPUCombinerHelper &Helper;

public:
  AMDGPUPreLegalizerCombinerImpl(
      const AMDGPUPreLegalizerCombinerImplRuleConfig &RuleConfig,
      const GCNSubtarget &STI, GISelChangeObserver &Observer,
      MachineIRBuilder &B, AMDGPUCombinerHelper &Helper);

  static const char *getName() { return "AMDGPUPreLegalizerCombinerImpl"; }

  // Entry point generated from tablegen; tries every enabled rule on \p I.
  bool tryCombineAll(MachineInstr &I) const;

  // State shared between matchClampI64ToI16 and applyClampI64ToI16:
  // the two clamp constants and the original (pre-clamp) s64 register.
  struct ClampI64ToI16MatchInfo {
    int64_t Cmp1 = 0;
    int64_t Cmp2 = 0;
    Register Origin;
  };

  bool matchClampI64ToI16(MachineInstr &MI, const MachineRegisterInfo &MRI,
                          const MachineFunction &MF,
                          ClampI64ToI16MatchInfo &MatchInfo) const;

  void applyClampI64ToI16(MachineInstr &MI,
                          const ClampI64ToI16MatchInfo &MatchInfo) const;

private:
// Tablegen-generated rule state; AMDGPUSubtarget is aliased to GCNSubtarget
// for the duration of the include because the generated code refers to it.
#define GET_GICOMBINER_CLASS_MEMBERS
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenPreLegalizeGICombiner.inc"
#undef GET_GICOMBINER_CLASS_MEMBERS
#undef AMDGPUSubtarget
};
86 
87 #define GET_GICOMBINER_IMPL
88 #define AMDGPUSubtarget GCNSubtarget
89 #include "AMDGPUGenPreLegalizeGICombiner.inc"
90 #undef AMDGPUSubtarget
91 #undef GET_GICOMBINER_IMPL
92 
// Constructor: caches references to the builder's function/register info and
// runs the tablegen-generated member initializers for the combine rules.
AMDGPUPreLegalizerCombinerImpl::AMDGPUPreLegalizerCombinerImpl(
    const AMDGPUPreLegalizerCombinerImplRuleConfig &RuleConfig,
    const GCNSubtarget &STI, GISelChangeObserver &Observer, MachineIRBuilder &B,
    AMDGPUCombinerHelper &Helper)
    : RuleConfig(RuleConfig), STI(STI), Observer(Observer), B(B), MF(B.getMF()),
      MRI(*B.getMRI()), Helper(Helper),
#define GET_GICOMBINER_CONSTRUCTOR_INITS
#include "AMDGPUGenPreLegalizeGICombiner.inc"
#undef GET_GICOMBINER_CONSTRUCTOR_INITS
{
}
104 
105 bool AMDGPUPreLegalizerCombinerImpl::matchClampI64ToI16(
106     MachineInstr &MI, const MachineRegisterInfo &MRI, const MachineFunction &MF,
107     ClampI64ToI16MatchInfo &MatchInfo) const {
108   assert(MI.getOpcode() == TargetOpcode::G_TRUNC && "Invalid instruction!");
109 
110   // Try to find a pattern where an i64 value should get clamped to short.
111   const LLT SrcType = MRI.getType(MI.getOperand(1).getReg());
112   if (SrcType != LLT::scalar(64))
113     return false;
114 
115   const LLT DstType = MRI.getType(MI.getOperand(0).getReg());
116   if (DstType != LLT::scalar(16))
117     return false;
118 
119   Register Base;
120 
121   auto IsApplicableForCombine = [&MatchInfo]() -> bool {
122     const auto Cmp1 = MatchInfo.Cmp1;
123     const auto Cmp2 = MatchInfo.Cmp2;
124     const auto Diff = std::abs(Cmp2 - Cmp1);
125 
126     // If the difference between both comparison values is 0 or 1, there is no
127     // need to clamp.
128     if (Diff == 0 || Diff == 1)
129       return false;
130 
131     const int64_t Min = std::numeric_limits<int16_t>::min();
132     const int64_t Max = std::numeric_limits<int16_t>::max();
133 
134     // Check if the comparison values are between SHORT_MIN and SHORT_MAX.
135     return ((Cmp2 >= Cmp1 && Cmp1 >= Min && Cmp2 <= Max) ||
136             (Cmp1 >= Cmp2 && Cmp1 <= Max && Cmp2 >= Min));
137   };
138 
139   // Try to match a combination of min / max MIR opcodes.
140   if (mi_match(MI.getOperand(1).getReg(), MRI,
141                m_GSMin(m_Reg(Base), m_ICst(MatchInfo.Cmp1)))) {
142     if (mi_match(Base, MRI,
143                  m_GSMax(m_Reg(MatchInfo.Origin), m_ICst(MatchInfo.Cmp2)))) {
144       return IsApplicableForCombine();
145     }
146   }
147 
148   if (mi_match(MI.getOperand(1).getReg(), MRI,
149                m_GSMax(m_Reg(Base), m_ICst(MatchInfo.Cmp1)))) {
150     if (mi_match(Base, MRI,
151                  m_GSMin(m_Reg(MatchInfo.Origin), m_ICst(MatchInfo.Cmp2)))) {
152       return IsApplicableForCombine();
153     }
154   }
155 
156   return false;
157 }
158 
// We want to find a combination of instructions that
// gets generated when an i64 gets clamped to i16.
// The corresponding pattern is:
// G_SMIN / G_SMAX for i16 <= G_TRUNC i64.
// This can be efficiently written as following:
// v_cvt_pk_i16_i32 v0, v0, v1
// v_med3_i32 v0, Clamp_Min, v0, Clamp_Max
void AMDGPUPreLegalizerCombinerImpl::applyClampI64ToI16(
    MachineInstr &MI, const ClampI64ToI16MatchInfo &MatchInfo) const {

  Register Src = MatchInfo.Origin;
  assert(MI.getParent()->getParent()->getRegInfo().getType(Src) ==
         LLT::scalar(64));
  const LLT S32 = LLT::scalar(32);

  B.setInstrAndDebugLoc(MI);

  // Split the s64 source into its two s32 halves for the pack conversion.
  auto Unmerge = B.buildUnmerge(S32, Src);

  assert(MI.getOpcode() != AMDGPU::G_AMDGPU_CVT_PK_I16_I32);

  // Pack both halves with signed saturation into a <2 x s16>
  // (selects to v_cvt_pk_i16_i32).
  const LLT V2S16 = LLT::fixed_vector(2, 16);
  auto CvtPk =
      B.buildInstr(AMDGPU::G_AMDGPU_CVT_PK_I16_I32, {V2S16},
                   {Unmerge.getReg(0), Unmerge.getReg(1)}, MI.getFlags());

  // Normalize the clamp bounds: the matcher accepts min/max in either order.
  auto MinBoundary = std::min(MatchInfo.Cmp1, MatchInfo.Cmp2);
  auto MaxBoundary = std::max(MatchInfo.Cmp1, MatchInfo.Cmp2);
  auto MinBoundaryDst = B.buildConstant(S32, MinBoundary);
  auto MaxBoundaryDst = B.buildConstant(S32, MaxBoundary);

  auto Bitcast = B.buildBitcast({S32}, CvtPk);

  // Clamp via signed med3: med3(min, x, max) selects to v_med3_i32.
  auto Med3 = B.buildInstr(
      AMDGPU::G_AMDGPU_SMED3, {S32},
      {MinBoundaryDst.getReg(0), Bitcast.getReg(0), MaxBoundaryDst.getReg(0)},
      MI.getFlags());

  // Truncate the clamped s32 back to the original s16 destination.
  B.buildTrunc(MI.getOperand(0).getReg(), Med3);

  MI.eraseFromParent();
}
201 
// CombinerInfo for the pre-legalizer run: illegal ops are allowed (we run
// before the legalizer), so no LegalizerInfo is installed.
class AMDGPUPreLegalizerCombinerInfo final : public CombinerInfo {
  GISelKnownBits *KB;
  MachineDominatorTree *MDT; // Null at -O0 (see runOnMachineFunction).
  AMDGPUPreLegalizerCombinerImplRuleConfig RuleConfig;

public:
  AMDGPUPreLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize,
                                 GISelKnownBits *KB, MachineDominatorTree *MDT)
      : CombinerInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false,
                     /*LegalizerInfo*/ nullptr, EnableOpt, OptSize, MinSize),
        KB(KB), MDT(MDT) {
    // Fail hard on a malformed -*-combiner-only-enable-rule option.
    if (!RuleConfig.parseCommandLineOption())
      report_fatal_error("Invalid rule identifier");
  }

  bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
               MachineIRBuilder &B) const override;
};
220 
221 bool AMDGPUPreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
222                                              MachineInstr &MI,
223                                              MachineIRBuilder &B) const {
224   const auto *LI = MI.getMF()->getSubtarget().getLegalizerInfo();
225   AMDGPUCombinerHelper Helper(Observer, B, /*IsPreLegalize*/ true, KB, MDT, LI);
226 
227   const GCNSubtarget &STI = MI.getMF()->getSubtarget<GCNSubtarget>();
228   // TODO: Do not re-create the Impl on every inst, it should be per function.
229   AMDGPUPreLegalizerCombinerImpl Impl(RuleConfig, STI, Observer, B, Helper);
230   Impl.setupMF(*MI.getMF(), KB);
231 
232   if (Impl.tryCombineAll(MI))
233     return true;
234 
235   switch (MI.getOpcode()) {
236   case TargetOpcode::G_CONCAT_VECTORS:
237     return Helper.tryCombineConcatVectors(MI);
238   case TargetOpcode::G_SHUFFLE_VECTOR:
239     return Helper.tryCombineShuffleVector(MI);
240   }
241 
242   return false;
243 }
244 
245 // Pass boilerplate
246 // ================
247 
// Legacy-PM machine function pass wrapping the pre-legalizer combiner.
class AMDGPUPreLegalizerCombiner : public MachineFunctionPass {
public:
  static char ID;

  AMDGPUPreLegalizerCombiner(bool IsOptNone = false);

  StringRef getPassName() const override {
    return "AMDGPUPreLegalizerCombiner";
  }

  bool runOnMachineFunction(MachineFunction &MF) override;

  void getAnalysisUsage(AnalysisUsage &AU) const override;

private:
  // True at -O0: skips the MachineDominatorTree requirement.
  bool IsOptNone;
};
265 } // end anonymous namespace
266 
// Declare the analyses this pass needs/preserves. The dominator tree is only
// required when optimizing (it is never built at -O0).
void AMDGPUPreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<TargetPassConfig>();
  AU.setPreservesCFG();
  getSelectionDAGFallbackAnalysisUsage(AU);
  AU.addRequired<GISelKnownBitsAnalysis>();
  AU.addPreserved<GISelKnownBitsAnalysis>();
  if (!IsOptNone) {
    AU.addRequired<MachineDominatorTree>();
    AU.addPreserved<MachineDominatorTree>();
  }

  // CSE info is used to avoid re-creating identical generic instructions.
  AU.addRequired<GISelCSEAnalysisWrapperPass>();
  AU.addPreserved<GISelCSEAnalysisWrapperPass>();
  MachineFunctionPass::getAnalysisUsage(AU);
}
282 
// Construct the pass and make sure it is registered with the PassRegistry.
AMDGPUPreLegalizerCombiner::AMDGPUPreLegalizerCombiner(bool IsOptNone)
    : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
  initializeAMDGPUPreLegalizerCombinerPass(*PassRegistry::getPassRegistry());
}
287 
288 bool AMDGPUPreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
289   if (MF.getProperties().hasProperty(
290           MachineFunctionProperties::Property::FailedISel))
291     return false;
292   auto *TPC = &getAnalysis<TargetPassConfig>();
293   const Function &F = MF.getFunction();
294   bool EnableOpt =
295       MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);
296   GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
297   MachineDominatorTree *MDT =
298       IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
299   AMDGPUPreLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(),
300                                         F.hasMinSize(), KB, MDT);
301   // Enable CSE.
302   GISelCSEAnalysisWrapper &Wrapper =
303       getAnalysis<GISelCSEAnalysisWrapperPass>().getCSEWrapper();
304   auto *CSEInfo = &Wrapper.get(TPC->getCSEConfig());
305 
306   Combiner C(PCInfo, TPC);
307   return C.combineMachineInstrs(MF, CSEInfo);
308 }
309 
// Pass registration boilerplate: unique pass ID plus the legacy-PM
// INITIALIZE_PASS machinery with the analyses this pass depends on.
char AMDGPUPreLegalizerCombiner::ID = 0;
INITIALIZE_PASS_BEGIN(AMDGPUPreLegalizerCombiner, DEBUG_TYPE,
                      "Combine AMDGPU machine instrs before legalization",
                      false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
INITIALIZE_PASS_END(AMDGPUPreLegalizerCombiner, DEBUG_TYPE,
                    "Combine AMDGPU machine instrs before legalization", false,
                    false)
319 
namespace llvm {
// Factory used by the AMDGPU pass pipeline; caller owns the returned pass.
FunctionPass *createAMDGPUPreLegalizeCombiner(bool IsOptNone) {
  return new AMDGPUPreLegalizerCombiner(IsOptNone);
}
} // end namespace llvm
325