1 //=== lib/CodeGen/GlobalISel/AMDGPUPreLegalizerCombiner.cpp ---------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass does combining of machine instructions at the generic MI level,
10 // before the legalizer.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "AMDGPU.h"
15 #include "AMDGPUCombinerHelper.h"
16 #include "AMDGPULegalizerInfo.h"
17 #include "GCNSubtarget.h"
18 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
19 #include "llvm/CodeGen/GlobalISel/CSEInfo.h"
20 #include "llvm/CodeGen/GlobalISel/Combiner.h"
21 #include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
22 #include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
23 #include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
24 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
25 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
26 #include "llvm/CodeGen/MachineDominators.h"
27 #include "llvm/CodeGen/TargetPassConfig.h"
28 #include "llvm/Target/TargetMachine.h"
29 
30 #define GET_GICOMBINER_DEPS
31 #include "AMDGPUGenPreLegalizeGICombiner.inc"
32 #undef GET_GICOMBINER_DEPS
33 
34 #define DEBUG_TYPE "amdgpu-prelegalizer-combiner"
35 
36 using namespace llvm;
37 using namespace MIPatternMatch;
38 namespace {
39 
40 #define GET_GICOMBINER_TYPES
41 #include "AMDGPUGenPreLegalizeGICombiner.inc"
42 #undef GET_GICOMBINER_TYPES
43 
44 class AMDGPUPreLegalizerCombinerImpl : public Combiner {
45 protected:
46   const AMDGPUPreLegalizerCombinerImplRuleConfig &RuleConfig;
47   const GCNSubtarget &STI;
48   // TODO: Make CombinerHelper methods const.
49   mutable AMDGPUCombinerHelper Helper;
50 
51 public:
52   AMDGPUPreLegalizerCombinerImpl(
53       MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC,
54       GISelKnownBits &KB, GISelCSEInfo *CSEInfo,
55       const AMDGPUPreLegalizerCombinerImplRuleConfig &RuleConfig,
56       const GCNSubtarget &STI, MachineDominatorTree *MDT,
57       const LegalizerInfo *LI);
58 
getName()59   static const char *getName() { return "AMDGPUPreLegalizerCombinerImpl"; }
60 
61   bool tryCombineAllImpl(MachineInstr &MI) const;
62   bool tryCombineAll(MachineInstr &I) const override;
63 
64   struct ClampI64ToI16MatchInfo {
65     int64_t Cmp1 = 0;
66     int64_t Cmp2 = 0;
67     Register Origin;
68   };
69 
70   bool matchClampI64ToI16(MachineInstr &MI, const MachineRegisterInfo &MRI,
71                           const MachineFunction &MF,
72                           ClampI64ToI16MatchInfo &MatchInfo) const;
73 
74   void applyClampI64ToI16(MachineInstr &MI,
75                           const ClampI64ToI16MatchInfo &MatchInfo) const;
76 
77 private:
78 #define GET_GICOMBINER_CLASS_MEMBERS
79 #define AMDGPUSubtarget GCNSubtarget
80 #include "AMDGPUGenPreLegalizeGICombiner.inc"
81 #undef GET_GICOMBINER_CLASS_MEMBERS
82 #undef AMDGPUSubtarget
83 };
84 
// Include the GET_GICOMBINER_IMPL section of the generated combiner file.
// AMDGPUSubtarget is aliased to GCNSubtarget for the duration of the include
// (presumably because the generated code spells the subtarget type that way).
#define GET_GICOMBINER_IMPL
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenPreLegalizeGICombiner.inc"
#undef AMDGPUSubtarget
#undef GET_GICOMBINER_IMPL
90 
AMDGPUPreLegalizerCombinerImpl(MachineFunction & MF,CombinerInfo & CInfo,const TargetPassConfig * TPC,GISelKnownBits & KB,GISelCSEInfo * CSEInfo,const AMDGPUPreLegalizerCombinerImplRuleConfig & RuleConfig,const GCNSubtarget & STI,MachineDominatorTree * MDT,const LegalizerInfo * LI)91 AMDGPUPreLegalizerCombinerImpl::AMDGPUPreLegalizerCombinerImpl(
92     MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC,
93     GISelKnownBits &KB, GISelCSEInfo *CSEInfo,
94     const AMDGPUPreLegalizerCombinerImplRuleConfig &RuleConfig,
95     const GCNSubtarget &STI, MachineDominatorTree *MDT, const LegalizerInfo *LI)
96     : Combiner(MF, CInfo, TPC, &KB, CSEInfo), RuleConfig(RuleConfig), STI(STI),
97       Helper(Observer, B, /*IsPreLegalize*/ true, &KB, MDT, LI),
98 #define GET_GICOMBINER_CONSTRUCTOR_INITS
99 #include "AMDGPUGenPreLegalizeGICombiner.inc"
100 #undef GET_GICOMBINER_CONSTRUCTOR_INITS
101 {
102 }
103 
tryCombineAll(MachineInstr & MI) const104 bool AMDGPUPreLegalizerCombinerImpl::tryCombineAll(MachineInstr &MI) const {
105   if (tryCombineAllImpl(MI))
106     return true;
107 
108   switch (MI.getOpcode()) {
109   case TargetOpcode::G_CONCAT_VECTORS:
110     return Helper.tryCombineConcatVectors(MI);
111   case TargetOpcode::G_SHUFFLE_VECTOR:
112     return Helper.tryCombineShuffleVector(MI);
113   }
114 
115   return false;
116 }
117 
matchClampI64ToI16(MachineInstr & MI,const MachineRegisterInfo & MRI,const MachineFunction & MF,ClampI64ToI16MatchInfo & MatchInfo) const118 bool AMDGPUPreLegalizerCombinerImpl::matchClampI64ToI16(
119     MachineInstr &MI, const MachineRegisterInfo &MRI, const MachineFunction &MF,
120     ClampI64ToI16MatchInfo &MatchInfo) const {
121   assert(MI.getOpcode() == TargetOpcode::G_TRUNC && "Invalid instruction!");
122 
123   // Try to find a pattern where an i64 value should get clamped to short.
124   const LLT SrcType = MRI.getType(MI.getOperand(1).getReg());
125   if (SrcType != LLT::scalar(64))
126     return false;
127 
128   const LLT DstType = MRI.getType(MI.getOperand(0).getReg());
129   if (DstType != LLT::scalar(16))
130     return false;
131 
132   Register Base;
133 
134   auto IsApplicableForCombine = [&MatchInfo]() -> bool {
135     const auto Cmp1 = MatchInfo.Cmp1;
136     const auto Cmp2 = MatchInfo.Cmp2;
137     const auto Diff = std::abs(Cmp2 - Cmp1);
138 
139     // If the difference between both comparison values is 0 or 1, there is no
140     // need to clamp.
141     if (Diff == 0 || Diff == 1)
142       return false;
143 
144     const int64_t Min = std::numeric_limits<int16_t>::min();
145     const int64_t Max = std::numeric_limits<int16_t>::max();
146 
147     // Check if the comparison values are between SHORT_MIN and SHORT_MAX.
148     return ((Cmp2 >= Cmp1 && Cmp1 >= Min && Cmp2 <= Max) ||
149             (Cmp1 >= Cmp2 && Cmp1 <= Max && Cmp2 >= Min));
150   };
151 
152   // Try to match a combination of min / max MIR opcodes.
153   if (mi_match(MI.getOperand(1).getReg(), MRI,
154                m_GSMin(m_Reg(Base), m_ICst(MatchInfo.Cmp1)))) {
155     if (mi_match(Base, MRI,
156                  m_GSMax(m_Reg(MatchInfo.Origin), m_ICst(MatchInfo.Cmp2)))) {
157       return IsApplicableForCombine();
158     }
159   }
160 
161   if (mi_match(MI.getOperand(1).getReg(), MRI,
162                m_GSMax(m_Reg(Base), m_ICst(MatchInfo.Cmp1)))) {
163     if (mi_match(Base, MRI,
164                  m_GSMin(m_Reg(MatchInfo.Origin), m_ICst(MatchInfo.Cmp2)))) {
165       return IsApplicableForCombine();
166     }
167   }
168 
169   return false;
170 }
171 
172 // We want to find a combination of instructions that
173 // gets generated when an i64 gets clamped to i16.
174 // The corresponding pattern is:
175 // G_MAX / G_MAX for i16 <= G_TRUNC i64.
176 // This can be efficiently written as following:
177 // v_cvt_pk_i16_i32 v0, v0, v1
178 // v_med3_i32 v0, Clamp_Min, v0, Clamp_Max
applyClampI64ToI16(MachineInstr & MI,const ClampI64ToI16MatchInfo & MatchInfo) const179 void AMDGPUPreLegalizerCombinerImpl::applyClampI64ToI16(
180     MachineInstr &MI, const ClampI64ToI16MatchInfo &MatchInfo) const {
181 
182   Register Src = MatchInfo.Origin;
183   assert(MI.getParent()->getParent()->getRegInfo().getType(Src) ==
184          LLT::scalar(64));
185   const LLT S32 = LLT::scalar(32);
186 
187   B.setInstrAndDebugLoc(MI);
188 
189   auto Unmerge = B.buildUnmerge(S32, Src);
190 
191   assert(MI.getOpcode() != AMDGPU::G_AMDGPU_CVT_PK_I16_I32);
192 
193   const LLT V2S16 = LLT::fixed_vector(2, 16);
194   auto CvtPk =
195       B.buildInstr(AMDGPU::G_AMDGPU_CVT_PK_I16_I32, {V2S16},
196                    {Unmerge.getReg(0), Unmerge.getReg(1)}, MI.getFlags());
197 
198   auto MinBoundary = std::min(MatchInfo.Cmp1, MatchInfo.Cmp2);
199   auto MaxBoundary = std::max(MatchInfo.Cmp1, MatchInfo.Cmp2);
200   auto MinBoundaryDst = B.buildConstant(S32, MinBoundary);
201   auto MaxBoundaryDst = B.buildConstant(S32, MaxBoundary);
202 
203   auto Bitcast = B.buildBitcast({S32}, CvtPk);
204 
205   auto Med3 = B.buildInstr(
206       AMDGPU::G_AMDGPU_SMED3, {S32},
207       {MinBoundaryDst.getReg(0), Bitcast.getReg(0), MaxBoundaryDst.getReg(0)},
208       MI.getFlags());
209 
210   B.buildTrunc(MI.getOperand(0).getReg(), Med3);
211 
212   MI.eraseFromParent();
213 }
214 
215 // Pass boilerplate
216 // ================
217 
218 class AMDGPUPreLegalizerCombiner : public MachineFunctionPass {
219 public:
220   static char ID;
221 
222   AMDGPUPreLegalizerCombiner(bool IsOptNone = false);
223 
getPassName() const224   StringRef getPassName() const override {
225     return "AMDGPUPreLegalizerCombiner";
226   }
227 
228   bool runOnMachineFunction(MachineFunction &MF) override;
229 
230   void getAnalysisUsage(AnalysisUsage &AU) const override;
231 
232 private:
233   bool IsOptNone;
234   AMDGPUPreLegalizerCombinerImplRuleConfig RuleConfig;
235 };
236 } // end anonymous namespace
237 
getAnalysisUsage(AnalysisUsage & AU) const238 void AMDGPUPreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
239   AU.addRequired<TargetPassConfig>();
240   AU.setPreservesCFG();
241   getSelectionDAGFallbackAnalysisUsage(AU);
242   AU.addRequired<GISelKnownBitsAnalysis>();
243   AU.addPreserved<GISelKnownBitsAnalysis>();
244   if (!IsOptNone) {
245     AU.addRequired<MachineDominatorTree>();
246     AU.addPreserved<MachineDominatorTree>();
247   }
248 
249   AU.addRequired<GISelCSEAnalysisWrapperPass>();
250   AU.addPreserved<GISelCSEAnalysisWrapperPass>();
251   MachineFunctionPass::getAnalysisUsage(AU);
252 }
253 
AMDGPUPreLegalizerCombiner(bool IsOptNone)254 AMDGPUPreLegalizerCombiner::AMDGPUPreLegalizerCombiner(bool IsOptNone)
255     : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
256   initializeAMDGPUPreLegalizerCombinerPass(*PassRegistry::getPassRegistry());
257 
258   if (!RuleConfig.parseCommandLineOption())
259     report_fatal_error("Invalid rule identifier");
260 }
261 
runOnMachineFunction(MachineFunction & MF)262 bool AMDGPUPreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
263   if (MF.getProperties().hasProperty(
264           MachineFunctionProperties::Property::FailedISel))
265     return false;
266   auto *TPC = &getAnalysis<TargetPassConfig>();
267   const Function &F = MF.getFunction();
268   bool EnableOpt =
269       MF.getTarget().getOptLevel() != CodeGenOptLevel::None && !skipFunction(F);
270   GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
271 
272   // Enable CSE.
273   GISelCSEAnalysisWrapper &Wrapper =
274       getAnalysis<GISelCSEAnalysisWrapperPass>().getCSEWrapper();
275   auto *CSEInfo = &Wrapper.get(TPC->getCSEConfig());
276 
277   const GCNSubtarget &STI = MF.getSubtarget<GCNSubtarget>();
278   MachineDominatorTree *MDT =
279       IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
280   CombinerInfo CInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false,
281                      nullptr, EnableOpt, F.hasOptSize(), F.hasMinSize());
282   AMDGPUPreLegalizerCombinerImpl Impl(MF, CInfo, TPC, *KB, CSEInfo, RuleConfig,
283                                       STI, MDT, STI.getLegalizerInfo());
284   return Impl.combineMachineInstrs();
285 }
286 
287 char AMDGPUPreLegalizerCombiner::ID = 0;
288 INITIALIZE_PASS_BEGIN(AMDGPUPreLegalizerCombiner, DEBUG_TYPE,
289                       "Combine AMDGPU machine instrs before legalization",
290                       false, false)
291 INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
292 INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
293 INITIALIZE_PASS_END(AMDGPUPreLegalizerCombiner, DEBUG_TYPE,
294                     "Combine AMDGPU machine instrs before legalization", false,
295                     false)
296 
297 namespace llvm {
createAMDGPUPreLegalizeCombiner(bool IsOptNone)298 FunctionPass *createAMDGPUPreLegalizeCombiner(bool IsOptNone) {
299   return new AMDGPUPreLegalizerCombiner(IsOptNone);
300 }
301 } // end namespace llvm
302