1 //=== lib/CodeGen/GlobalISel/AArch64PreLegalizerCombiner.cpp --------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This pass does combining of machine instructions at the generic MI level,
10 // before the legalizer.
11 //
12 //===----------------------------------------------------------------------===//
13 
14 #include "AArch64GlobalISelUtils.h"
15 #include "AArch64TargetMachine.h"
16 #include "llvm/CodeGen/GlobalISel/CSEInfo.h"
17 #include "llvm/CodeGen/GlobalISel/Combiner.h"
18 #include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
19 #include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
20 #include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
21 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
22 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
23 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
24 #include "llvm/CodeGen/GlobalISel/Utils.h"
25 #include "llvm/CodeGen/MachineDominators.h"
26 #include "llvm/CodeGen/MachineFunction.h"
27 #include "llvm/CodeGen/MachineFunctionPass.h"
28 #include "llvm/CodeGen/MachineRegisterInfo.h"
29 #include "llvm/CodeGen/TargetPassConfig.h"
30 #include "llvm/IR/Instructions.h"
31 #include "llvm/Support/Debug.h"
32 
33 #define GET_GICOMBINER_DEPS
34 #include "AArch64GenPreLegalizeGICombiner.inc"
35 #undef GET_GICOMBINER_DEPS
36 
37 #define DEBUG_TYPE "aarch64-prelegalizer-combiner"
38 
39 using namespace llvm;
40 using namespace MIPatternMatch;
41 
42 namespace {
43 
44 #define GET_GICOMBINER_TYPES
45 #include "AArch64GenPreLegalizeGICombiner.inc"
46 #undef GET_GICOMBINER_TYPES
47 
48 /// Return true if a G_FCONSTANT instruction is known to be better-represented
49 /// as a G_CONSTANT.
50 bool matchFConstantToConstant(MachineInstr &MI, MachineRegisterInfo &MRI) {
51   assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT);
52   Register DstReg = MI.getOperand(0).getReg();
53   const unsigned DstSize = MRI.getType(DstReg).getSizeInBits();
54   if (DstSize != 32 && DstSize != 64)
55     return false;
56 
57   // When we're storing a value, it doesn't matter what register bank it's on.
58   // Since not all floating point constants can be materialized using a fmov,
59   // it makes more sense to just use a GPR.
60   return all_of(MRI.use_nodbg_instructions(DstReg),
61                 [](const MachineInstr &Use) { return Use.mayStore(); });
62 }
63 
64 /// Change a G_FCONSTANT into a G_CONSTANT.
65 void applyFConstantToConstant(MachineInstr &MI) {
66   assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT);
67   MachineIRBuilder MIB(MI);
68   const APFloat &ImmValAPF = MI.getOperand(1).getFPImm()->getValueAPF();
69   MIB.buildConstant(MI.getOperand(0).getReg(), ImmValAPF.bitcastToAPInt());
70   MI.eraseFromParent();
71 }
72 
73 /// Try to match a G_ICMP of a G_TRUNC with zero, in which the truncated bits
74 /// are sign bits. In this case, we can transform the G_ICMP to directly compare
75 /// the wide value with a zero.
76 bool matchICmpRedundantTrunc(MachineInstr &MI, MachineRegisterInfo &MRI,
77                              GISelKnownBits *KB, Register &MatchInfo) {
78   assert(MI.getOpcode() == TargetOpcode::G_ICMP && KB);
79 
80   auto Pred = (CmpInst::Predicate)MI.getOperand(1).getPredicate();
81   if (!ICmpInst::isEquality(Pred))
82     return false;
83 
84   Register LHS = MI.getOperand(2).getReg();
85   LLT LHSTy = MRI.getType(LHS);
86   if (!LHSTy.isScalar())
87     return false;
88 
89   Register RHS = MI.getOperand(3).getReg();
90   Register WideReg;
91 
92   if (!mi_match(LHS, MRI, m_GTrunc(m_Reg(WideReg))) ||
93       !mi_match(RHS, MRI, m_SpecificICst(0)))
94     return false;
95 
96   LLT WideTy = MRI.getType(WideReg);
97   if (KB->computeNumSignBits(WideReg) <=
98       WideTy.getSizeInBits() - LHSTy.getSizeInBits())
99     return false;
100 
101   MatchInfo = WideReg;
102   return true;
103 }
104 
105 void applyICmpRedundantTrunc(MachineInstr &MI, MachineRegisterInfo &MRI,
106                              MachineIRBuilder &Builder,
107                              GISelChangeObserver &Observer, Register &WideReg) {
108   assert(MI.getOpcode() == TargetOpcode::G_ICMP);
109 
110   LLT WideTy = MRI.getType(WideReg);
111   // We're going to directly use the wide register as the LHS, and then use an
112   // equivalent size zero for RHS.
113   Builder.setInstrAndDebugLoc(MI);
114   auto WideZero = Builder.buildConstant(WideTy, 0);
115   Observer.changingInstr(MI);
116   MI.getOperand(2).setReg(WideReg);
117   MI.getOperand(3).setReg(WideZero.getReg(0));
118   Observer.changedInstr(MI);
119 }
120 
121 /// \returns true if it is possible to fold a constant into a G_GLOBAL_VALUE.
122 ///
123 /// e.g.
124 ///
125 /// %g = G_GLOBAL_VALUE @x -> %g = G_GLOBAL_VALUE @x + cst
126 bool matchFoldGlobalOffset(MachineInstr &MI, MachineRegisterInfo &MRI,
127                            std::pair<uint64_t, uint64_t> &MatchInfo) {
128   assert(MI.getOpcode() == TargetOpcode::G_GLOBAL_VALUE);
129   MachineFunction &MF = *MI.getMF();
130   auto &GlobalOp = MI.getOperand(1);
131   auto *GV = GlobalOp.getGlobal();
132   if (GV->isThreadLocal())
133     return false;
134 
135   // Don't allow anything that could represent offsets etc.
136   if (MF.getSubtarget<AArch64Subtarget>().ClassifyGlobalReference(
137           GV, MF.getTarget()) != AArch64II::MO_NO_FLAG)
138     return false;
139 
140   // Look for a G_GLOBAL_VALUE only used by G_PTR_ADDs against constants:
141   //
142   //  %g = G_GLOBAL_VALUE @x
143   //  %ptr1 = G_PTR_ADD %g, cst1
144   //  %ptr2 = G_PTR_ADD %g, cst2
145   //  ...
146   //  %ptrN = G_PTR_ADD %g, cstN
147   //
148   // Identify the *smallest* constant. We want to be able to form this:
149   //
150   //  %offset_g = G_GLOBAL_VALUE @x + min_cst
151   //  %g = G_PTR_ADD %offset_g, -min_cst
152   //  %ptr1 = G_PTR_ADD %g, cst1
153   //  ...
154   Register Dst = MI.getOperand(0).getReg();
155   uint64_t MinOffset = -1ull;
156   for (auto &UseInstr : MRI.use_nodbg_instructions(Dst)) {
157     if (UseInstr.getOpcode() != TargetOpcode::G_PTR_ADD)
158       return false;
159     auto Cst = getIConstantVRegValWithLookThrough(
160         UseInstr.getOperand(2).getReg(), MRI);
161     if (!Cst)
162       return false;
163     MinOffset = std::min(MinOffset, Cst->Value.getZExtValue());
164   }
165 
166   // Require that the new offset is larger than the existing one to avoid
167   // infinite loops.
168   uint64_t CurrOffset = GlobalOp.getOffset();
169   uint64_t NewOffset = MinOffset + CurrOffset;
170   if (NewOffset <= CurrOffset)
171     return false;
172 
173   // Check whether folding this offset is legal. It must not go out of bounds of
174   // the referenced object to avoid violating the code model, and must be
175   // smaller than 2^20 because this is the largest offset expressible in all
176   // object formats. (The IMAGE_REL_ARM64_PAGEBASE_REL21 relocation in COFF
177   // stores an immediate signed 21 bit offset.)
178   //
179   // This check also prevents us from folding negative offsets, which will end
180   // up being treated in the same way as large positive ones. They could also
181   // cause code model violations, and aren't really common enough to matter.
182   if (NewOffset >= (1 << 20))
183     return false;
184 
185   Type *T = GV->getValueType();
186   if (!T->isSized() ||
187       NewOffset > GV->getParent()->getDataLayout().getTypeAllocSize(T))
188     return false;
189   MatchInfo = std::make_pair(NewOffset, MinOffset);
190   return true;
191 }
192 
193 void applyFoldGlobalOffset(MachineInstr &MI, MachineRegisterInfo &MRI,
194                            MachineIRBuilder &B, GISelChangeObserver &Observer,
195                            std::pair<uint64_t, uint64_t> &MatchInfo) {
196   // Change:
197   //
198   //  %g = G_GLOBAL_VALUE @x
199   //  %ptr1 = G_PTR_ADD %g, cst1
200   //  %ptr2 = G_PTR_ADD %g, cst2
201   //  ...
202   //  %ptrN = G_PTR_ADD %g, cstN
203   //
204   // To:
205   //
206   //  %offset_g = G_GLOBAL_VALUE @x + min_cst
207   //  %g = G_PTR_ADD %offset_g, -min_cst
208   //  %ptr1 = G_PTR_ADD %g, cst1
209   //  ...
210   //  %ptrN = G_PTR_ADD %g, cstN
211   //
212   // Then, the original G_PTR_ADDs should be folded later on so that they look
213   // like this:
214   //
215   //  %ptrN = G_PTR_ADD %offset_g, cstN - min_cst
216   uint64_t Offset, MinOffset;
217   std::tie(Offset, MinOffset) = MatchInfo;
218   B.setInstrAndDebugLoc(*std::next(MI.getIterator()));
219   Observer.changingInstr(MI);
220   auto &GlobalOp = MI.getOperand(1);
221   auto *GV = GlobalOp.getGlobal();
222   GlobalOp.ChangeToGA(GV, Offset, GlobalOp.getTargetFlags());
223   Register Dst = MI.getOperand(0).getReg();
224   Register NewGVDst = MRI.cloneVirtualRegister(Dst);
225   MI.getOperand(0).setReg(NewGVDst);
226   Observer.changedInstr(MI);
227   B.buildPtrAdd(
228       Dst, NewGVDst,
229       B.buildConstant(LLT::scalar(64), -static_cast<int64_t>(MinOffset)));
230 }
231 
232 // Combines vecreduce_add(mul(ext(x), ext(y))) -> vecreduce_add(udot(x, y))
233 // Or vecreduce_add(ext(x)) -> vecreduce_add(udot(x, 1))
234 // Similar to performVecReduceAddCombine in SelectionDAG
235 bool matchExtAddvToUdotAddv(MachineInstr &MI, MachineRegisterInfo &MRI,
236                             const AArch64Subtarget &STI,
237                             std::tuple<Register, Register, bool> &MatchInfo) {
238   assert(MI.getOpcode() == TargetOpcode::G_VECREDUCE_ADD &&
239          "Expected a G_VECREDUCE_ADD instruction");
240   assert(STI.hasDotProd() && "Target should have Dot Product feature");
241 
242   MachineInstr *I1 = getDefIgnoringCopies(MI.getOperand(1).getReg(), MRI);
243   Register DstReg = MI.getOperand(0).getReg();
244   Register MidReg = I1->getOperand(0).getReg();
245   LLT DstTy = MRI.getType(DstReg);
246   LLT MidTy = MRI.getType(MidReg);
247   if (DstTy.getScalarSizeInBits() != 32 || MidTy.getScalarSizeInBits() != 32)
248     return false;
249 
250   LLT SrcTy;
251   auto I1Opc = I1->getOpcode();
252   if (I1Opc == TargetOpcode::G_MUL) {
253     // If result of this has more than 1 use, then there is no point in creating
254     // udot instruction
255     if (!MRI.hasOneNonDBGUse(MidReg))
256       return false;
257 
258     MachineInstr *ExtMI1 =
259         getDefIgnoringCopies(I1->getOperand(1).getReg(), MRI);
260     MachineInstr *ExtMI2 =
261         getDefIgnoringCopies(I1->getOperand(2).getReg(), MRI);
262     LLT Ext1DstTy = MRI.getType(ExtMI1->getOperand(0).getReg());
263     LLT Ext2DstTy = MRI.getType(ExtMI2->getOperand(0).getReg());
264 
265     if (ExtMI1->getOpcode() != ExtMI2->getOpcode() || Ext1DstTy != Ext2DstTy)
266       return false;
267     I1Opc = ExtMI1->getOpcode();
268     SrcTy = MRI.getType(ExtMI1->getOperand(1).getReg());
269     std::get<0>(MatchInfo) = ExtMI1->getOperand(1).getReg();
270     std::get<1>(MatchInfo) = ExtMI2->getOperand(1).getReg();
271   } else {
272     SrcTy = MRI.getType(I1->getOperand(1).getReg());
273     std::get<0>(MatchInfo) = I1->getOperand(1).getReg();
274     std::get<1>(MatchInfo) = 0;
275   }
276 
277   if (I1Opc == TargetOpcode::G_ZEXT)
278     std::get<2>(MatchInfo) = 0;
279   else if (I1Opc == TargetOpcode::G_SEXT)
280     std::get<2>(MatchInfo) = 1;
281   else
282     return false;
283 
284   if (SrcTy.getScalarSizeInBits() != 8 || SrcTy.getNumElements() % 8 != 0)
285     return false;
286 
287   return true;
288 }
289 
290 void applyExtAddvToUdotAddv(MachineInstr &MI, MachineRegisterInfo &MRI,
291                             MachineIRBuilder &Builder,
292                             GISelChangeObserver &Observer,
293                             const AArch64Subtarget &STI,
294                             std::tuple<Register, Register, bool> &MatchInfo) {
295   assert(MI.getOpcode() == TargetOpcode::G_VECREDUCE_ADD &&
296          "Expected a G_VECREDUCE_ADD instruction");
297   assert(STI.hasDotProd() && "Target should have Dot Product feature");
298 
299   // Initialise the variables
300   unsigned DotOpcode =
301       std::get<2>(MatchInfo) ? AArch64::G_SDOT : AArch64::G_UDOT;
302   Register Ext1SrcReg = std::get<0>(MatchInfo);
303 
304   // If there is one source register, create a vector of 0s as the second
305   // source register
306   Register Ext2SrcReg;
307   if (std::get<1>(MatchInfo) == 0)
308     Ext2SrcReg = Builder.buildConstant(MRI.getType(Ext1SrcReg), 1)
309                      ->getOperand(0)
310                      .getReg();
311   else
312     Ext2SrcReg = std::get<1>(MatchInfo);
313 
314   // Find out how many DOT instructions are needed
315   LLT SrcTy = MRI.getType(Ext1SrcReg);
316   LLT MidTy;
317   unsigned NumOfDotMI;
318   if (SrcTy.getNumElements() % 16 == 0) {
319     NumOfDotMI = SrcTy.getNumElements() / 16;
320     MidTy = LLT::fixed_vector(4, 32);
321   } else if (SrcTy.getNumElements() % 8 == 0) {
322     NumOfDotMI = SrcTy.getNumElements() / 8;
323     MidTy = LLT::fixed_vector(2, 32);
324   } else {
325     llvm_unreachable("Source type number of elements is not multiple of 8");
326   }
327 
328   // Handle case where one DOT instruction is needed
329   if (NumOfDotMI == 1) {
330     auto Zeroes = Builder.buildConstant(MidTy, 0)->getOperand(0).getReg();
331     auto Dot = Builder.buildInstr(DotOpcode, {MidTy},
332                                   {Zeroes, Ext1SrcReg, Ext2SrcReg});
333     Builder.buildVecReduceAdd(MI.getOperand(0), Dot->getOperand(0));
334   } else {
335     // If not pad the last v8 element with 0s to a v16
336     SmallVector<Register, 4> Ext1UnmergeReg;
337     SmallVector<Register, 4> Ext2UnmergeReg;
338     if (SrcTy.getNumElements() % 16 != 0) {
339       SmallVector<Register> Leftover1;
340       SmallVector<Register> Leftover2;
341 
342       // Split the elements into v16i8 and v8i8
343       LLT MainTy = LLT::fixed_vector(16, 8);
344       LLT LeftoverTy1, LeftoverTy2;
345       if ((!extractParts(Ext1SrcReg, MRI.getType(Ext1SrcReg), MainTy,
346                          LeftoverTy1, Ext1UnmergeReg, Leftover1, Builder,
347                          MRI)) ||
348           (!extractParts(Ext2SrcReg, MRI.getType(Ext2SrcReg), MainTy,
349                          LeftoverTy2, Ext2UnmergeReg, Leftover2, Builder,
350                          MRI))) {
351         llvm_unreachable("Unable to split this vector properly");
352       }
353 
354       // Pad the leftover v8i8 vector with register of 0s of type v8i8
355       Register v8Zeroes = Builder.buildConstant(LLT::fixed_vector(8, 8), 0)
356                               ->getOperand(0)
357                               .getReg();
358 
359       Ext1UnmergeReg.push_back(
360           Builder
361               .buildMergeLikeInstr(LLT::fixed_vector(16, 8),
362                                    {Leftover1[0], v8Zeroes})
363               .getReg(0));
364       Ext2UnmergeReg.push_back(
365           Builder
366               .buildMergeLikeInstr(LLT::fixed_vector(16, 8),
367                                    {Leftover2[0], v8Zeroes})
368               .getReg(0));
369 
370     } else {
371       // Unmerge the source vectors to v16i8
372       unsigned SrcNumElts = SrcTy.getNumElements();
373       extractParts(Ext1SrcReg, LLT::fixed_vector(16, 8), SrcNumElts / 16,
374                    Ext1UnmergeReg, Builder, MRI);
375       extractParts(Ext2SrcReg, LLT::fixed_vector(16, 8), SrcNumElts / 16,
376                    Ext2UnmergeReg, Builder, MRI);
377     }
378 
379     // Build the UDOT instructions
380     SmallVector<Register, 2> DotReg;
381     unsigned NumElements = 0;
382     for (unsigned i = 0; i < Ext1UnmergeReg.size(); i++) {
383       LLT ZeroesLLT;
384       // Check if it is 16 or 8 elements. Set Zeroes to the according size
385       if (MRI.getType(Ext1UnmergeReg[i]).getNumElements() == 16) {
386         ZeroesLLT = LLT::fixed_vector(4, 32);
387         NumElements += 4;
388       } else {
389         ZeroesLLT = LLT::fixed_vector(2, 32);
390         NumElements += 2;
391       }
392       auto Zeroes = Builder.buildConstant(ZeroesLLT, 0)->getOperand(0).getReg();
393       DotReg.push_back(
394           Builder
395               .buildInstr(DotOpcode, {MRI.getType(Zeroes)},
396                           {Zeroes, Ext1UnmergeReg[i], Ext2UnmergeReg[i]})
397               .getReg(0));
398     }
399 
400     // Merge the output
401     auto ConcatMI =
402         Builder.buildConcatVectors(LLT::fixed_vector(NumElements, 32), DotReg);
403 
404     // Put it through a vector reduction
405     Builder.buildVecReduceAdd(MI.getOperand(0).getReg(),
406                               ConcatMI->getOperand(0).getReg());
407   }
408 
409   // Erase the dead instructions
410   MI.eraseFromParent();
411 }
412 
413 // Matches {U/S}ADDV(ext(x)) => {U/S}ADDLV(x)
414 // Ensure that the type coming from the extend instruction is the right size
415 bool matchExtUaddvToUaddlv(MachineInstr &MI, MachineRegisterInfo &MRI,
416                            std::pair<Register, bool> &MatchInfo) {
417   assert(MI.getOpcode() == TargetOpcode::G_VECREDUCE_ADD &&
418          "Expected G_VECREDUCE_ADD Opcode");
419 
420   // Check if the last instruction is an extend
421   MachineInstr *ExtMI = getDefIgnoringCopies(MI.getOperand(1).getReg(), MRI);
422   auto ExtOpc = ExtMI->getOpcode();
423 
424   if (ExtOpc == TargetOpcode::G_ZEXT)
425     std::get<1>(MatchInfo) = 0;
426   else if (ExtOpc == TargetOpcode::G_SEXT)
427     std::get<1>(MatchInfo) = 1;
428   else
429     return false;
430 
431   // Check if the source register is a valid type
432   Register ExtSrcReg = ExtMI->getOperand(1).getReg();
433   LLT ExtSrcTy = MRI.getType(ExtSrcReg);
434   LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
435   if ((DstTy.getScalarSizeInBits() == 16 &&
436        ExtSrcTy.getNumElements() % 8 == 0 && ExtSrcTy.getNumElements() < 256) ||
437       (DstTy.getScalarSizeInBits() == 32 &&
438        ExtSrcTy.getNumElements() % 4 == 0) ||
439       (DstTy.getScalarSizeInBits() == 64 &&
440        ExtSrcTy.getNumElements() % 4 == 0)) {
441     std::get<0>(MatchInfo) = ExtSrcReg;
442     return true;
443   }
444   return false;
445 }
446 
447 void applyExtUaddvToUaddlv(MachineInstr &MI, MachineRegisterInfo &MRI,
448                            MachineIRBuilder &B, GISelChangeObserver &Observer,
449                            std::pair<Register, bool> &MatchInfo) {
450   assert(MI.getOpcode() == TargetOpcode::G_VECREDUCE_ADD &&
451          "Expected G_VECREDUCE_ADD Opcode");
452 
453   unsigned Opc = std::get<1>(MatchInfo) ? AArch64::G_SADDLV : AArch64::G_UADDLV;
454   Register SrcReg = std::get<0>(MatchInfo);
455   Register DstReg = MI.getOperand(0).getReg();
456   LLT SrcTy = MRI.getType(SrcReg);
457   LLT DstTy = MRI.getType(DstReg);
458 
459   // If SrcTy has more elements than expected, split them into multiple
460   // insructions and sum the results
461   LLT MainTy;
462   SmallVector<Register, 1> WorkingRegisters;
463   unsigned SrcScalSize = SrcTy.getScalarSizeInBits();
464   unsigned SrcNumElem = SrcTy.getNumElements();
465   if ((SrcScalSize == 8 && SrcNumElem > 16) ||
466       (SrcScalSize == 16 && SrcNumElem > 8) ||
467       (SrcScalSize == 32 && SrcNumElem > 4)) {
468 
469     LLT LeftoverTy;
470     SmallVector<Register, 4> LeftoverRegs;
471     if (SrcScalSize == 8)
472       MainTy = LLT::fixed_vector(16, 8);
473     else if (SrcScalSize == 16)
474       MainTy = LLT::fixed_vector(8, 16);
475     else if (SrcScalSize == 32)
476       MainTy = LLT::fixed_vector(4, 32);
477     else
478       llvm_unreachable("Source's Scalar Size not supported");
479 
480     // Extract the parts and put each extracted sources through U/SADDLV and put
481     // the values inside a small vec
482     extractParts(SrcReg, SrcTy, MainTy, LeftoverTy, WorkingRegisters,
483                  LeftoverRegs, B, MRI);
484     for (unsigned I = 0; I < LeftoverRegs.size(); I++) {
485       WorkingRegisters.push_back(LeftoverRegs[I]);
486     }
487   } else {
488     WorkingRegisters.push_back(SrcReg);
489     MainTy = SrcTy;
490   }
491 
492   unsigned MidScalarSize = MainTy.getScalarSizeInBits() * 2;
493   LLT MidScalarLLT = LLT::scalar(MidScalarSize);
494   Register zeroReg = B.buildConstant(LLT::scalar(64), 0).getReg(0);
495   for (unsigned I = 0; I < WorkingRegisters.size(); I++) {
496     // If the number of elements is too small to build an instruction, extend
497     // its size before applying addlv
498     LLT WorkingRegTy = MRI.getType(WorkingRegisters[I]);
499     if ((WorkingRegTy.getScalarSizeInBits() == 8) &&
500         (WorkingRegTy.getNumElements() == 4)) {
501       WorkingRegisters[I] =
502           B.buildInstr(std::get<1>(MatchInfo) ? TargetOpcode::G_SEXT
503                                               : TargetOpcode::G_ZEXT,
504                        {LLT::fixed_vector(4, 16)}, {WorkingRegisters[I]})
505               .getReg(0);
506     }
507 
508     // Generate the {U/S}ADDLV instruction, whose output is always double of the
509     // Src's Scalar size
510     LLT addlvTy = MidScalarSize <= 32 ? LLT::fixed_vector(4, 32)
511                                       : LLT::fixed_vector(2, 64);
512     Register addlvReg =
513         B.buildInstr(Opc, {addlvTy}, {WorkingRegisters[I]}).getReg(0);
514 
515     // The output from {U/S}ADDLV gets placed in the lowest lane of a v4i32 or
516     // v2i64 register.
517     //     i16, i32 results uses v4i32 registers
518     //     i64      results uses v2i64 registers
519     // Therefore we have to extract/truncate the the value to the right type
520     if (MidScalarSize == 32 || MidScalarSize == 64) {
521       WorkingRegisters[I] = B.buildInstr(AArch64::G_EXTRACT_VECTOR_ELT,
522                                          {MidScalarLLT}, {addlvReg, zeroReg})
523                                 .getReg(0);
524     } else {
525       Register extractReg = B.buildInstr(AArch64::G_EXTRACT_VECTOR_ELT,
526                                          {LLT::scalar(32)}, {addlvReg, zeroReg})
527                                 .getReg(0);
528       WorkingRegisters[I] =
529           B.buildTrunc({MidScalarLLT}, {extractReg}).getReg(0);
530     }
531   }
532 
533   Register outReg;
534   if (WorkingRegisters.size() > 1) {
535     outReg = B.buildAdd(MidScalarLLT, WorkingRegisters[0], WorkingRegisters[1])
536                  .getReg(0);
537     for (unsigned I = 2; I < WorkingRegisters.size(); I++) {
538       outReg = B.buildAdd(MidScalarLLT, outReg, WorkingRegisters[I]).getReg(0);
539     }
540   } else {
541     outReg = WorkingRegisters[0];
542   }
543 
544   if (DstTy.getScalarSizeInBits() > MidScalarSize) {
545     // Handle the scalar value if the DstTy's Scalar Size is more than double
546     // Src's ScalarType
547     B.buildInstr(std::get<1>(MatchInfo) ? TargetOpcode::G_SEXT
548                                         : TargetOpcode::G_ZEXT,
549                  {DstReg}, {outReg});
550   } else {
551     B.buildCopy(DstReg, outReg);
552   }
553 
554   MI.eraseFromParent();
555 }
556 
557 bool tryToSimplifyUADDO(MachineInstr &MI, MachineIRBuilder &B,
558                         CombinerHelper &Helper, GISelChangeObserver &Observer) {
559   // Try simplify G_UADDO with 8 or 16 bit operands to wide G_ADD and TBNZ if
560   // result is only used in the no-overflow case. It is restricted to cases
561   // where we know that the high-bits of the operands are 0. If there's an
562   // overflow, then the 9th or 17th bit must be set, which can be checked
563   // using TBNZ.
564   //
565   // Change (for UADDOs on 8 and 16 bits):
566   //
567   //   %z0 = G_ASSERT_ZEXT _
568   //   %op0 = G_TRUNC %z0
569   //   %z1 = G_ASSERT_ZEXT _
570   //   %op1 = G_TRUNC %z1
571   //   %val, %cond = G_UADDO %op0, %op1
572   //   G_BRCOND %cond, %error.bb
573   //
574   // error.bb:
575   //   (no successors and no uses of %val)
576   //
577   // To:
578   //
579   //   %z0 = G_ASSERT_ZEXT _
580   //   %z1 = G_ASSERT_ZEXT _
581   //   %add = G_ADD %z0, %z1
582   //   %val = G_TRUNC %add
583   //   %bit = G_AND %add, 1 << scalar-size-in-bits(%op1)
584   //   %cond = G_ICMP NE, %bit, 0
585   //   G_BRCOND %cond, %error.bb
586 
587   auto &MRI = *B.getMRI();
588 
589   MachineOperand *DefOp0 = MRI.getOneDef(MI.getOperand(2).getReg());
590   MachineOperand *DefOp1 = MRI.getOneDef(MI.getOperand(3).getReg());
591   Register Op0Wide;
592   Register Op1Wide;
593   if (!mi_match(DefOp0->getParent(), MRI, m_GTrunc(m_Reg(Op0Wide))) ||
594       !mi_match(DefOp1->getParent(), MRI, m_GTrunc(m_Reg(Op1Wide))))
595     return false;
596   LLT WideTy0 = MRI.getType(Op0Wide);
597   LLT WideTy1 = MRI.getType(Op1Wide);
598   Register ResVal = MI.getOperand(0).getReg();
599   LLT OpTy = MRI.getType(ResVal);
600   MachineInstr *Op0WideDef = MRI.getVRegDef(Op0Wide);
601   MachineInstr *Op1WideDef = MRI.getVRegDef(Op1Wide);
602 
603   unsigned OpTySize = OpTy.getScalarSizeInBits();
604   // First check that the G_TRUNC feeding the G_UADDO are no-ops, because the
605   // inputs have been zero-extended.
606   if (Op0WideDef->getOpcode() != TargetOpcode::G_ASSERT_ZEXT ||
607       Op1WideDef->getOpcode() != TargetOpcode::G_ASSERT_ZEXT ||
608       OpTySize != Op0WideDef->getOperand(2).getImm() ||
609       OpTySize != Op1WideDef->getOperand(2).getImm())
610     return false;
611 
612   // Only scalar UADDO with either 8 or 16 bit operands are handled.
613   if (!WideTy0.isScalar() || !WideTy1.isScalar() || WideTy0 != WideTy1 ||
614       OpTySize >= WideTy0.getScalarSizeInBits() ||
615       (OpTySize != 8 && OpTySize != 16))
616     return false;
617 
618   // The overflow-status result must be used by a branch only.
619   Register ResStatus = MI.getOperand(1).getReg();
620   if (!MRI.hasOneNonDBGUse(ResStatus))
621     return false;
622   MachineInstr *CondUser = &*MRI.use_instr_nodbg_begin(ResStatus);
623   if (CondUser->getOpcode() != TargetOpcode::G_BRCOND)
624     return false;
625 
626   // Make sure the computed result is only used in the no-overflow blocks.
627   MachineBasicBlock *CurrentMBB = MI.getParent();
628   MachineBasicBlock *FailMBB = CondUser->getOperand(1).getMBB();
629   if (!FailMBB->succ_empty() || CondUser->getParent() != CurrentMBB)
630     return false;
631   if (any_of(MRI.use_nodbg_instructions(ResVal),
632              [&MI, FailMBB, CurrentMBB](MachineInstr &I) {
633                return &MI != &I &&
634                       (I.getParent() == FailMBB || I.getParent() == CurrentMBB);
635              }))
636     return false;
637 
638   // Remove G_ADDO.
639   B.setInstrAndDebugLoc(*MI.getNextNode());
640   MI.eraseFromParent();
641 
642   // Emit wide add.
643   Register AddDst = MRI.cloneVirtualRegister(Op0Wide);
644   B.buildInstr(TargetOpcode::G_ADD, {AddDst}, {Op0Wide, Op1Wide});
645 
646   // Emit check of the 9th or 17th bit and update users (the branch). This will
647   // later be folded to TBNZ.
648   Register CondBit = MRI.cloneVirtualRegister(Op0Wide);
649   B.buildAnd(
650       CondBit, AddDst,
651       B.buildConstant(LLT::scalar(32), OpTySize == 8 ? 1 << 8 : 1 << 16));
652   B.buildICmp(CmpInst::ICMP_NE, ResStatus, CondBit,
653               B.buildConstant(LLT::scalar(32), 0));
654 
655   // Update ZEXts users of the result value. Because all uses are in the
656   // no-overflow case, we know that the top bits are 0 and we can ignore ZExts.
657   B.buildZExtOrTrunc(ResVal, AddDst);
658   for (MachineOperand &U : make_early_inc_range(MRI.use_operands(ResVal))) {
659     Register WideReg;
660     if (mi_match(U.getParent(), MRI, m_GZExt(m_Reg(WideReg)))) {
661       auto OldR = U.getParent()->getOperand(0).getReg();
662       Observer.erasingInstr(*U.getParent());
663       U.getParent()->eraseFromParent();
664       Helper.replaceRegWith(MRI, OldR, AddDst);
665     }
666   }
667 
668   return true;
669 }
670 
671 class AArch64PreLegalizerCombinerImpl : public Combiner {
672 protected:
673   // TODO: Make CombinerHelper methods const.
674   mutable CombinerHelper Helper;
675   const AArch64PreLegalizerCombinerImplRuleConfig &RuleConfig;
676   const AArch64Subtarget &STI;
677 
678 public:
679   AArch64PreLegalizerCombinerImpl(
680       MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC,
681       GISelKnownBits &KB, GISelCSEInfo *CSEInfo,
682       const AArch64PreLegalizerCombinerImplRuleConfig &RuleConfig,
683       const AArch64Subtarget &STI, MachineDominatorTree *MDT,
684       const LegalizerInfo *LI);
685 
686   static const char *getName() { return "AArch6400PreLegalizerCombiner"; }
687 
688   bool tryCombineAll(MachineInstr &I) const override;
689 
690   bool tryCombineAllImpl(MachineInstr &I) const;
691 
692 private:
693 #define GET_GICOMBINER_CLASS_MEMBERS
694 #include "AArch64GenPreLegalizeGICombiner.inc"
695 #undef GET_GICOMBINER_CLASS_MEMBERS
696 };
697 
698 #define GET_GICOMBINER_IMPL
699 #include "AArch64GenPreLegalizeGICombiner.inc"
700 #undef GET_GICOMBINER_IMPL
701 
702 AArch64PreLegalizerCombinerImpl::AArch64PreLegalizerCombinerImpl(
703     MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC,
704     GISelKnownBits &KB, GISelCSEInfo *CSEInfo,
705     const AArch64PreLegalizerCombinerImplRuleConfig &RuleConfig,
706     const AArch64Subtarget &STI, MachineDominatorTree *MDT,
707     const LegalizerInfo *LI)
708     : Combiner(MF, CInfo, TPC, &KB, CSEInfo),
709       Helper(Observer, B, /*IsPreLegalize*/ true, &KB, MDT, LI),
710       RuleConfig(RuleConfig), STI(STI),
711 #define GET_GICOMBINER_CONSTRUCTOR_INITS
712 #include "AArch64GenPreLegalizeGICombiner.inc"
713 #undef GET_GICOMBINER_CONSTRUCTOR_INITS
714 {
715 }
716 
717 bool AArch64PreLegalizerCombinerImpl::tryCombineAll(MachineInstr &MI) const {
718   if (tryCombineAllImpl(MI))
719     return true;
720 
721   unsigned Opc = MI.getOpcode();
722   switch (Opc) {
723   case TargetOpcode::G_CONCAT_VECTORS:
724     return Helper.tryCombineConcatVectors(MI);
725   case TargetOpcode::G_SHUFFLE_VECTOR:
726     return Helper.tryCombineShuffleVector(MI);
727   case TargetOpcode::G_UADDO:
728     return tryToSimplifyUADDO(MI, B, Helper, Observer);
729   case TargetOpcode::G_MEMCPY_INLINE:
730     return Helper.tryEmitMemcpyInline(MI);
731   case TargetOpcode::G_MEMCPY:
732   case TargetOpcode::G_MEMMOVE:
733   case TargetOpcode::G_MEMSET: {
734     // If we're at -O0 set a maxlen of 32 to inline, otherwise let the other
735     // heuristics decide.
736     unsigned MaxLen = CInfo.EnableOpt ? 0 : 32;
737     // Try to inline memcpy type calls if optimizations are enabled.
738     if (Helper.tryCombineMemCpyFamily(MI, MaxLen))
739       return true;
740     if (Opc == TargetOpcode::G_MEMSET)
741       return llvm::AArch64GISelUtils::tryEmitBZero(MI, B, CInfo.EnableMinSize);
742     return false;
743   }
744   }
745 
746   return false;
747 }
748 
749 // Pass boilerplate
750 // ================
751 
752 class AArch64PreLegalizerCombiner : public MachineFunctionPass {
753 public:
754   static char ID;
755 
756   AArch64PreLegalizerCombiner();
757 
758   StringRef getPassName() const override {
759     return "AArch64PreLegalizerCombiner";
760   }
761 
762   bool runOnMachineFunction(MachineFunction &MF) override;
763 
764   void getAnalysisUsage(AnalysisUsage &AU) const override;
765 
766 private:
767   AArch64PreLegalizerCombinerImplRuleConfig RuleConfig;
768 };
769 } // end anonymous namespace
770 
771 void AArch64PreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
772   AU.addRequired<TargetPassConfig>();
773   AU.setPreservesCFG();
774   getSelectionDAGFallbackAnalysisUsage(AU);
775   AU.addRequired<GISelKnownBitsAnalysis>();
776   AU.addPreserved<GISelKnownBitsAnalysis>();
777   AU.addRequired<MachineDominatorTree>();
778   AU.addPreserved<MachineDominatorTree>();
779   AU.addRequired<GISelCSEAnalysisWrapperPass>();
780   AU.addPreserved<GISelCSEAnalysisWrapperPass>();
781   MachineFunctionPass::getAnalysisUsage(AU);
782 }
783 
784 AArch64PreLegalizerCombiner::AArch64PreLegalizerCombiner()
785     : MachineFunctionPass(ID) {
786   initializeAArch64PreLegalizerCombinerPass(*PassRegistry::getPassRegistry());
787 
788   if (!RuleConfig.parseCommandLineOption())
789     report_fatal_error("Invalid rule identifier");
790 }
791 
792 bool AArch64PreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
793   if (MF.getProperties().hasProperty(
794           MachineFunctionProperties::Property::FailedISel))
795     return false;
796   auto &TPC = getAnalysis<TargetPassConfig>();
797 
798   // Enable CSE.
799   GISelCSEAnalysisWrapper &Wrapper =
800       getAnalysis<GISelCSEAnalysisWrapperPass>().getCSEWrapper();
801   auto *CSEInfo = &Wrapper.get(TPC.getCSEConfig());
802 
803   const AArch64Subtarget &ST = MF.getSubtarget<AArch64Subtarget>();
804   const auto *LI = ST.getLegalizerInfo();
805 
806   const Function &F = MF.getFunction();
807   bool EnableOpt =
808       MF.getTarget().getOptLevel() != CodeGenOptLevel::None && !skipFunction(F);
809   GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
810   MachineDominatorTree *MDT = &getAnalysis<MachineDominatorTree>();
811   CombinerInfo CInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false,
812                      /*LegalizerInfo*/ nullptr, EnableOpt, F.hasOptSize(),
813                      F.hasMinSize());
814   AArch64PreLegalizerCombinerImpl Impl(MF, CInfo, &TPC, *KB, CSEInfo,
815                                        RuleConfig, ST, MDT, LI);
816   return Impl.combineMachineInstrs();
817 }
818 
819 char AArch64PreLegalizerCombiner::ID = 0;
820 INITIALIZE_PASS_BEGIN(AArch64PreLegalizerCombiner, DEBUG_TYPE,
821                       "Combine AArch64 machine instrs before legalization",
822                       false, false)
823 INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
824 INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
825 INITIALIZE_PASS_DEPENDENCY(GISelCSEAnalysisWrapperPass)
826 INITIALIZE_PASS_END(AArch64PreLegalizerCombiner, DEBUG_TYPE,
827                     "Combine AArch64 machine instrs before legalization", false,
828                     false)
829 
830 namespace llvm {
831 FunctionPass *createAArch64PreLegalizerCombiner() {
832   return new AArch64PreLegalizerCombiner();
833 }
834 } // end namespace llvm
835