1 //===-- llvm/CodeGen/GlobalISel/LegalizerHelper.cpp -----------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 /// \file This file implements the LegalizerHelper class to legalize
10 /// individual instructions and the LegalizeMachineIR wrapper pass for the
11 /// primary legalization.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
16 #include "llvm/CodeGen/GlobalISel/CallLowering.h"
17 #include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
18 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
19 #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
20 #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
21 #include "llvm/CodeGen/GlobalISel/LostDebugLocObserver.h"
22 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
23 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
24 #include "llvm/CodeGen/GlobalISel/Utils.h"
25 #include "llvm/CodeGen/MachineConstantPool.h"
26 #include "llvm/CodeGen/MachineFrameInfo.h"
27 #include "llvm/CodeGen/MachineRegisterInfo.h"
28 #include "llvm/CodeGen/RuntimeLibcalls.h"
29 #include "llvm/CodeGen/TargetFrameLowering.h"
30 #include "llvm/CodeGen/TargetInstrInfo.h"
31 #include "llvm/CodeGen/TargetLowering.h"
32 #include "llvm/CodeGen/TargetOpcodes.h"
33 #include "llvm/CodeGen/TargetSubtargetInfo.h"
34 #include "llvm/IR/Instructions.h"
35 #include "llvm/Support/Debug.h"
36 #include "llvm/Support/MathExtras.h"
37 #include "llvm/Support/raw_ostream.h"
38 #include "llvm/Target/TargetMachine.h"
39 #include <numeric>
40 #include <optional>
41 
42 #define DEBUG_TYPE "legalizer"
43 
44 using namespace llvm;
45 using namespace LegalizeActions;
46 using namespace MIPatternMatch;
47 
48 /// Try to break down \p OrigTy into \p NarrowTy sized pieces.
49 ///
50 /// Returns the number of \p NarrowTy elements needed to reconstruct \p OrigTy,
51 /// with any leftover piece as type \p LeftoverTy
52 ///
53 /// Returns -1 in the first element of the pair if the breakdown is not
54 /// satisfiable.
55 static std::pair<int, int>
56 getNarrowTypeBreakDown(LLT OrigTy, LLT NarrowTy, LLT &LeftoverTy) {
57   assert(!LeftoverTy.isValid() && "this is an out argument");
58 
59   unsigned Size = OrigTy.getSizeInBits();
60   unsigned NarrowSize = NarrowTy.getSizeInBits();
61   unsigned NumParts = Size / NarrowSize;
62   unsigned LeftoverSize = Size - NumParts * NarrowSize;
63   assert(Size > NarrowSize);
64 
65   if (LeftoverSize == 0)
66     return {NumParts, 0};
67 
68   if (NarrowTy.isVector()) {
69     unsigned EltSize = OrigTy.getScalarSizeInBits();
70     if (LeftoverSize % EltSize != 0)
71       return {-1, -1};
72     LeftoverTy = LLT::scalarOrVector(
73         ElementCount::getFixed(LeftoverSize / EltSize), EltSize);
74   } else {
75     LeftoverTy = LLT::scalar(LeftoverSize);
76   }
77 
78   int NumLeftover = LeftoverSize / LeftoverTy.getSizeInBits();
79   return std::make_pair(NumParts, NumLeftover);
80 }
81 
82 static Type *getFloatTypeForLLT(LLVMContext &Ctx, LLT Ty) {
83 
84   if (!Ty.isScalar())
85     return nullptr;
86 
87   switch (Ty.getSizeInBits()) {
88   case 16:
89     return Type::getHalfTy(Ctx);
90   case 32:
91     return Type::getFloatTy(Ctx);
92   case 64:
93     return Type::getDoubleTy(Ctx);
94   case 80:
95     return Type::getX86_FP80Ty(Ctx);
96   case 128:
97     return Type::getFP128Ty(Ctx);
98   default:
99     return nullptr;
100   }
101 }
102 
103 LegalizerHelper::LegalizerHelper(MachineFunction &MF,
104                                  GISelChangeObserver &Observer,
105                                  MachineIRBuilder &Builder)
106     : MIRBuilder(Builder), Observer(Observer), MRI(MF.getRegInfo()),
107       LI(*MF.getSubtarget().getLegalizerInfo()),
108       TLI(*MF.getSubtarget().getTargetLowering()), KB(nullptr) {}
109 
110 LegalizerHelper::LegalizerHelper(MachineFunction &MF, const LegalizerInfo &LI,
111                                  GISelChangeObserver &Observer,
112                                  MachineIRBuilder &B, GISelKnownBits *KB)
113     : MIRBuilder(B), Observer(Observer), MRI(MF.getRegInfo()), LI(LI),
114       TLI(*MF.getSubtarget().getTargetLowering()), KB(KB) {}
115 
116 LegalizerHelper::LegalizeResult
117 LegalizerHelper::legalizeInstrStep(MachineInstr &MI,
118                                    LostDebugLocObserver &LocObserver) {
119   LLVM_DEBUG(dbgs() << "Legalizing: " << MI);
120 
121   MIRBuilder.setInstrAndDebugLoc(MI);
122 
123   if (isa<GIntrinsic>(MI))
124     return LI.legalizeIntrinsic(*this, MI) ? Legalized : UnableToLegalize;
125   auto Step = LI.getAction(MI, MRI);
126   switch (Step.Action) {
127   case Legal:
128     LLVM_DEBUG(dbgs() << ".. Already legal\n");
129     return AlreadyLegal;
130   case Libcall:
131     LLVM_DEBUG(dbgs() << ".. Convert to libcall\n");
132     return libcall(MI, LocObserver);
133   case NarrowScalar:
134     LLVM_DEBUG(dbgs() << ".. Narrow scalar\n");
135     return narrowScalar(MI, Step.TypeIdx, Step.NewType);
136   case WidenScalar:
137     LLVM_DEBUG(dbgs() << ".. Widen scalar\n");
138     return widenScalar(MI, Step.TypeIdx, Step.NewType);
139   case Bitcast:
140     LLVM_DEBUG(dbgs() << ".. Bitcast type\n");
141     return bitcast(MI, Step.TypeIdx, Step.NewType);
142   case Lower:
143     LLVM_DEBUG(dbgs() << ".. Lower\n");
144     return lower(MI, Step.TypeIdx, Step.NewType);
145   case FewerElements:
146     LLVM_DEBUG(dbgs() << ".. Reduce number of elements\n");
147     return fewerElementsVector(MI, Step.TypeIdx, Step.NewType);
148   case MoreElements:
149     LLVM_DEBUG(dbgs() << ".. Increase number of elements\n");
150     return moreElementsVector(MI, Step.TypeIdx, Step.NewType);
151   case Custom:
152     LLVM_DEBUG(dbgs() << ".. Custom legalization\n");
153     return LI.legalizeCustom(*this, MI, LocObserver) ? Legalized
154                                                      : UnableToLegalize;
155   default:
156     LLVM_DEBUG(dbgs() << ".. Unable to legalize\n");
157     return UnableToLegalize;
158   }
159 }
160 
161 void LegalizerHelper::insertParts(Register DstReg,
162                                   LLT ResultTy, LLT PartTy,
163                                   ArrayRef<Register> PartRegs,
164                                   LLT LeftoverTy,
165                                   ArrayRef<Register> LeftoverRegs) {
166   if (!LeftoverTy.isValid()) {
167     assert(LeftoverRegs.empty());
168 
169     if (!ResultTy.isVector()) {
170       MIRBuilder.buildMergeLikeInstr(DstReg, PartRegs);
171       return;
172     }
173 
174     if (PartTy.isVector())
175       MIRBuilder.buildConcatVectors(DstReg, PartRegs);
176     else
177       MIRBuilder.buildBuildVector(DstReg, PartRegs);
178     return;
179   }
180 
181   // Merge sub-vectors with different number of elements and insert into DstReg.
182   if (ResultTy.isVector()) {
183     assert(LeftoverRegs.size() == 1 && "Expected one leftover register");
184     SmallVector<Register, 8> AllRegs;
185     for (auto Reg : concat<const Register>(PartRegs, LeftoverRegs))
186       AllRegs.push_back(Reg);
187     return mergeMixedSubvectors(DstReg, AllRegs);
188   }
189 
190   SmallVector<Register> GCDRegs;
191   LLT GCDTy = getGCDType(getGCDType(ResultTy, LeftoverTy), PartTy);
192   for (auto PartReg : concat<const Register>(PartRegs, LeftoverRegs))
193     extractGCDType(GCDRegs, GCDTy, PartReg);
194   LLT ResultLCMTy = buildLCMMergePieces(ResultTy, LeftoverTy, GCDTy, GCDRegs);
195   buildWidenedRemergeToDst(DstReg, ResultLCMTy, GCDRegs);
196 }
197 
198 void LegalizerHelper::appendVectorElts(SmallVectorImpl<Register> &Elts,
199                                        Register Reg) {
200   LLT Ty = MRI.getType(Reg);
201   SmallVector<Register, 8> RegElts;
202   extractParts(Reg, Ty.getScalarType(), Ty.getNumElements(), RegElts,
203                MIRBuilder, MRI);
204   Elts.append(RegElts);
205 }
206 
207 /// Merge \p PartRegs with different types into \p DstReg.
208 void LegalizerHelper::mergeMixedSubvectors(Register DstReg,
209                                            ArrayRef<Register> PartRegs) {
210   SmallVector<Register, 8> AllElts;
211   for (unsigned i = 0; i < PartRegs.size() - 1; ++i)
212     appendVectorElts(AllElts, PartRegs[i]);
213 
214   Register Leftover = PartRegs[PartRegs.size() - 1];
215   if (MRI.getType(Leftover).isScalar())
216     AllElts.push_back(Leftover);
217   else
218     appendVectorElts(AllElts, Leftover);
219 
220   MIRBuilder.buildMergeLikeInstr(DstReg, AllElts);
221 }
222 
223 /// Append the result registers of G_UNMERGE_VALUES \p MI to \p Regs.
224 static void getUnmergeResults(SmallVectorImpl<Register> &Regs,
225                               const MachineInstr &MI) {
226   assert(MI.getOpcode() == TargetOpcode::G_UNMERGE_VALUES);
227 
228   const int StartIdx = Regs.size();
229   const int NumResults = MI.getNumOperands() - 1;
230   Regs.resize(Regs.size() + NumResults);
231   for (int I = 0; I != NumResults; ++I)
232     Regs[StartIdx + I] = MI.getOperand(I).getReg();
233 }
234 
235 void LegalizerHelper::extractGCDType(SmallVectorImpl<Register> &Parts,
236                                      LLT GCDTy, Register SrcReg) {
237   LLT SrcTy = MRI.getType(SrcReg);
238   if (SrcTy == GCDTy) {
239     // If the source already evenly divides the result type, we don't need to do
240     // anything.
241     Parts.push_back(SrcReg);
242   } else {
243     // Need to split into common type sized pieces.
244     auto Unmerge = MIRBuilder.buildUnmerge(GCDTy, SrcReg);
245     getUnmergeResults(Parts, *Unmerge);
246   }
247 }
248 
249 LLT LegalizerHelper::extractGCDType(SmallVectorImpl<Register> &Parts, LLT DstTy,
250                                     LLT NarrowTy, Register SrcReg) {
251   LLT SrcTy = MRI.getType(SrcReg);
252   LLT GCDTy = getGCDType(getGCDType(SrcTy, NarrowTy), DstTy);
253   extractGCDType(Parts, GCDTy, SrcReg);
254   return GCDTy;
255 }
256 
257 LLT LegalizerHelper::buildLCMMergePieces(LLT DstTy, LLT NarrowTy, LLT GCDTy,
258                                          SmallVectorImpl<Register> &VRegs,
259                                          unsigned PadStrategy) {
260   LLT LCMTy = getLCMType(DstTy, NarrowTy);
261 
262   int NumParts = LCMTy.getSizeInBits() / NarrowTy.getSizeInBits();
263   int NumSubParts = NarrowTy.getSizeInBits() / GCDTy.getSizeInBits();
264   int NumOrigSrc = VRegs.size();
265 
266   Register PadReg;
267 
268   // Get a value we can use to pad the source value if the sources won't evenly
269   // cover the result type.
270   if (NumOrigSrc < NumParts * NumSubParts) {
271     if (PadStrategy == TargetOpcode::G_ZEXT)
272       PadReg = MIRBuilder.buildConstant(GCDTy, 0).getReg(0);
273     else if (PadStrategy == TargetOpcode::G_ANYEXT)
274       PadReg = MIRBuilder.buildUndef(GCDTy).getReg(0);
275     else {
276       assert(PadStrategy == TargetOpcode::G_SEXT);
277 
278       // Shift the sign bit of the low register through the high register.
279       auto ShiftAmt =
280         MIRBuilder.buildConstant(LLT::scalar(64), GCDTy.getSizeInBits() - 1);
281       PadReg = MIRBuilder.buildAShr(GCDTy, VRegs.back(), ShiftAmt).getReg(0);
282     }
283   }
284 
285   // Registers for the final merge to be produced.
286   SmallVector<Register, 4> Remerge(NumParts);
287 
288   // Registers needed for intermediate merges, which will be merged into a
289   // source for Remerge.
290   SmallVector<Register, 4> SubMerge(NumSubParts);
291 
292   // Once we've fully read off the end of the original source bits, we can reuse
293   // the same high bits for remaining padding elements.
294   Register AllPadReg;
295 
296   // Build merges to the LCM type to cover the original result type.
297   for (int I = 0; I != NumParts; ++I) {
298     bool AllMergePartsArePadding = true;
299 
300     // Build the requested merges to the requested type.
301     for (int J = 0; J != NumSubParts; ++J) {
302       int Idx = I * NumSubParts + J;
303       if (Idx >= NumOrigSrc) {
304         SubMerge[J] = PadReg;
305         continue;
306       }
307 
308       SubMerge[J] = VRegs[Idx];
309 
310       // There are meaningful bits here we can't reuse later.
311       AllMergePartsArePadding = false;
312     }
313 
314     // If we've filled up a complete piece with padding bits, we can directly
315     // emit the natural sized constant if applicable, rather than a merge of
316     // smaller constants.
317     if (AllMergePartsArePadding && !AllPadReg) {
318       if (PadStrategy == TargetOpcode::G_ANYEXT)
319         AllPadReg = MIRBuilder.buildUndef(NarrowTy).getReg(0);
320       else if (PadStrategy == TargetOpcode::G_ZEXT)
321         AllPadReg = MIRBuilder.buildConstant(NarrowTy, 0).getReg(0);
322 
323       // If this is a sign extension, we can't materialize a trivial constant
324       // with the right type and have to produce a merge.
325     }
326 
327     if (AllPadReg) {
328       // Avoid creating additional instructions if we're just adding additional
329       // copies of padding bits.
330       Remerge[I] = AllPadReg;
331       continue;
332     }
333 
334     if (NumSubParts == 1)
335       Remerge[I] = SubMerge[0];
336     else
337       Remerge[I] = MIRBuilder.buildMergeLikeInstr(NarrowTy, SubMerge).getReg(0);
338 
339     // In the sign extend padding case, re-use the first all-signbit merge.
340     if (AllMergePartsArePadding && !AllPadReg)
341       AllPadReg = Remerge[I];
342   }
343 
344   VRegs = std::move(Remerge);
345   return LCMTy;
346 }
347 
348 void LegalizerHelper::buildWidenedRemergeToDst(Register DstReg, LLT LCMTy,
349                                                ArrayRef<Register> RemergeRegs) {
350   LLT DstTy = MRI.getType(DstReg);
351 
352   // Create the merge to the widened source, and extract the relevant bits into
353   // the result.
354 
355   if (DstTy == LCMTy) {
356     MIRBuilder.buildMergeLikeInstr(DstReg, RemergeRegs);
357     return;
358   }
359 
360   auto Remerge = MIRBuilder.buildMergeLikeInstr(LCMTy, RemergeRegs);
361   if (DstTy.isScalar() && LCMTy.isScalar()) {
362     MIRBuilder.buildTrunc(DstReg, Remerge);
363     return;
364   }
365 
366   if (LCMTy.isVector()) {
367     unsigned NumDefs = LCMTy.getSizeInBits() / DstTy.getSizeInBits();
368     SmallVector<Register, 8> UnmergeDefs(NumDefs);
369     UnmergeDefs[0] = DstReg;
370     for (unsigned I = 1; I != NumDefs; ++I)
371       UnmergeDefs[I] = MRI.createGenericVirtualRegister(DstTy);
372 
373     MIRBuilder.buildUnmerge(UnmergeDefs,
374                             MIRBuilder.buildMergeLikeInstr(LCMTy, RemergeRegs));
375     return;
376   }
377 
378   llvm_unreachable("unhandled case");
379 }
380 
381 static RTLIB::Libcall getRTLibDesc(unsigned Opcode, unsigned Size) {
382 #define RTLIBCASE_INT(LibcallPrefix)                                           \
383   do {                                                                         \
384     switch (Size) {                                                            \
385     case 32:                                                                   \
386       return RTLIB::LibcallPrefix##32;                                         \
387     case 64:                                                                   \
388       return RTLIB::LibcallPrefix##64;                                         \
389     case 128:                                                                  \
390       return RTLIB::LibcallPrefix##128;                                        \
391     default:                                                                   \
392       llvm_unreachable("unexpected size");                                     \
393     }                                                                          \
394   } while (0)
395 
396 #define RTLIBCASE(LibcallPrefix)                                               \
397   do {                                                                         \
398     switch (Size) {                                                            \
399     case 32:                                                                   \
400       return RTLIB::LibcallPrefix##32;                                         \
401     case 64:                                                                   \
402       return RTLIB::LibcallPrefix##64;                                         \
403     case 80:                                                                   \
404       return RTLIB::LibcallPrefix##80;                                         \
405     case 128:                                                                  \
406       return RTLIB::LibcallPrefix##128;                                        \
407     default:                                                                   \
408       llvm_unreachable("unexpected size");                                     \
409     }                                                                          \
410   } while (0)
411 
412   switch (Opcode) {
413   case TargetOpcode::G_MUL:
414     RTLIBCASE_INT(MUL_I);
415   case TargetOpcode::G_SDIV:
416     RTLIBCASE_INT(SDIV_I);
417   case TargetOpcode::G_UDIV:
418     RTLIBCASE_INT(UDIV_I);
419   case TargetOpcode::G_SREM:
420     RTLIBCASE_INT(SREM_I);
421   case TargetOpcode::G_UREM:
422     RTLIBCASE_INT(UREM_I);
423   case TargetOpcode::G_CTLZ_ZERO_UNDEF:
424     RTLIBCASE_INT(CTLZ_I);
425   case TargetOpcode::G_FADD:
426     RTLIBCASE(ADD_F);
427   case TargetOpcode::G_FSUB:
428     RTLIBCASE(SUB_F);
429   case TargetOpcode::G_FMUL:
430     RTLIBCASE(MUL_F);
431   case TargetOpcode::G_FDIV:
432     RTLIBCASE(DIV_F);
433   case TargetOpcode::G_FEXP:
434     RTLIBCASE(EXP_F);
435   case TargetOpcode::G_FEXP2:
436     RTLIBCASE(EXP2_F);
437   case TargetOpcode::G_FEXP10:
438     RTLIBCASE(EXP10_F);
439   case TargetOpcode::G_FREM:
440     RTLIBCASE(REM_F);
441   case TargetOpcode::G_FPOW:
442     RTLIBCASE(POW_F);
443   case TargetOpcode::G_FPOWI:
444     RTLIBCASE(POWI_F);
445   case TargetOpcode::G_FMA:
446     RTLIBCASE(FMA_F);
447   case TargetOpcode::G_FSIN:
448     RTLIBCASE(SIN_F);
449   case TargetOpcode::G_FCOS:
450     RTLIBCASE(COS_F);
451   case TargetOpcode::G_FLOG10:
452     RTLIBCASE(LOG10_F);
453   case TargetOpcode::G_FLOG:
454     RTLIBCASE(LOG_F);
455   case TargetOpcode::G_FLOG2:
456     RTLIBCASE(LOG2_F);
457   case TargetOpcode::G_FLDEXP:
458     RTLIBCASE(LDEXP_F);
459   case TargetOpcode::G_FCEIL:
460     RTLIBCASE(CEIL_F);
461   case TargetOpcode::G_FFLOOR:
462     RTLIBCASE(FLOOR_F);
463   case TargetOpcode::G_FMINNUM:
464     RTLIBCASE(FMIN_F);
465   case TargetOpcode::G_FMAXNUM:
466     RTLIBCASE(FMAX_F);
467   case TargetOpcode::G_FSQRT:
468     RTLIBCASE(SQRT_F);
469   case TargetOpcode::G_FRINT:
470     RTLIBCASE(RINT_F);
471   case TargetOpcode::G_FNEARBYINT:
472     RTLIBCASE(NEARBYINT_F);
473   case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
474     RTLIBCASE(ROUNDEVEN_F);
475   }
476   llvm_unreachable("Unknown libcall function");
477 }
478 
479 /// True if an instruction is in tail position in its caller. Intended for
480 /// legalizing libcalls as tail calls when possible.
481 static bool isLibCallInTailPosition(const CallLowering::ArgInfo &Result,
482                                     MachineInstr &MI,
483                                     const TargetInstrInfo &TII,
484                                     MachineRegisterInfo &MRI) {
485   MachineBasicBlock &MBB = *MI.getParent();
486   const Function &F = MBB.getParent()->getFunction();
487 
488   // Conservatively require the attributes of the call to match those of
489   // the return. Ignore NoAlias and NonNull because they don't affect the
490   // call sequence.
491   AttributeList CallerAttrs = F.getAttributes();
492   if (AttrBuilder(F.getContext(), CallerAttrs.getRetAttrs())
493           .removeAttribute(Attribute::NoAlias)
494           .removeAttribute(Attribute::NonNull)
495           .hasAttributes())
496     return false;
497 
498   // It's not safe to eliminate the sign / zero extension of the return value.
499   if (CallerAttrs.hasRetAttr(Attribute::ZExt) ||
500       CallerAttrs.hasRetAttr(Attribute::SExt))
501     return false;
502 
503   // Only tail call if the following instruction is a standard return or if we
504   // have a `thisreturn` callee, and a sequence like:
505   //
506   //   G_MEMCPY %0, %1, %2
507   //   $x0 = COPY %0
508   //   RET_ReallyLR implicit $x0
509   auto Next = next_nodbg(MI.getIterator(), MBB.instr_end());
510   if (Next != MBB.instr_end() && Next->isCopy()) {
511     if (MI.getOpcode() == TargetOpcode::G_BZERO)
512       return false;
513 
514     // For MEMCPY/MOMMOVE/MEMSET these will be the first use (the dst), as the
515     // mempy/etc routines return the same parameter. For other it will be the
516     // returned value.
517     Register VReg = MI.getOperand(0).getReg();
518     if (!VReg.isVirtual() || VReg != Next->getOperand(1).getReg())
519       return false;
520 
521     Register PReg = Next->getOperand(0).getReg();
522     if (!PReg.isPhysical())
523       return false;
524 
525     auto Ret = next_nodbg(Next, MBB.instr_end());
526     if (Ret == MBB.instr_end() || !Ret->isReturn())
527       return false;
528 
529     if (Ret->getNumImplicitOperands() != 1)
530       return false;
531 
532     if (!Ret->getOperand(0).isReg() || PReg != Ret->getOperand(0).getReg())
533       return false;
534 
535     // Skip over the COPY that we just validated.
536     Next = Ret;
537   }
538 
539   if (Next == MBB.instr_end() || TII.isTailCall(*Next) || !Next->isReturn())
540     return false;
541 
542   return true;
543 }
544 
545 LegalizerHelper::LegalizeResult
546 llvm::createLibcall(MachineIRBuilder &MIRBuilder, const char *Name,
547                     const CallLowering::ArgInfo &Result,
548                     ArrayRef<CallLowering::ArgInfo> Args,
549                     const CallingConv::ID CC, LostDebugLocObserver &LocObserver,
550                     MachineInstr *MI) {
551   auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering();
552 
553   CallLowering::CallLoweringInfo Info;
554   Info.CallConv = CC;
555   Info.Callee = MachineOperand::CreateES(Name);
556   Info.OrigRet = Result;
557   if (MI)
558     Info.IsTailCall =
559         (Result.Ty->isVoidTy() ||
560          Result.Ty == MIRBuilder.getMF().getFunction().getReturnType()) &&
561         isLibCallInTailPosition(Result, *MI, MIRBuilder.getTII(),
562                                 *MIRBuilder.getMRI());
563 
564   std::copy(Args.begin(), Args.end(), std::back_inserter(Info.OrigArgs));
565   if (!CLI.lowerCall(MIRBuilder, Info))
566     return LegalizerHelper::UnableToLegalize;
567 
568   if (MI && Info.LoweredTailCall) {
569     assert(Info.IsTailCall && "Lowered tail call when it wasn't a tail call?");
570 
571     // Check debug locations before removing the return.
572     LocObserver.checkpoint(true);
573 
574     // We must have a return following the call (or debug insts) to get past
575     // isLibCallInTailPosition.
576     do {
577       MachineInstr *Next = MI->getNextNode();
578       assert(Next &&
579              (Next->isCopy() || Next->isReturn() || Next->isDebugInstr()) &&
580              "Expected instr following MI to be return or debug inst?");
581       // We lowered a tail call, so the call is now the return from the block.
582       // Delete the old return.
583       Next->eraseFromParent();
584     } while (MI->getNextNode());
585 
586     // We expect to lose the debug location from the return.
587     LocObserver.checkpoint(false);
588   }
589   return LegalizerHelper::Legalized;
590 }
591 
592 LegalizerHelper::LegalizeResult
593 llvm::createLibcall(MachineIRBuilder &MIRBuilder, RTLIB::Libcall Libcall,
594                     const CallLowering::ArgInfo &Result,
595                     ArrayRef<CallLowering::ArgInfo> Args,
596                     LostDebugLocObserver &LocObserver, MachineInstr *MI) {
597   auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering();
598   const char *Name = TLI.getLibcallName(Libcall);
599   if (!Name)
600     return LegalizerHelper::UnableToLegalize;
601   const CallingConv::ID CC = TLI.getLibcallCallingConv(Libcall);
602   return createLibcall(MIRBuilder, Name, Result, Args, CC, LocObserver, MI);
603 }
604 
605 // Useful for libcalls where all operands have the same type.
606 static LegalizerHelper::LegalizeResult
607 simpleLibcall(MachineInstr &MI, MachineIRBuilder &MIRBuilder, unsigned Size,
608               Type *OpType, LostDebugLocObserver &LocObserver) {
609   auto Libcall = getRTLibDesc(MI.getOpcode(), Size);
610 
611   // FIXME: What does the original arg index mean here?
612   SmallVector<CallLowering::ArgInfo, 3> Args;
613   for (const MachineOperand &MO : llvm::drop_begin(MI.operands()))
614     Args.push_back({MO.getReg(), OpType, 0});
615   return createLibcall(MIRBuilder, Libcall,
616                        {MI.getOperand(0).getReg(), OpType, 0}, Args,
617                        LocObserver, &MI);
618 }
619 
620 LegalizerHelper::LegalizeResult
621 llvm::createMemLibcall(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
622                        MachineInstr &MI, LostDebugLocObserver &LocObserver) {
623   auto &Ctx = MIRBuilder.getMF().getFunction().getContext();
624 
625   SmallVector<CallLowering::ArgInfo, 3> Args;
626   // Add all the args, except for the last which is an imm denoting 'tail'.
627   for (unsigned i = 0; i < MI.getNumOperands() - 1; ++i) {
628     Register Reg = MI.getOperand(i).getReg();
629 
630     // Need derive an IR type for call lowering.
631     LLT OpLLT = MRI.getType(Reg);
632     Type *OpTy = nullptr;
633     if (OpLLT.isPointer())
634       OpTy = PointerType::get(Ctx, OpLLT.getAddressSpace());
635     else
636       OpTy = IntegerType::get(Ctx, OpLLT.getSizeInBits());
637     Args.push_back({Reg, OpTy, 0});
638   }
639 
640   auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering();
641   auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering();
642   RTLIB::Libcall RTLibcall;
643   unsigned Opc = MI.getOpcode();
644   switch (Opc) {
645   case TargetOpcode::G_BZERO:
646     RTLibcall = RTLIB::BZERO;
647     break;
648   case TargetOpcode::G_MEMCPY:
649     RTLibcall = RTLIB::MEMCPY;
650     Args[0].Flags[0].setReturned();
651     break;
652   case TargetOpcode::G_MEMMOVE:
653     RTLibcall = RTLIB::MEMMOVE;
654     Args[0].Flags[0].setReturned();
655     break;
656   case TargetOpcode::G_MEMSET:
657     RTLibcall = RTLIB::MEMSET;
658     Args[0].Flags[0].setReturned();
659     break;
660   default:
661     llvm_unreachable("unsupported opcode");
662   }
663   const char *Name = TLI.getLibcallName(RTLibcall);
664 
665   // Unsupported libcall on the target.
666   if (!Name) {
667     LLVM_DEBUG(dbgs() << ".. .. Could not find libcall name for "
668                       << MIRBuilder.getTII().getName(Opc) << "\n");
669     return LegalizerHelper::UnableToLegalize;
670   }
671 
672   CallLowering::CallLoweringInfo Info;
673   Info.CallConv = TLI.getLibcallCallingConv(RTLibcall);
674   Info.Callee = MachineOperand::CreateES(Name);
675   Info.OrigRet = CallLowering::ArgInfo({0}, Type::getVoidTy(Ctx), 0);
676   Info.IsTailCall =
677       MI.getOperand(MI.getNumOperands() - 1).getImm() &&
678       isLibCallInTailPosition(Info.OrigRet, MI, MIRBuilder.getTII(), MRI);
679 
680   std::copy(Args.begin(), Args.end(), std::back_inserter(Info.OrigArgs));
681   if (!CLI.lowerCall(MIRBuilder, Info))
682     return LegalizerHelper::UnableToLegalize;
683 
684   if (Info.LoweredTailCall) {
685     assert(Info.IsTailCall && "Lowered tail call when it wasn't a tail call?");
686 
687     // Check debug locations before removing the return.
688     LocObserver.checkpoint(true);
689 
690     // We must have a return following the call (or debug insts) to get past
691     // isLibCallInTailPosition.
692     do {
693       MachineInstr *Next = MI.getNextNode();
694       assert(Next &&
695              (Next->isCopy() || Next->isReturn() || Next->isDebugInstr()) &&
696              "Expected instr following MI to be return or debug inst?");
697       // We lowered a tail call, so the call is now the return from the block.
698       // Delete the old return.
699       Next->eraseFromParent();
700     } while (MI.getNextNode());
701 
702     // We expect to lose the debug location from the return.
703     LocObserver.checkpoint(false);
704   }
705 
706   return LegalizerHelper::Legalized;
707 }
708 
709 static RTLIB::Libcall getOutlineAtomicLibcall(MachineInstr &MI) {
710   unsigned Opc = MI.getOpcode();
711   auto &AtomicMI = cast<GMemOperation>(MI);
712   auto &MMO = AtomicMI.getMMO();
713   auto Ordering = MMO.getMergedOrdering();
714   LLT MemType = MMO.getMemoryType();
715   uint64_t MemSize = MemType.getSizeInBytes();
716   if (MemType.isVector())
717     return RTLIB::UNKNOWN_LIBCALL;
718 
719 #define LCALLS(A, B)                                                           \
720   { A##B##_RELAX, A##B##_ACQ, A##B##_REL, A##B##_ACQ_REL }
721 #define LCALL5(A)                                                              \
722   LCALLS(A, 1), LCALLS(A, 2), LCALLS(A, 4), LCALLS(A, 8), LCALLS(A, 16)
723   switch (Opc) {
724   case TargetOpcode::G_ATOMIC_CMPXCHG:
725   case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: {
726     const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_CAS)};
727     return getOutlineAtomicHelper(LC, Ordering, MemSize);
728   }
729   case TargetOpcode::G_ATOMICRMW_XCHG: {
730     const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_SWP)};
731     return getOutlineAtomicHelper(LC, Ordering, MemSize);
732   }
733   case TargetOpcode::G_ATOMICRMW_ADD:
734   case TargetOpcode::G_ATOMICRMW_SUB: {
735     const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_LDADD)};
736     return getOutlineAtomicHelper(LC, Ordering, MemSize);
737   }
738   case TargetOpcode::G_ATOMICRMW_AND: {
739     const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_LDCLR)};
740     return getOutlineAtomicHelper(LC, Ordering, MemSize);
741   }
742   case TargetOpcode::G_ATOMICRMW_OR: {
743     const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_LDSET)};
744     return getOutlineAtomicHelper(LC, Ordering, MemSize);
745   }
746   case TargetOpcode::G_ATOMICRMW_XOR: {
747     const RTLIB::Libcall LC[5][4] = {LCALL5(RTLIB::OUTLINE_ATOMIC_LDEOR)};
748     return getOutlineAtomicHelper(LC, Ordering, MemSize);
749   }
750   default:
751     return RTLIB::UNKNOWN_LIBCALL;
752   }
753 #undef LCALLS
754 #undef LCALL5
755 }
756 
757 static LegalizerHelper::LegalizeResult
758 createAtomicLibcall(MachineIRBuilder &MIRBuilder, MachineInstr &MI) {
759   auto &Ctx = MIRBuilder.getMF().getFunction().getContext();
760 
761   Type *RetTy;
762   SmallVector<Register> RetRegs;
763   SmallVector<CallLowering::ArgInfo, 3> Args;
764   unsigned Opc = MI.getOpcode();
765   switch (Opc) {
766   case TargetOpcode::G_ATOMIC_CMPXCHG:
767   case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: {
768     Register Success;
769     LLT SuccessLLT;
770     auto [Ret, RetLLT, Mem, MemLLT, Cmp, CmpLLT, New, NewLLT] =
771         MI.getFirst4RegLLTs();
772     RetRegs.push_back(Ret);
773     RetTy = IntegerType::get(Ctx, RetLLT.getSizeInBits());
774     if (Opc == TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS) {
775       std::tie(Ret, RetLLT, Success, SuccessLLT, Mem, MemLLT, Cmp, CmpLLT, New,
776                NewLLT) = MI.getFirst5RegLLTs();
777       RetRegs.push_back(Success);
778       RetTy = StructType::get(
779           Ctx, {RetTy, IntegerType::get(Ctx, SuccessLLT.getSizeInBits())});
780     }
781     Args.push_back({Cmp, IntegerType::get(Ctx, CmpLLT.getSizeInBits()), 0});
782     Args.push_back({New, IntegerType::get(Ctx, NewLLT.getSizeInBits()), 0});
783     Args.push_back({Mem, PointerType::get(Ctx, MemLLT.getAddressSpace()), 0});
784     break;
785   }
786   case TargetOpcode::G_ATOMICRMW_XCHG:
787   case TargetOpcode::G_ATOMICRMW_ADD:
788   case TargetOpcode::G_ATOMICRMW_SUB:
789   case TargetOpcode::G_ATOMICRMW_AND:
790   case TargetOpcode::G_ATOMICRMW_OR:
791   case TargetOpcode::G_ATOMICRMW_XOR: {
792     auto [Ret, RetLLT, Mem, MemLLT, Val, ValLLT] = MI.getFirst3RegLLTs();
793     RetRegs.push_back(Ret);
794     RetTy = IntegerType::get(Ctx, RetLLT.getSizeInBits());
795     if (Opc == TargetOpcode::G_ATOMICRMW_AND)
796       Val =
797           MIRBuilder.buildXor(ValLLT, MIRBuilder.buildConstant(ValLLT, -1), Val)
798               .getReg(0);
799     else if (Opc == TargetOpcode::G_ATOMICRMW_SUB)
800       Val =
801           MIRBuilder.buildSub(ValLLT, MIRBuilder.buildConstant(ValLLT, 0), Val)
802               .getReg(0);
803     Args.push_back({Val, IntegerType::get(Ctx, ValLLT.getSizeInBits()), 0});
804     Args.push_back({Mem, PointerType::get(Ctx, MemLLT.getAddressSpace()), 0});
805     break;
806   }
807   default:
808     llvm_unreachable("unsupported opcode");
809   }
810 
811   auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering();
812   auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering();
813   RTLIB::Libcall RTLibcall = getOutlineAtomicLibcall(MI);
814   const char *Name = TLI.getLibcallName(RTLibcall);
815 
816   // Unsupported libcall on the target.
817   if (!Name) {
818     LLVM_DEBUG(dbgs() << ".. .. Could not find libcall name for "
819                       << MIRBuilder.getTII().getName(Opc) << "\n");
820     return LegalizerHelper::UnableToLegalize;
821   }
822 
823   CallLowering::CallLoweringInfo Info;
824   Info.CallConv = TLI.getLibcallCallingConv(RTLibcall);
825   Info.Callee = MachineOperand::CreateES(Name);
826   Info.OrigRet = CallLowering::ArgInfo(RetRegs, RetTy, 0);
827 
828   std::copy(Args.begin(), Args.end(), std::back_inserter(Info.OrigArgs));
829   if (!CLI.lowerCall(MIRBuilder, Info))
830     return LegalizerHelper::UnableToLegalize;
831 
832   return LegalizerHelper::Legalized;
833 }
834 
835 static RTLIB::Libcall getConvRTLibDesc(unsigned Opcode, Type *ToType,
836                                        Type *FromType) {
837   auto ToMVT = MVT::getVT(ToType);
838   auto FromMVT = MVT::getVT(FromType);
839 
840   switch (Opcode) {
841   case TargetOpcode::G_FPEXT:
842     return RTLIB::getFPEXT(FromMVT, ToMVT);
843   case TargetOpcode::G_FPTRUNC:
844     return RTLIB::getFPROUND(FromMVT, ToMVT);
845   case TargetOpcode::G_FPTOSI:
846     return RTLIB::getFPTOSINT(FromMVT, ToMVT);
847   case TargetOpcode::G_FPTOUI:
848     return RTLIB::getFPTOUINT(FromMVT, ToMVT);
849   case TargetOpcode::G_SITOFP:
850     return RTLIB::getSINTTOFP(FromMVT, ToMVT);
851   case TargetOpcode::G_UITOFP:
852     return RTLIB::getUINTTOFP(FromMVT, ToMVT);
853   }
854   llvm_unreachable("Unsupported libcall function");
855 }
856 
857 static LegalizerHelper::LegalizeResult
858 conversionLibcall(MachineInstr &MI, MachineIRBuilder &MIRBuilder, Type *ToType,
859                   Type *FromType, LostDebugLocObserver &LocObserver) {
860   RTLIB::Libcall Libcall = getConvRTLibDesc(MI.getOpcode(), ToType, FromType);
861   return createLibcall(
862       MIRBuilder, Libcall, {MI.getOperand(0).getReg(), ToType, 0},
863       {{MI.getOperand(1).getReg(), FromType, 0}}, LocObserver, &MI);
864 }
865 
866 static RTLIB::Libcall
867 getStateLibraryFunctionFor(MachineInstr &MI, const TargetLowering &TLI) {
868   RTLIB::Libcall RTLibcall;
869   switch (MI.getOpcode()) {
870   case TargetOpcode::G_GET_FPENV:
871     RTLibcall = RTLIB::FEGETENV;
872     break;
873   case TargetOpcode::G_SET_FPENV:
874   case TargetOpcode::G_RESET_FPENV:
875     RTLibcall = RTLIB::FESETENV;
876     break;
877   case TargetOpcode::G_GET_FPMODE:
878     RTLibcall = RTLIB::FEGETMODE;
879     break;
880   case TargetOpcode::G_SET_FPMODE:
881   case TargetOpcode::G_RESET_FPMODE:
882     RTLibcall = RTLIB::FESETMODE;
883     break;
884   default:
885     llvm_unreachable("Unexpected opcode");
886   }
887   return RTLibcall;
888 }
889 
890 // Some library functions that read FP state (fegetmode, fegetenv) write the
891 // state into a region in memory. IR intrinsics that do the same operations
892 // (get_fpmode, get_fpenv) return the state as integer value. To implement these
893 // intrinsics via the library functions, we need to use temporary variable,
894 // for example:
895 //
896 //     %0:_(s32) = G_GET_FPMODE
897 //
898 // is transformed to:
899 //
900 //     %1:_(p0) = G_FRAME_INDEX %stack.0
901 //     BL &fegetmode
902 //     %0:_(s32) = G_LOAD % 1
903 //
904 LegalizerHelper::LegalizeResult
905 LegalizerHelper::createGetStateLibcall(MachineIRBuilder &MIRBuilder,
906                                        MachineInstr &MI,
907                                        LostDebugLocObserver &LocObserver) {
908   const DataLayout &DL = MIRBuilder.getDataLayout();
909   auto &MF = MIRBuilder.getMF();
910   auto &MRI = *MIRBuilder.getMRI();
911   auto &Ctx = MF.getFunction().getContext();
912 
913   // Create temporary, where library function will put the read state.
914   Register Dst = MI.getOperand(0).getReg();
915   LLT StateTy = MRI.getType(Dst);
916   TypeSize StateSize = StateTy.getSizeInBytes();
917   Align TempAlign = getStackTemporaryAlignment(StateTy);
918   MachinePointerInfo TempPtrInfo;
919   auto Temp = createStackTemporary(StateSize, TempAlign, TempPtrInfo);
920 
921   // Create a call to library function, with the temporary as an argument.
922   unsigned TempAddrSpace = DL.getAllocaAddrSpace();
923   Type *StatePtrTy = PointerType::get(Ctx, TempAddrSpace);
924   RTLIB::Libcall RTLibcall = getStateLibraryFunctionFor(MI, TLI);
925   auto Res =
926       createLibcall(MIRBuilder, RTLibcall,
927                     CallLowering::ArgInfo({0}, Type::getVoidTy(Ctx), 0),
928                     CallLowering::ArgInfo({Temp.getReg(0), StatePtrTy, 0}),
929                     LocObserver, nullptr);
930   if (Res != LegalizerHelper::Legalized)
931     return Res;
932 
933   // Create a load from the temporary.
934   MachineMemOperand *MMO = MF.getMachineMemOperand(
935       TempPtrInfo, MachineMemOperand::MOLoad, StateTy, TempAlign);
936   MIRBuilder.buildLoadInstr(TargetOpcode::G_LOAD, Dst, Temp, *MMO);
937 
938   return LegalizerHelper::Legalized;
939 }
940 
941 // Similar to `createGetStateLibcall` the function calls a library function
942 // using transient space in stack. In this case the library function reads
943 // content of memory region.
944 LegalizerHelper::LegalizeResult
945 LegalizerHelper::createSetStateLibcall(MachineIRBuilder &MIRBuilder,
946                                        MachineInstr &MI,
947                                        LostDebugLocObserver &LocObserver) {
948   const DataLayout &DL = MIRBuilder.getDataLayout();
949   auto &MF = MIRBuilder.getMF();
950   auto &MRI = *MIRBuilder.getMRI();
951   auto &Ctx = MF.getFunction().getContext();
952 
953   // Create temporary, where library function will get the new state.
954   Register Src = MI.getOperand(0).getReg();
955   LLT StateTy = MRI.getType(Src);
956   TypeSize StateSize = StateTy.getSizeInBytes();
957   Align TempAlign = getStackTemporaryAlignment(StateTy);
958   MachinePointerInfo TempPtrInfo;
959   auto Temp = createStackTemporary(StateSize, TempAlign, TempPtrInfo);
960 
961   // Put the new state into the temporary.
962   MachineMemOperand *MMO = MF.getMachineMemOperand(
963       TempPtrInfo, MachineMemOperand::MOStore, StateTy, TempAlign);
964   MIRBuilder.buildStore(Src, Temp, *MMO);
965 
966   // Create a call to library function, with the temporary as an argument.
967   unsigned TempAddrSpace = DL.getAllocaAddrSpace();
968   Type *StatePtrTy = PointerType::get(Ctx, TempAddrSpace);
969   RTLIB::Libcall RTLibcall = getStateLibraryFunctionFor(MI, TLI);
970   return createLibcall(MIRBuilder, RTLibcall,
971                        CallLowering::ArgInfo({0}, Type::getVoidTy(Ctx), 0),
972                        CallLowering::ArgInfo({Temp.getReg(0), StatePtrTy, 0}),
973                        LocObserver, nullptr);
974 }
975 
976 // The function is used to legalize operations that set default environment
977 // state. In C library a call like `fesetmode(FE_DFL_MODE)` is used for that.
978 // On most targets supported in glibc FE_DFL_MODE is defined as
979 // `((const femode_t *) -1)`. Such assumption is used here. If for some target
980 // it is not true, the target must provide custom lowering.
981 LegalizerHelper::LegalizeResult
982 LegalizerHelper::createResetStateLibcall(MachineIRBuilder &MIRBuilder,
983                                          MachineInstr &MI,
984                                          LostDebugLocObserver &LocObserver) {
985   const DataLayout &DL = MIRBuilder.getDataLayout();
986   auto &MF = MIRBuilder.getMF();
987   auto &Ctx = MF.getFunction().getContext();
988 
989   // Create an argument for the library function.
990   unsigned AddrSpace = DL.getDefaultGlobalsAddressSpace();
991   Type *StatePtrTy = PointerType::get(Ctx, AddrSpace);
992   unsigned PtrSize = DL.getPointerSizeInBits(AddrSpace);
993   LLT MemTy = LLT::pointer(AddrSpace, PtrSize);
994   auto DefValue = MIRBuilder.buildConstant(LLT::scalar(PtrSize), -1LL);
995   DstOp Dest(MRI.createGenericVirtualRegister(MemTy));
996   MIRBuilder.buildIntToPtr(Dest, DefValue);
997 
998   RTLIB::Libcall RTLibcall = getStateLibraryFunctionFor(MI, TLI);
999   return createLibcall(MIRBuilder, RTLibcall,
1000                        CallLowering::ArgInfo({0}, Type::getVoidTy(Ctx), 0),
1001                        CallLowering::ArgInfo({Dest.getReg(), StatePtrTy, 0}),
1002                        LocObserver, &MI);
1003 }
1004 
1005 LegalizerHelper::LegalizeResult
1006 LegalizerHelper::libcall(MachineInstr &MI, LostDebugLocObserver &LocObserver) {
1007   auto &Ctx = MIRBuilder.getMF().getFunction().getContext();
1008 
1009   switch (MI.getOpcode()) {
1010   default:
1011     return UnableToLegalize;
1012   case TargetOpcode::G_MUL:
1013   case TargetOpcode::G_SDIV:
1014   case TargetOpcode::G_UDIV:
1015   case TargetOpcode::G_SREM:
1016   case TargetOpcode::G_UREM:
1017   case TargetOpcode::G_CTLZ_ZERO_UNDEF: {
1018     LLT LLTy = MRI.getType(MI.getOperand(0).getReg());
1019     unsigned Size = LLTy.getSizeInBits();
1020     Type *HLTy = IntegerType::get(Ctx, Size);
1021     auto Status = simpleLibcall(MI, MIRBuilder, Size, HLTy, LocObserver);
1022     if (Status != Legalized)
1023       return Status;
1024     break;
1025   }
1026   case TargetOpcode::G_FADD:
1027   case TargetOpcode::G_FSUB:
1028   case TargetOpcode::G_FMUL:
1029   case TargetOpcode::G_FDIV:
1030   case TargetOpcode::G_FMA:
1031   case TargetOpcode::G_FPOW:
1032   case TargetOpcode::G_FREM:
1033   case TargetOpcode::G_FCOS:
1034   case TargetOpcode::G_FSIN:
1035   case TargetOpcode::G_FLOG10:
1036   case TargetOpcode::G_FLOG:
1037   case TargetOpcode::G_FLOG2:
1038   case TargetOpcode::G_FLDEXP:
1039   case TargetOpcode::G_FEXP:
1040   case TargetOpcode::G_FEXP2:
1041   case TargetOpcode::G_FEXP10:
1042   case TargetOpcode::G_FCEIL:
1043   case TargetOpcode::G_FFLOOR:
1044   case TargetOpcode::G_FMINNUM:
1045   case TargetOpcode::G_FMAXNUM:
1046   case TargetOpcode::G_FSQRT:
1047   case TargetOpcode::G_FRINT:
1048   case TargetOpcode::G_FNEARBYINT:
1049   case TargetOpcode::G_INTRINSIC_ROUNDEVEN: {
1050     LLT LLTy = MRI.getType(MI.getOperand(0).getReg());
1051     unsigned Size = LLTy.getSizeInBits();
1052     Type *HLTy = getFloatTypeForLLT(Ctx, LLTy);
1053     if (!HLTy || (Size != 32 && Size != 64 && Size != 80 && Size != 128)) {
1054       LLVM_DEBUG(dbgs() << "No libcall available for type " << LLTy << ".\n");
1055       return UnableToLegalize;
1056     }
1057     auto Status = simpleLibcall(MI, MIRBuilder, Size, HLTy, LocObserver);
1058     if (Status != Legalized)
1059       return Status;
1060     break;
1061   }
1062   case TargetOpcode::G_FPOWI: {
1063     LLT LLTy = MRI.getType(MI.getOperand(0).getReg());
1064     unsigned Size = LLTy.getSizeInBits();
1065     Type *HLTy = getFloatTypeForLLT(Ctx, LLTy);
1066     Type *ITy = IntegerType::get(
1067         Ctx, MRI.getType(MI.getOperand(2).getReg()).getSizeInBits());
1068     if (!HLTy || (Size != 32 && Size != 64 && Size != 80 && Size != 128)) {
1069       LLVM_DEBUG(dbgs() << "No libcall available for type " << LLTy << ".\n");
1070       return UnableToLegalize;
1071     }
1072     auto Libcall = getRTLibDesc(MI.getOpcode(), Size);
1073     std::initializer_list<CallLowering::ArgInfo> Args = {
1074         {MI.getOperand(1).getReg(), HLTy, 0},
1075         {MI.getOperand(2).getReg(), ITy, 1}};
1076     LegalizeResult Status =
1077         createLibcall(MIRBuilder, Libcall, {MI.getOperand(0).getReg(), HLTy, 0},
1078                       Args, LocObserver, &MI);
1079     if (Status != Legalized)
1080       return Status;
1081     break;
1082   }
1083   case TargetOpcode::G_FPEXT:
1084   case TargetOpcode::G_FPTRUNC: {
1085     Type *FromTy = getFloatTypeForLLT(Ctx,  MRI.getType(MI.getOperand(1).getReg()));
1086     Type *ToTy = getFloatTypeForLLT(Ctx, MRI.getType(MI.getOperand(0).getReg()));
1087     if (!FromTy || !ToTy)
1088       return UnableToLegalize;
1089     LegalizeResult Status =
1090         conversionLibcall(MI, MIRBuilder, ToTy, FromTy, LocObserver);
1091     if (Status != Legalized)
1092       return Status;
1093     break;
1094   }
1095   case TargetOpcode::G_FPTOSI:
1096   case TargetOpcode::G_FPTOUI: {
1097     // FIXME: Support other types
1098     unsigned FromSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
1099     unsigned ToSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
1100     if ((ToSize != 32 && ToSize != 64) || (FromSize != 32 && FromSize != 64))
1101       return UnableToLegalize;
1102     LegalizeResult Status = conversionLibcall(
1103         MI, MIRBuilder,
1104         ToSize == 32 ? Type::getInt32Ty(Ctx) : Type::getInt64Ty(Ctx),
1105         FromSize == 64 ? Type::getDoubleTy(Ctx) : Type::getFloatTy(Ctx),
1106         LocObserver);
1107     if (Status != Legalized)
1108       return Status;
1109     break;
1110   }
1111   case TargetOpcode::G_SITOFP:
1112   case TargetOpcode::G_UITOFP: {
1113     // FIXME: Support other types
1114     unsigned FromSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
1115     unsigned ToSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
1116     if ((FromSize != 32 && FromSize != 64) || (ToSize != 32 && ToSize != 64))
1117       return UnableToLegalize;
1118     LegalizeResult Status = conversionLibcall(
1119         MI, MIRBuilder,
1120         ToSize == 64 ? Type::getDoubleTy(Ctx) : Type::getFloatTy(Ctx),
1121         FromSize == 32 ? Type::getInt32Ty(Ctx) : Type::getInt64Ty(Ctx),
1122         LocObserver);
1123     if (Status != Legalized)
1124       return Status;
1125     break;
1126   }
1127   case TargetOpcode::G_ATOMICRMW_XCHG:
1128   case TargetOpcode::G_ATOMICRMW_ADD:
1129   case TargetOpcode::G_ATOMICRMW_SUB:
1130   case TargetOpcode::G_ATOMICRMW_AND:
1131   case TargetOpcode::G_ATOMICRMW_OR:
1132   case TargetOpcode::G_ATOMICRMW_XOR:
1133   case TargetOpcode::G_ATOMIC_CMPXCHG:
1134   case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: {
1135     auto Status = createAtomicLibcall(MIRBuilder, MI);
1136     if (Status != Legalized)
1137       return Status;
1138     break;
1139   }
1140   case TargetOpcode::G_BZERO:
1141   case TargetOpcode::G_MEMCPY:
1142   case TargetOpcode::G_MEMMOVE:
1143   case TargetOpcode::G_MEMSET: {
1144     LegalizeResult Result =
1145         createMemLibcall(MIRBuilder, *MIRBuilder.getMRI(), MI, LocObserver);
1146     if (Result != Legalized)
1147       return Result;
1148     MI.eraseFromParent();
1149     return Result;
1150   }
1151   case TargetOpcode::G_GET_FPENV:
1152   case TargetOpcode::G_GET_FPMODE: {
1153     LegalizeResult Result = createGetStateLibcall(MIRBuilder, MI, LocObserver);
1154     if (Result != Legalized)
1155       return Result;
1156     break;
1157   }
1158   case TargetOpcode::G_SET_FPENV:
1159   case TargetOpcode::G_SET_FPMODE: {
1160     LegalizeResult Result = createSetStateLibcall(MIRBuilder, MI, LocObserver);
1161     if (Result != Legalized)
1162       return Result;
1163     break;
1164   }
1165   case TargetOpcode::G_RESET_FPENV:
1166   case TargetOpcode::G_RESET_FPMODE: {
1167     LegalizeResult Result =
1168         createResetStateLibcall(MIRBuilder, MI, LocObserver);
1169     if (Result != Legalized)
1170       return Result;
1171     break;
1172   }
1173   }
1174 
1175   MI.eraseFromParent();
1176   return Legalized;
1177 }
1178 
1179 LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
1180                                                               unsigned TypeIdx,
1181                                                               LLT NarrowTy) {
1182   uint64_t SizeOp0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
1183   uint64_t NarrowSize = NarrowTy.getSizeInBits();
1184 
1185   switch (MI.getOpcode()) {
1186   default:
1187     return UnableToLegalize;
1188   case TargetOpcode::G_IMPLICIT_DEF: {
1189     Register DstReg = MI.getOperand(0).getReg();
1190     LLT DstTy = MRI.getType(DstReg);
1191 
1192     // If SizeOp0 is not an exact multiple of NarrowSize, emit
1193     // G_ANYEXT(G_IMPLICIT_DEF). Cast result to vector if needed.
1194     // FIXME: Although this would also be legal for the general case, it causes
1195     //  a lot of regressions in the emitted code (superfluous COPYs, artifact
1196     //  combines not being hit). This seems to be a problem related to the
1197     //  artifact combiner.
1198     if (SizeOp0 % NarrowSize != 0) {
1199       LLT ImplicitTy = NarrowTy;
1200       if (DstTy.isVector())
1201         ImplicitTy = LLT::vector(DstTy.getElementCount(), ImplicitTy);
1202 
1203       Register ImplicitReg = MIRBuilder.buildUndef(ImplicitTy).getReg(0);
1204       MIRBuilder.buildAnyExt(DstReg, ImplicitReg);
1205 
1206       MI.eraseFromParent();
1207       return Legalized;
1208     }
1209 
1210     int NumParts = SizeOp0 / NarrowSize;
1211 
1212     SmallVector<Register, 2> DstRegs;
1213     for (int i = 0; i < NumParts; ++i)
1214       DstRegs.push_back(MIRBuilder.buildUndef(NarrowTy).getReg(0));
1215 
1216     if (DstTy.isVector())
1217       MIRBuilder.buildBuildVector(DstReg, DstRegs);
1218     else
1219       MIRBuilder.buildMergeLikeInstr(DstReg, DstRegs);
1220     MI.eraseFromParent();
1221     return Legalized;
1222   }
1223   case TargetOpcode::G_CONSTANT: {
1224     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
1225     const APInt &Val = MI.getOperand(1).getCImm()->getValue();
1226     unsigned TotalSize = Ty.getSizeInBits();
1227     unsigned NarrowSize = NarrowTy.getSizeInBits();
1228     int NumParts = TotalSize / NarrowSize;
1229 
1230     SmallVector<Register, 4> PartRegs;
1231     for (int I = 0; I != NumParts; ++I) {
1232       unsigned Offset = I * NarrowSize;
1233       auto K = MIRBuilder.buildConstant(NarrowTy,
1234                                         Val.lshr(Offset).trunc(NarrowSize));
1235       PartRegs.push_back(K.getReg(0));
1236     }
1237 
1238     LLT LeftoverTy;
1239     unsigned LeftoverBits = TotalSize - NumParts * NarrowSize;
1240     SmallVector<Register, 1> LeftoverRegs;
1241     if (LeftoverBits != 0) {
1242       LeftoverTy = LLT::scalar(LeftoverBits);
1243       auto K = MIRBuilder.buildConstant(
1244         LeftoverTy,
1245         Val.lshr(NumParts * NarrowSize).trunc(LeftoverBits));
1246       LeftoverRegs.push_back(K.getReg(0));
1247     }
1248 
1249     insertParts(MI.getOperand(0).getReg(),
1250                 Ty, NarrowTy, PartRegs, LeftoverTy, LeftoverRegs);
1251 
1252     MI.eraseFromParent();
1253     return Legalized;
1254   }
1255   case TargetOpcode::G_SEXT:
1256   case TargetOpcode::G_ZEXT:
1257   case TargetOpcode::G_ANYEXT:
1258     return narrowScalarExt(MI, TypeIdx, NarrowTy);
1259   case TargetOpcode::G_TRUNC: {
1260     if (TypeIdx != 1)
1261       return UnableToLegalize;
1262 
1263     uint64_t SizeOp1 = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
1264     if (NarrowTy.getSizeInBits() * 2 != SizeOp1) {
1265       LLVM_DEBUG(dbgs() << "Can't narrow trunc to type " << NarrowTy << "\n");
1266       return UnableToLegalize;
1267     }
1268 
1269     auto Unmerge = MIRBuilder.buildUnmerge(NarrowTy, MI.getOperand(1));
1270     MIRBuilder.buildCopy(MI.getOperand(0), Unmerge.getReg(0));
1271     MI.eraseFromParent();
1272     return Legalized;
1273   }
1274 
1275   case TargetOpcode::G_FREEZE: {
1276     if (TypeIdx != 0)
1277       return UnableToLegalize;
1278 
1279     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
1280     // Should widen scalar first
1281     if (Ty.getSizeInBits() % NarrowTy.getSizeInBits() != 0)
1282       return UnableToLegalize;
1283 
1284     auto Unmerge = MIRBuilder.buildUnmerge(NarrowTy, MI.getOperand(1).getReg());
1285     SmallVector<Register, 8> Parts;
1286     for (unsigned i = 0; i < Unmerge->getNumDefs(); ++i) {
1287       Parts.push_back(
1288           MIRBuilder.buildFreeze(NarrowTy, Unmerge.getReg(i)).getReg(0));
1289     }
1290 
1291     MIRBuilder.buildMergeLikeInstr(MI.getOperand(0).getReg(), Parts);
1292     MI.eraseFromParent();
1293     return Legalized;
1294   }
1295   case TargetOpcode::G_ADD:
1296   case TargetOpcode::G_SUB:
1297   case TargetOpcode::G_SADDO:
1298   case TargetOpcode::G_SSUBO:
1299   case TargetOpcode::G_SADDE:
1300   case TargetOpcode::G_SSUBE:
1301   case TargetOpcode::G_UADDO:
1302   case TargetOpcode::G_USUBO:
1303   case TargetOpcode::G_UADDE:
1304   case TargetOpcode::G_USUBE:
1305     return narrowScalarAddSub(MI, TypeIdx, NarrowTy);
1306   case TargetOpcode::G_MUL:
1307   case TargetOpcode::G_UMULH:
1308     return narrowScalarMul(MI, NarrowTy);
1309   case TargetOpcode::G_EXTRACT:
1310     return narrowScalarExtract(MI, TypeIdx, NarrowTy);
1311   case TargetOpcode::G_INSERT:
1312     return narrowScalarInsert(MI, TypeIdx, NarrowTy);
1313   case TargetOpcode::G_LOAD: {
1314     auto &LoadMI = cast<GLoad>(MI);
1315     Register DstReg = LoadMI.getDstReg();
1316     LLT DstTy = MRI.getType(DstReg);
1317     if (DstTy.isVector())
1318       return UnableToLegalize;
1319 
1320     if (8 * LoadMI.getMemSize() != DstTy.getSizeInBits()) {
1321       Register TmpReg = MRI.createGenericVirtualRegister(NarrowTy);
1322       MIRBuilder.buildLoad(TmpReg, LoadMI.getPointerReg(), LoadMI.getMMO());
1323       MIRBuilder.buildAnyExt(DstReg, TmpReg);
1324       LoadMI.eraseFromParent();
1325       return Legalized;
1326     }
1327 
1328     return reduceLoadStoreWidth(LoadMI, TypeIdx, NarrowTy);
1329   }
1330   case TargetOpcode::G_ZEXTLOAD:
1331   case TargetOpcode::G_SEXTLOAD: {
1332     auto &LoadMI = cast<GExtLoad>(MI);
1333     Register DstReg = LoadMI.getDstReg();
1334     Register PtrReg = LoadMI.getPointerReg();
1335 
1336     Register TmpReg = MRI.createGenericVirtualRegister(NarrowTy);
1337     auto &MMO = LoadMI.getMMO();
1338     unsigned MemSize = MMO.getSizeInBits();
1339 
1340     if (MemSize == NarrowSize) {
1341       MIRBuilder.buildLoad(TmpReg, PtrReg, MMO);
1342     } else if (MemSize < NarrowSize) {
1343       MIRBuilder.buildLoadInstr(LoadMI.getOpcode(), TmpReg, PtrReg, MMO);
1344     } else if (MemSize > NarrowSize) {
1345       // FIXME: Need to split the load.
1346       return UnableToLegalize;
1347     }
1348 
1349     if (isa<GZExtLoad>(LoadMI))
1350       MIRBuilder.buildZExt(DstReg, TmpReg);
1351     else
1352       MIRBuilder.buildSExt(DstReg, TmpReg);
1353 
1354     LoadMI.eraseFromParent();
1355     return Legalized;
1356   }
1357   case TargetOpcode::G_STORE: {
1358     auto &StoreMI = cast<GStore>(MI);
1359 
1360     Register SrcReg = StoreMI.getValueReg();
1361     LLT SrcTy = MRI.getType(SrcReg);
1362     if (SrcTy.isVector())
1363       return UnableToLegalize;
1364 
1365     int NumParts = SizeOp0 / NarrowSize;
1366     unsigned HandledSize = NumParts * NarrowTy.getSizeInBits();
1367     unsigned LeftoverBits = SrcTy.getSizeInBits() - HandledSize;
1368     if (SrcTy.isVector() && LeftoverBits != 0)
1369       return UnableToLegalize;
1370 
1371     if (8 * StoreMI.getMemSize() != SrcTy.getSizeInBits()) {
1372       Register TmpReg = MRI.createGenericVirtualRegister(NarrowTy);
1373       MIRBuilder.buildTrunc(TmpReg, SrcReg);
1374       MIRBuilder.buildStore(TmpReg, StoreMI.getPointerReg(), StoreMI.getMMO());
1375       StoreMI.eraseFromParent();
1376       return Legalized;
1377     }
1378 
1379     return reduceLoadStoreWidth(StoreMI, 0, NarrowTy);
1380   }
1381   case TargetOpcode::G_SELECT:
1382     return narrowScalarSelect(MI, TypeIdx, NarrowTy);
1383   case TargetOpcode::G_AND:
1384   case TargetOpcode::G_OR:
1385   case TargetOpcode::G_XOR: {
1386     // Legalize bitwise operation:
1387     // A = BinOp<Ty> B, C
1388     // into:
1389     // B1, ..., BN = G_UNMERGE_VALUES B
1390     // C1, ..., CN = G_UNMERGE_VALUES C
1391     // A1 = BinOp<Ty/N> B1, C2
1392     // ...
1393     // AN = BinOp<Ty/N> BN, CN
1394     // A = G_MERGE_VALUES A1, ..., AN
1395     return narrowScalarBasic(MI, TypeIdx, NarrowTy);
1396   }
1397   case TargetOpcode::G_SHL:
1398   case TargetOpcode::G_LSHR:
1399   case TargetOpcode::G_ASHR:
1400     return narrowScalarShift(MI, TypeIdx, NarrowTy);
1401   case TargetOpcode::G_CTLZ:
1402   case TargetOpcode::G_CTLZ_ZERO_UNDEF:
1403   case TargetOpcode::G_CTTZ:
1404   case TargetOpcode::G_CTTZ_ZERO_UNDEF:
1405   case TargetOpcode::G_CTPOP:
1406     if (TypeIdx == 1)
1407       switch (MI.getOpcode()) {
1408       case TargetOpcode::G_CTLZ:
1409       case TargetOpcode::G_CTLZ_ZERO_UNDEF:
1410         return narrowScalarCTLZ(MI, TypeIdx, NarrowTy);
1411       case TargetOpcode::G_CTTZ:
1412       case TargetOpcode::G_CTTZ_ZERO_UNDEF:
1413         return narrowScalarCTTZ(MI, TypeIdx, NarrowTy);
1414       case TargetOpcode::G_CTPOP:
1415         return narrowScalarCTPOP(MI, TypeIdx, NarrowTy);
1416       default:
1417         return UnableToLegalize;
1418       }
1419 
1420     Observer.changingInstr(MI);
1421     narrowScalarDst(MI, NarrowTy, 0, TargetOpcode::G_ZEXT);
1422     Observer.changedInstr(MI);
1423     return Legalized;
1424   case TargetOpcode::G_INTTOPTR:
1425     if (TypeIdx != 1)
1426       return UnableToLegalize;
1427 
1428     Observer.changingInstr(MI);
1429     narrowScalarSrc(MI, NarrowTy, 1);
1430     Observer.changedInstr(MI);
1431     return Legalized;
1432   case TargetOpcode::G_PTRTOINT:
1433     if (TypeIdx != 0)
1434       return UnableToLegalize;
1435 
1436     Observer.changingInstr(MI);
1437     narrowScalarDst(MI, NarrowTy, 0, TargetOpcode::G_ZEXT);
1438     Observer.changedInstr(MI);
1439     return Legalized;
1440   case TargetOpcode::G_PHI: {
1441     // FIXME: add support for when SizeOp0 isn't an exact multiple of
1442     // NarrowSize.
1443     if (SizeOp0 % NarrowSize != 0)
1444       return UnableToLegalize;
1445 
1446     unsigned NumParts = SizeOp0 / NarrowSize;
1447     SmallVector<Register, 2> DstRegs(NumParts);
1448     SmallVector<SmallVector<Register, 2>, 2> SrcRegs(MI.getNumOperands() / 2);
1449     Observer.changingInstr(MI);
1450     for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
1451       MachineBasicBlock &OpMBB = *MI.getOperand(i + 1).getMBB();
1452       MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminatorForward());
1453       extractParts(MI.getOperand(i).getReg(), NarrowTy, NumParts,
1454                    SrcRegs[i / 2], MIRBuilder, MRI);
1455     }
1456     MachineBasicBlock &MBB = *MI.getParent();
1457     MIRBuilder.setInsertPt(MBB, MI);
1458     for (unsigned i = 0; i < NumParts; ++i) {
1459       DstRegs[i] = MRI.createGenericVirtualRegister(NarrowTy);
1460       MachineInstrBuilder MIB =
1461           MIRBuilder.buildInstr(TargetOpcode::G_PHI).addDef(DstRegs[i]);
1462       for (unsigned j = 1; j < MI.getNumOperands(); j += 2)
1463         MIB.addUse(SrcRegs[j / 2][i]).add(MI.getOperand(j + 1));
1464     }
1465     MIRBuilder.setInsertPt(MBB, MBB.getFirstNonPHI());
1466     MIRBuilder.buildMergeLikeInstr(MI.getOperand(0), DstRegs);
1467     Observer.changedInstr(MI);
1468     MI.eraseFromParent();
1469     return Legalized;
1470   }
1471   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
1472   case TargetOpcode::G_INSERT_VECTOR_ELT: {
1473     if (TypeIdx != 2)
1474       return UnableToLegalize;
1475 
1476     int OpIdx = MI.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT ? 2 : 3;
1477     Observer.changingInstr(MI);
1478     narrowScalarSrc(MI, NarrowTy, OpIdx);
1479     Observer.changedInstr(MI);
1480     return Legalized;
1481   }
1482   case TargetOpcode::G_ICMP: {
1483     Register LHS = MI.getOperand(2).getReg();
1484     LLT SrcTy = MRI.getType(LHS);
1485     uint64_t SrcSize = SrcTy.getSizeInBits();
1486     CmpInst::Predicate Pred =
1487         static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
1488 
1489     // TODO: Handle the non-equality case for weird sizes.
1490     if (NarrowSize * 2 != SrcSize && !ICmpInst::isEquality(Pred))
1491       return UnableToLegalize;
1492 
1493     LLT LeftoverTy; // Example: s88 -> s64 (NarrowTy) + s24 (leftover)
1494     SmallVector<Register, 4> LHSPartRegs, LHSLeftoverRegs;
1495     if (!extractParts(LHS, SrcTy, NarrowTy, LeftoverTy, LHSPartRegs,
1496                       LHSLeftoverRegs, MIRBuilder, MRI))
1497       return UnableToLegalize;
1498 
1499     LLT Unused; // Matches LeftoverTy; G_ICMP LHS and RHS are the same type.
1500     SmallVector<Register, 4> RHSPartRegs, RHSLeftoverRegs;
1501     if (!extractParts(MI.getOperand(3).getReg(), SrcTy, NarrowTy, Unused,
1502                       RHSPartRegs, RHSLeftoverRegs, MIRBuilder, MRI))
1503       return UnableToLegalize;
1504 
1505     // We now have the LHS and RHS of the compare split into narrow-type
1506     // registers, plus potentially some leftover type.
1507     Register Dst = MI.getOperand(0).getReg();
1508     LLT ResTy = MRI.getType(Dst);
1509     if (ICmpInst::isEquality(Pred)) {
1510       // For each part on the LHS and RHS, keep track of the result of XOR-ing
1511       // them together. For each equal part, the result should be all 0s. For
1512       // each non-equal part, we'll get at least one 1.
1513       auto Zero = MIRBuilder.buildConstant(NarrowTy, 0);
1514       SmallVector<Register, 4> Xors;
1515       for (auto LHSAndRHS : zip(LHSPartRegs, RHSPartRegs)) {
1516         auto LHS = std::get<0>(LHSAndRHS);
1517         auto RHS = std::get<1>(LHSAndRHS);
1518         auto Xor = MIRBuilder.buildXor(NarrowTy, LHS, RHS).getReg(0);
1519         Xors.push_back(Xor);
1520       }
1521 
1522       // Build a G_XOR for each leftover register. Each G_XOR must be widened
1523       // to the desired narrow type so that we can OR them together later.
1524       SmallVector<Register, 4> WidenedXors;
1525       for (auto LHSAndRHS : zip(LHSLeftoverRegs, RHSLeftoverRegs)) {
1526         auto LHS = std::get<0>(LHSAndRHS);
1527         auto RHS = std::get<1>(LHSAndRHS);
1528         auto Xor = MIRBuilder.buildXor(LeftoverTy, LHS, RHS).getReg(0);
1529         LLT GCDTy = extractGCDType(WidenedXors, NarrowTy, LeftoverTy, Xor);
1530         buildLCMMergePieces(LeftoverTy, NarrowTy, GCDTy, WidenedXors,
1531                             /* PadStrategy = */ TargetOpcode::G_ZEXT);
1532         Xors.insert(Xors.end(), WidenedXors.begin(), WidenedXors.end());
1533       }
1534 
1535       // Now, for each part we broke up, we know if they are equal/not equal
1536       // based off the G_XOR. We can OR these all together and compare against
1537       // 0 to get the result.
1538       assert(Xors.size() >= 2 && "Should have gotten at least two Xors?");
1539       auto Or = MIRBuilder.buildOr(NarrowTy, Xors[0], Xors[1]);
1540       for (unsigned I = 2, E = Xors.size(); I < E; ++I)
1541         Or = MIRBuilder.buildOr(NarrowTy, Or, Xors[I]);
1542       MIRBuilder.buildICmp(Pred, Dst, Or, Zero);
1543     } else {
1544       // TODO: Handle non-power-of-two types.
1545       assert(LHSPartRegs.size() == 2 && "Expected exactly 2 LHS part regs?");
1546       assert(RHSPartRegs.size() == 2 && "Expected exactly 2 RHS part regs?");
1547       Register LHSL = LHSPartRegs[0];
1548       Register LHSH = LHSPartRegs[1];
1549       Register RHSL = RHSPartRegs[0];
1550       Register RHSH = RHSPartRegs[1];
1551       MachineInstrBuilder CmpH = MIRBuilder.buildICmp(Pred, ResTy, LHSH, RHSH);
1552       MachineInstrBuilder CmpHEQ =
1553           MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, ResTy, LHSH, RHSH);
1554       MachineInstrBuilder CmpLU = MIRBuilder.buildICmp(
1555           ICmpInst::getUnsignedPredicate(Pred), ResTy, LHSL, RHSL);
1556       MIRBuilder.buildSelect(Dst, CmpHEQ, CmpLU, CmpH);
1557     }
1558     MI.eraseFromParent();
1559     return Legalized;
1560   }
1561   case TargetOpcode::G_SEXT_INREG: {
1562     if (TypeIdx != 0)
1563       return UnableToLegalize;
1564 
1565     int64_t SizeInBits = MI.getOperand(2).getImm();
1566 
1567     // So long as the new type has more bits than the bits we're extending we
1568     // don't need to break it apart.
1569     if (NarrowTy.getScalarSizeInBits() > SizeInBits) {
1570       Observer.changingInstr(MI);
1571       // We don't lose any non-extension bits by truncating the src and
1572       // sign-extending the dst.
1573       MachineOperand &MO1 = MI.getOperand(1);
1574       auto TruncMIB = MIRBuilder.buildTrunc(NarrowTy, MO1);
1575       MO1.setReg(TruncMIB.getReg(0));
1576 
1577       MachineOperand &MO2 = MI.getOperand(0);
1578       Register DstExt = MRI.createGenericVirtualRegister(NarrowTy);
1579       MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
1580       MIRBuilder.buildSExt(MO2, DstExt);
1581       MO2.setReg(DstExt);
1582       Observer.changedInstr(MI);
1583       return Legalized;
1584     }
1585 
1586     // Break it apart. Components below the extension point are unmodified. The
1587     // component containing the extension point becomes a narrower SEXT_INREG.
1588     // Components above it are ashr'd from the component containing the
1589     // extension point.
1590     if (SizeOp0 % NarrowSize != 0)
1591       return UnableToLegalize;
1592     int NumParts = SizeOp0 / NarrowSize;
1593 
1594     // List the registers where the destination will be scattered.
1595     SmallVector<Register, 2> DstRegs;
1596     // List the registers where the source will be split.
1597     SmallVector<Register, 2> SrcRegs;
1598 
1599     // Create all the temporary registers.
1600     for (int i = 0; i < NumParts; ++i) {
1601       Register SrcReg = MRI.createGenericVirtualRegister(NarrowTy);
1602 
1603       SrcRegs.push_back(SrcReg);
1604     }
1605 
1606     // Explode the big arguments into smaller chunks.
1607     MIRBuilder.buildUnmerge(SrcRegs, MI.getOperand(1));
1608 
1609     Register AshrCstReg =
1610         MIRBuilder.buildConstant(NarrowTy, NarrowTy.getScalarSizeInBits() - 1)
1611             .getReg(0);
1612     Register FullExtensionReg;
1613     Register PartialExtensionReg;
1614 
1615     // Do the operation on each small part.
1616     for (int i = 0; i < NumParts; ++i) {
1617       if ((i + 1) * NarrowTy.getScalarSizeInBits() <= SizeInBits) {
1618         DstRegs.push_back(SrcRegs[i]);
1619         PartialExtensionReg = DstRegs.back();
1620       } else if (i * NarrowTy.getScalarSizeInBits() >= SizeInBits) {
1621         assert(PartialExtensionReg &&
1622                "Expected to visit partial extension before full");
1623         if (FullExtensionReg) {
1624           DstRegs.push_back(FullExtensionReg);
1625           continue;
1626         }
1627         DstRegs.push_back(
1628             MIRBuilder.buildAShr(NarrowTy, PartialExtensionReg, AshrCstReg)
1629                 .getReg(0));
1630         FullExtensionReg = DstRegs.back();
1631       } else {
1632         DstRegs.push_back(
1633             MIRBuilder
1634                 .buildInstr(
1635                     TargetOpcode::G_SEXT_INREG, {NarrowTy},
1636                     {SrcRegs[i], SizeInBits % NarrowTy.getScalarSizeInBits()})
1637                 .getReg(0));
1638         PartialExtensionReg = DstRegs.back();
1639       }
1640     }
1641 
1642     // Gather the destination registers into the final destination.
1643     Register DstReg = MI.getOperand(0).getReg();
1644     MIRBuilder.buildMergeLikeInstr(DstReg, DstRegs);
1645     MI.eraseFromParent();
1646     return Legalized;
1647   }
1648   case TargetOpcode::G_BSWAP:
1649   case TargetOpcode::G_BITREVERSE: {
1650     if (SizeOp0 % NarrowSize != 0)
1651       return UnableToLegalize;
1652 
1653     Observer.changingInstr(MI);
1654     SmallVector<Register, 2> SrcRegs, DstRegs;
1655     unsigned NumParts = SizeOp0 / NarrowSize;
1656     extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, SrcRegs,
1657                  MIRBuilder, MRI);
1658 
1659     for (unsigned i = 0; i < NumParts; ++i) {
1660       auto DstPart = MIRBuilder.buildInstr(MI.getOpcode(), {NarrowTy},
1661                                            {SrcRegs[NumParts - 1 - i]});
1662       DstRegs.push_back(DstPart.getReg(0));
1663     }
1664 
1665     MIRBuilder.buildMergeLikeInstr(MI.getOperand(0), DstRegs);
1666 
1667     Observer.changedInstr(MI);
1668     MI.eraseFromParent();
1669     return Legalized;
1670   }
1671   case TargetOpcode::G_PTR_ADD:
1672   case TargetOpcode::G_PTRMASK: {
1673     if (TypeIdx != 1)
1674       return UnableToLegalize;
1675     Observer.changingInstr(MI);
1676     narrowScalarSrc(MI, NarrowTy, 2);
1677     Observer.changedInstr(MI);
1678     return Legalized;
1679   }
1680   case TargetOpcode::G_FPTOUI:
1681   case TargetOpcode::G_FPTOSI:
1682     return narrowScalarFPTOI(MI, TypeIdx, NarrowTy);
1683   case TargetOpcode::G_FPEXT:
1684     if (TypeIdx != 0)
1685       return UnableToLegalize;
1686     Observer.changingInstr(MI);
1687     narrowScalarDst(MI, NarrowTy, 0, TargetOpcode::G_FPEXT);
1688     Observer.changedInstr(MI);
1689     return Legalized;
1690   case TargetOpcode::G_FLDEXP:
1691   case TargetOpcode::G_STRICT_FLDEXP:
1692     return narrowScalarFLDEXP(MI, TypeIdx, NarrowTy);
1693   }
1694 }
1695 
1696 Register LegalizerHelper::coerceToScalar(Register Val) {
1697   LLT Ty = MRI.getType(Val);
1698   if (Ty.isScalar())
1699     return Val;
1700 
1701   const DataLayout &DL = MIRBuilder.getDataLayout();
1702   LLT NewTy = LLT::scalar(Ty.getSizeInBits());
1703   if (Ty.isPointer()) {
1704     if (DL.isNonIntegralAddressSpace(Ty.getAddressSpace()))
1705       return Register();
1706     return MIRBuilder.buildPtrToInt(NewTy, Val).getReg(0);
1707   }
1708 
1709   Register NewVal = Val;
1710 
1711   assert(Ty.isVector());
1712   LLT EltTy = Ty.getElementType();
1713   if (EltTy.isPointer())
1714     NewVal = MIRBuilder.buildPtrToInt(NewTy, NewVal).getReg(0);
1715   return MIRBuilder.buildBitcast(NewTy, NewVal).getReg(0);
1716 }
1717 
1718 void LegalizerHelper::widenScalarSrc(MachineInstr &MI, LLT WideTy,
1719                                      unsigned OpIdx, unsigned ExtOpcode) {
1720   MachineOperand &MO = MI.getOperand(OpIdx);
1721   auto ExtB = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {MO});
1722   MO.setReg(ExtB.getReg(0));
1723 }
1724 
1725 void LegalizerHelper::narrowScalarSrc(MachineInstr &MI, LLT NarrowTy,
1726                                       unsigned OpIdx) {
1727   MachineOperand &MO = MI.getOperand(OpIdx);
1728   auto ExtB = MIRBuilder.buildTrunc(NarrowTy, MO);
1729   MO.setReg(ExtB.getReg(0));
1730 }
1731 
1732 void LegalizerHelper::widenScalarDst(MachineInstr &MI, LLT WideTy,
1733                                      unsigned OpIdx, unsigned TruncOpcode) {
1734   MachineOperand &MO = MI.getOperand(OpIdx);
1735   Register DstExt = MRI.createGenericVirtualRegister(WideTy);
1736   MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
1737   MIRBuilder.buildInstr(TruncOpcode, {MO}, {DstExt});
1738   MO.setReg(DstExt);
1739 }
1740 
1741 void LegalizerHelper::narrowScalarDst(MachineInstr &MI, LLT NarrowTy,
1742                                       unsigned OpIdx, unsigned ExtOpcode) {
1743   MachineOperand &MO = MI.getOperand(OpIdx);
1744   Register DstTrunc = MRI.createGenericVirtualRegister(NarrowTy);
1745   MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
1746   MIRBuilder.buildInstr(ExtOpcode, {MO}, {DstTrunc});
1747   MO.setReg(DstTrunc);
1748 }
1749 
1750 void LegalizerHelper::moreElementsVectorDst(MachineInstr &MI, LLT WideTy,
1751                                             unsigned OpIdx) {
1752   MachineOperand &MO = MI.getOperand(OpIdx);
1753   MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
1754   Register Dst = MO.getReg();
1755   Register DstExt = MRI.createGenericVirtualRegister(WideTy);
1756   MO.setReg(DstExt);
1757   MIRBuilder.buildDeleteTrailingVectorElements(Dst, DstExt);
1758 }
1759 
1760 void LegalizerHelper::moreElementsVectorSrc(MachineInstr &MI, LLT MoreTy,
1761                                             unsigned OpIdx) {
1762   MachineOperand &MO = MI.getOperand(OpIdx);
1763   SmallVector<Register, 8> Regs;
1764   MO.setReg(MIRBuilder.buildPadVectorWithUndefElements(MoreTy, MO).getReg(0));
1765 }
1766 
1767 void LegalizerHelper::bitcastSrc(MachineInstr &MI, LLT CastTy, unsigned OpIdx) {
1768   MachineOperand &Op = MI.getOperand(OpIdx);
1769   Op.setReg(MIRBuilder.buildBitcast(CastTy, Op).getReg(0));
1770 }
1771 
1772 void LegalizerHelper::bitcastDst(MachineInstr &MI, LLT CastTy, unsigned OpIdx) {
1773   MachineOperand &MO = MI.getOperand(OpIdx);
1774   Register CastDst = MRI.createGenericVirtualRegister(CastTy);
1775   MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
1776   MIRBuilder.buildBitcast(MO, CastDst);
1777   MO.setReg(CastDst);
1778 }
1779 
1780 LegalizerHelper::LegalizeResult
1781 LegalizerHelper::widenScalarMergeValues(MachineInstr &MI, unsigned TypeIdx,
1782                                         LLT WideTy) {
1783   if (TypeIdx != 1)
1784     return UnableToLegalize;
1785 
1786   auto [DstReg, DstTy, Src1Reg, Src1Ty] = MI.getFirst2RegLLTs();
1787   if (DstTy.isVector())
1788     return UnableToLegalize;
1789 
1790   LLT SrcTy = MRI.getType(Src1Reg);
1791   const int DstSize = DstTy.getSizeInBits();
1792   const int SrcSize = SrcTy.getSizeInBits();
1793   const int WideSize = WideTy.getSizeInBits();
1794   const int NumMerge = (DstSize + WideSize - 1) / WideSize;
1795 
1796   unsigned NumOps = MI.getNumOperands();
1797   unsigned NumSrc = MI.getNumOperands() - 1;
1798   unsigned PartSize = DstTy.getSizeInBits() / NumSrc;
1799 
1800   if (WideSize >= DstSize) {
1801     // Directly pack the bits in the target type.
1802     Register ResultReg = MIRBuilder.buildZExt(WideTy, Src1Reg).getReg(0);
1803 
1804     for (unsigned I = 2; I != NumOps; ++I) {
1805       const unsigned Offset = (I - 1) * PartSize;
1806 
1807       Register SrcReg = MI.getOperand(I).getReg();
1808       assert(MRI.getType(SrcReg) == LLT::scalar(PartSize));
1809 
1810       auto ZextInput = MIRBuilder.buildZExt(WideTy, SrcReg);
1811 
1812       Register NextResult = I + 1 == NumOps && WideTy == DstTy ? DstReg :
1813         MRI.createGenericVirtualRegister(WideTy);
1814 
1815       auto ShiftAmt = MIRBuilder.buildConstant(WideTy, Offset);
1816       auto Shl = MIRBuilder.buildShl(WideTy, ZextInput, ShiftAmt);
1817       MIRBuilder.buildOr(NextResult, ResultReg, Shl);
1818       ResultReg = NextResult;
1819     }
1820 
1821     if (WideSize > DstSize)
1822       MIRBuilder.buildTrunc(DstReg, ResultReg);
1823     else if (DstTy.isPointer())
1824       MIRBuilder.buildIntToPtr(DstReg, ResultReg);
1825 
1826     MI.eraseFromParent();
1827     return Legalized;
1828   }
1829 
1830   // Unmerge the original values to the GCD type, and recombine to the next
1831   // multiple greater than the original type.
1832   //
1833   // %3:_(s12) = G_MERGE_VALUES %0:_(s4), %1:_(s4), %2:_(s4) -> s6
1834   // %4:_(s2), %5:_(s2) = G_UNMERGE_VALUES %0
1835   // %6:_(s2), %7:_(s2) = G_UNMERGE_VALUES %1
1836   // %8:_(s2), %9:_(s2) = G_UNMERGE_VALUES %2
1837   // %10:_(s6) = G_MERGE_VALUES %4, %5, %6
1838   // %11:_(s6) = G_MERGE_VALUES %7, %8, %9
1839   // %12:_(s12) = G_MERGE_VALUES %10, %11
1840   //
1841   // Padding with undef if necessary:
1842   //
1843   // %2:_(s8) = G_MERGE_VALUES %0:_(s4), %1:_(s4) -> s6
1844   // %3:_(s2), %4:_(s2) = G_UNMERGE_VALUES %0
1845   // %5:_(s2), %6:_(s2) = G_UNMERGE_VALUES %1
1846   // %7:_(s2) = G_IMPLICIT_DEF
1847   // %8:_(s6) = G_MERGE_VALUES %3, %4, %5
1848   // %9:_(s6) = G_MERGE_VALUES %6, %7, %7
1849   // %10:_(s12) = G_MERGE_VALUES %8, %9
1850 
1851   const int GCD = std::gcd(SrcSize, WideSize);
1852   LLT GCDTy = LLT::scalar(GCD);
1853 
1854   SmallVector<Register, 8> Parts;
1855   SmallVector<Register, 8> NewMergeRegs;
1856   SmallVector<Register, 8> Unmerges;
1857   LLT WideDstTy = LLT::scalar(NumMerge * WideSize);
1858 
1859   // Decompose the original operands if they don't evenly divide.
1860   for (const MachineOperand &MO : llvm::drop_begin(MI.operands())) {
1861     Register SrcReg = MO.getReg();
1862     if (GCD == SrcSize) {
1863       Unmerges.push_back(SrcReg);
1864     } else {
1865       auto Unmerge = MIRBuilder.buildUnmerge(GCDTy, SrcReg);
1866       for (int J = 0, JE = Unmerge->getNumOperands() - 1; J != JE; ++J)
1867         Unmerges.push_back(Unmerge.getReg(J));
1868     }
1869   }
1870 
1871   // Pad with undef to the next size that is a multiple of the requested size.
1872   if (static_cast<int>(Unmerges.size()) != NumMerge * WideSize) {
1873     Register UndefReg = MIRBuilder.buildUndef(GCDTy).getReg(0);
1874     for (int I = Unmerges.size(); I != NumMerge * WideSize; ++I)
1875       Unmerges.push_back(UndefReg);
1876   }
1877 
1878   const int PartsPerGCD = WideSize / GCD;
1879 
1880   // Build merges of each piece.
1881   ArrayRef<Register> Slicer(Unmerges);
1882   for (int I = 0; I != NumMerge; ++I, Slicer = Slicer.drop_front(PartsPerGCD)) {
1883     auto Merge =
1884         MIRBuilder.buildMergeLikeInstr(WideTy, Slicer.take_front(PartsPerGCD));
1885     NewMergeRegs.push_back(Merge.getReg(0));
1886   }
1887 
1888   // A truncate may be necessary if the requested type doesn't evenly divide the
1889   // original result type.
1890   if (DstTy.getSizeInBits() == WideDstTy.getSizeInBits()) {
1891     MIRBuilder.buildMergeLikeInstr(DstReg, NewMergeRegs);
1892   } else {
1893     auto FinalMerge = MIRBuilder.buildMergeLikeInstr(WideDstTy, NewMergeRegs);
1894     MIRBuilder.buildTrunc(DstReg, FinalMerge.getReg(0));
1895   }
1896 
1897   MI.eraseFromParent();
1898   return Legalized;
1899 }
1900 
1901 LegalizerHelper::LegalizeResult
1902 LegalizerHelper::widenScalarUnmergeValues(MachineInstr &MI, unsigned TypeIdx,
1903                                           LLT WideTy) {
1904   if (TypeIdx != 0)
1905     return UnableToLegalize;
1906 
1907   int NumDst = MI.getNumOperands() - 1;
1908   Register SrcReg = MI.getOperand(NumDst).getReg();
1909   LLT SrcTy = MRI.getType(SrcReg);
1910   if (SrcTy.isVector())
1911     return UnableToLegalize;
1912 
1913   Register Dst0Reg = MI.getOperand(0).getReg();
1914   LLT DstTy = MRI.getType(Dst0Reg);
1915   if (!DstTy.isScalar())
1916     return UnableToLegalize;
1917 
1918   if (WideTy.getSizeInBits() >= SrcTy.getSizeInBits()) {
1919     if (SrcTy.isPointer()) {
1920       const DataLayout &DL = MIRBuilder.getDataLayout();
1921       if (DL.isNonIntegralAddressSpace(SrcTy.getAddressSpace())) {
1922         LLVM_DEBUG(
1923             dbgs() << "Not casting non-integral address space integer\n");
1924         return UnableToLegalize;
1925       }
1926 
1927       SrcTy = LLT::scalar(SrcTy.getSizeInBits());
1928       SrcReg = MIRBuilder.buildPtrToInt(SrcTy, SrcReg).getReg(0);
1929     }
1930 
1931     // Widen SrcTy to WideTy. This does not affect the result, but since the
1932     // user requested this size, it is probably better handled than SrcTy and
1933     // should reduce the total number of legalization artifacts.
1934     if (WideTy.getSizeInBits() > SrcTy.getSizeInBits()) {
1935       SrcTy = WideTy;
1936       SrcReg = MIRBuilder.buildAnyExt(WideTy, SrcReg).getReg(0);
1937     }
1938 
1939     // Theres no unmerge type to target. Directly extract the bits from the
1940     // source type
1941     unsigned DstSize = DstTy.getSizeInBits();
1942 
1943     MIRBuilder.buildTrunc(Dst0Reg, SrcReg);
1944     for (int I = 1; I != NumDst; ++I) {
1945       auto ShiftAmt = MIRBuilder.buildConstant(SrcTy, DstSize * I);
1946       auto Shr = MIRBuilder.buildLShr(SrcTy, SrcReg, ShiftAmt);
1947       MIRBuilder.buildTrunc(MI.getOperand(I), Shr);
1948     }
1949 
1950     MI.eraseFromParent();
1951     return Legalized;
1952   }
1953 
1954   // Extend the source to a wider type.
1955   LLT LCMTy = getLCMType(SrcTy, WideTy);
1956 
1957   Register WideSrc = SrcReg;
1958   if (LCMTy.getSizeInBits() != SrcTy.getSizeInBits()) {
1959     // TODO: If this is an integral address space, cast to integer and anyext.
1960     if (SrcTy.isPointer()) {
1961       LLVM_DEBUG(dbgs() << "Widening pointer source types not implemented\n");
1962       return UnableToLegalize;
1963     }
1964 
1965     WideSrc = MIRBuilder.buildAnyExt(LCMTy, WideSrc).getReg(0);
1966   }
1967 
1968   auto Unmerge = MIRBuilder.buildUnmerge(WideTy, WideSrc);
1969 
1970   // Create a sequence of unmerges and merges to the original results. Since we
1971   // may have widened the source, we will need to pad the results with dead defs
1972   // to cover the source register.
1973   // e.g. widen s48 to s64:
1974   // %1:_(s48), %2:_(s48) = G_UNMERGE_VALUES %0:_(s96)
1975   //
1976   // =>
1977   //  %4:_(s192) = G_ANYEXT %0:_(s96)
1978   //  %5:_(s64), %6, %7 = G_UNMERGE_VALUES %4 ; Requested unmerge
1979   //  ; unpack to GCD type, with extra dead defs
1980   //  %8:_(s16), %9, %10, %11 = G_UNMERGE_VALUES %5:_(s64)
1981   //  %12:_(s16), %13, dead %14, dead %15 = G_UNMERGE_VALUES %6:_(s64)
1982   //  dead %16:_(s16), dead %17, dead %18, dead %18 = G_UNMERGE_VALUES %7:_(s64)
1983   //  %1:_(s48) = G_MERGE_VALUES %8:_(s16), %9, %10   ; Remerge to destination
1984   //  %2:_(s48) = G_MERGE_VALUES %11:_(s16), %12, %13 ; Remerge to destination
1985   const LLT GCDTy = getGCDType(WideTy, DstTy);
1986   const int NumUnmerge = Unmerge->getNumOperands() - 1;
1987   const int PartsPerRemerge = DstTy.getSizeInBits() / GCDTy.getSizeInBits();
1988 
1989   // Directly unmerge to the destination without going through a GCD type
1990   // if possible
1991   if (PartsPerRemerge == 1) {
1992     const int PartsPerUnmerge = WideTy.getSizeInBits() / DstTy.getSizeInBits();
1993 
1994     for (int I = 0; I != NumUnmerge; ++I) {
1995       auto MIB = MIRBuilder.buildInstr(TargetOpcode::G_UNMERGE_VALUES);
1996 
1997       for (int J = 0; J != PartsPerUnmerge; ++J) {
1998         int Idx = I * PartsPerUnmerge + J;
1999         if (Idx < NumDst)
2000           MIB.addDef(MI.getOperand(Idx).getReg());
2001         else {
2002           // Create dead def for excess components.
2003           MIB.addDef(MRI.createGenericVirtualRegister(DstTy));
2004         }
2005       }
2006 
2007       MIB.addUse(Unmerge.getReg(I));
2008     }
2009   } else {
2010     SmallVector<Register, 16> Parts;
2011     for (int J = 0; J != NumUnmerge; ++J)
2012       extractGCDType(Parts, GCDTy, Unmerge.getReg(J));
2013 
2014     SmallVector<Register, 8> RemergeParts;
2015     for (int I = 0; I != NumDst; ++I) {
2016       for (int J = 0; J < PartsPerRemerge; ++J) {
2017         const int Idx = I * PartsPerRemerge + J;
2018         RemergeParts.emplace_back(Parts[Idx]);
2019       }
2020 
2021       MIRBuilder.buildMergeLikeInstr(MI.getOperand(I).getReg(), RemergeParts);
2022       RemergeParts.clear();
2023     }
2024   }
2025 
2026   MI.eraseFromParent();
2027   return Legalized;
2028 }
2029 
2030 LegalizerHelper::LegalizeResult
2031 LegalizerHelper::widenScalarExtract(MachineInstr &MI, unsigned TypeIdx,
2032                                     LLT WideTy) {
2033   auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
2034   unsigned Offset = MI.getOperand(2).getImm();
2035 
2036   if (TypeIdx == 0) {
2037     if (SrcTy.isVector() || DstTy.isVector())
2038       return UnableToLegalize;
2039 
2040     SrcOp Src(SrcReg);
2041     if (SrcTy.isPointer()) {
2042       // Extracts from pointers can be handled only if they are really just
2043       // simple integers.
2044       const DataLayout &DL = MIRBuilder.getDataLayout();
2045       if (DL.isNonIntegralAddressSpace(SrcTy.getAddressSpace()))
2046         return UnableToLegalize;
2047 
2048       LLT SrcAsIntTy = LLT::scalar(SrcTy.getSizeInBits());
2049       Src = MIRBuilder.buildPtrToInt(SrcAsIntTy, Src);
2050       SrcTy = SrcAsIntTy;
2051     }
2052 
2053     if (DstTy.isPointer())
2054       return UnableToLegalize;
2055 
2056     if (Offset == 0) {
2057       // Avoid a shift in the degenerate case.
2058       MIRBuilder.buildTrunc(DstReg,
2059                             MIRBuilder.buildAnyExtOrTrunc(WideTy, Src));
2060       MI.eraseFromParent();
2061       return Legalized;
2062     }
2063 
2064     // Do a shift in the source type.
2065     LLT ShiftTy = SrcTy;
2066     if (WideTy.getSizeInBits() > SrcTy.getSizeInBits()) {
2067       Src = MIRBuilder.buildAnyExt(WideTy, Src);
2068       ShiftTy = WideTy;
2069     }
2070 
2071     auto LShr = MIRBuilder.buildLShr(
2072       ShiftTy, Src, MIRBuilder.buildConstant(ShiftTy, Offset));
2073     MIRBuilder.buildTrunc(DstReg, LShr);
2074     MI.eraseFromParent();
2075     return Legalized;
2076   }
2077 
2078   if (SrcTy.isScalar()) {
2079     Observer.changingInstr(MI);
2080     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2081     Observer.changedInstr(MI);
2082     return Legalized;
2083   }
2084 
2085   if (!SrcTy.isVector())
2086     return UnableToLegalize;
2087 
2088   if (DstTy != SrcTy.getElementType())
2089     return UnableToLegalize;
2090 
2091   if (Offset % SrcTy.getScalarSizeInBits() != 0)
2092     return UnableToLegalize;
2093 
2094   Observer.changingInstr(MI);
2095   widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2096 
2097   MI.getOperand(2).setImm((WideTy.getSizeInBits() / SrcTy.getSizeInBits()) *
2098                           Offset);
2099   widenScalarDst(MI, WideTy.getScalarType(), 0);
2100   Observer.changedInstr(MI);
2101   return Legalized;
2102 }
2103 
2104 LegalizerHelper::LegalizeResult
2105 LegalizerHelper::widenScalarInsert(MachineInstr &MI, unsigned TypeIdx,
2106                                    LLT WideTy) {
2107   if (TypeIdx != 0 || WideTy.isVector())
2108     return UnableToLegalize;
2109   Observer.changingInstr(MI);
2110   widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2111   widenScalarDst(MI, WideTy);
2112   Observer.changedInstr(MI);
2113   return Legalized;
2114 }
2115 
2116 LegalizerHelper::LegalizeResult
2117 LegalizerHelper::widenScalarAddSubOverflow(MachineInstr &MI, unsigned TypeIdx,
2118                                            LLT WideTy) {
2119   unsigned Opcode;
2120   unsigned ExtOpcode;
2121   std::optional<Register> CarryIn;
2122   switch (MI.getOpcode()) {
2123   default:
2124     llvm_unreachable("Unexpected opcode!");
2125   case TargetOpcode::G_SADDO:
2126     Opcode = TargetOpcode::G_ADD;
2127     ExtOpcode = TargetOpcode::G_SEXT;
2128     break;
2129   case TargetOpcode::G_SSUBO:
2130     Opcode = TargetOpcode::G_SUB;
2131     ExtOpcode = TargetOpcode::G_SEXT;
2132     break;
2133   case TargetOpcode::G_UADDO:
2134     Opcode = TargetOpcode::G_ADD;
2135     ExtOpcode = TargetOpcode::G_ZEXT;
2136     break;
2137   case TargetOpcode::G_USUBO:
2138     Opcode = TargetOpcode::G_SUB;
2139     ExtOpcode = TargetOpcode::G_ZEXT;
2140     break;
2141   case TargetOpcode::G_SADDE:
2142     Opcode = TargetOpcode::G_UADDE;
2143     ExtOpcode = TargetOpcode::G_SEXT;
2144     CarryIn = MI.getOperand(4).getReg();
2145     break;
2146   case TargetOpcode::G_SSUBE:
2147     Opcode = TargetOpcode::G_USUBE;
2148     ExtOpcode = TargetOpcode::G_SEXT;
2149     CarryIn = MI.getOperand(4).getReg();
2150     break;
2151   case TargetOpcode::G_UADDE:
2152     Opcode = TargetOpcode::G_UADDE;
2153     ExtOpcode = TargetOpcode::G_ZEXT;
2154     CarryIn = MI.getOperand(4).getReg();
2155     break;
2156   case TargetOpcode::G_USUBE:
2157     Opcode = TargetOpcode::G_USUBE;
2158     ExtOpcode = TargetOpcode::G_ZEXT;
2159     CarryIn = MI.getOperand(4).getReg();
2160     break;
2161   }
2162 
2163   if (TypeIdx == 1) {
2164     unsigned BoolExtOp = MIRBuilder.getBoolExtOp(WideTy.isVector(), false);
2165 
2166     Observer.changingInstr(MI);
2167     if (CarryIn)
2168       widenScalarSrc(MI, WideTy, 4, BoolExtOp);
2169     widenScalarDst(MI, WideTy, 1);
2170 
2171     Observer.changedInstr(MI);
2172     return Legalized;
2173   }
2174 
2175   auto LHSExt = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {MI.getOperand(2)});
2176   auto RHSExt = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {MI.getOperand(3)});
2177   // Do the arithmetic in the larger type.
2178   Register NewOp;
2179   if (CarryIn) {
2180     LLT CarryOutTy = MRI.getType(MI.getOperand(1).getReg());
2181     NewOp = MIRBuilder
2182                 .buildInstr(Opcode, {WideTy, CarryOutTy},
2183                             {LHSExt, RHSExt, *CarryIn})
2184                 .getReg(0);
2185   } else {
2186     NewOp = MIRBuilder.buildInstr(Opcode, {WideTy}, {LHSExt, RHSExt}).getReg(0);
2187   }
2188   LLT OrigTy = MRI.getType(MI.getOperand(0).getReg());
2189   auto TruncOp = MIRBuilder.buildTrunc(OrigTy, NewOp);
2190   auto ExtOp = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {TruncOp});
2191   // There is no overflow if the ExtOp is the same as NewOp.
2192   MIRBuilder.buildICmp(CmpInst::ICMP_NE, MI.getOperand(1), NewOp, ExtOp);
2193   // Now trunc the NewOp to the original result.
2194   MIRBuilder.buildTrunc(MI.getOperand(0), NewOp);
2195   MI.eraseFromParent();
2196   return Legalized;
2197 }
2198 
2199 LegalizerHelper::LegalizeResult
2200 LegalizerHelper::widenScalarAddSubShlSat(MachineInstr &MI, unsigned TypeIdx,
2201                                          LLT WideTy) {
2202   bool IsSigned = MI.getOpcode() == TargetOpcode::G_SADDSAT ||
2203                   MI.getOpcode() == TargetOpcode::G_SSUBSAT ||
2204                   MI.getOpcode() == TargetOpcode::G_SSHLSAT;
2205   bool IsShift = MI.getOpcode() == TargetOpcode::G_SSHLSAT ||
2206                  MI.getOpcode() == TargetOpcode::G_USHLSAT;
2207   // We can convert this to:
2208   //   1. Any extend iN to iM
2209   //   2. SHL by M-N
2210   //   3. [US][ADD|SUB|SHL]SAT
2211   //   4. L/ASHR by M-N
2212   //
2213   // It may be more efficient to lower this to a min and a max operation in
2214   // the higher precision arithmetic if the promoted operation isn't legal,
2215   // but this decision is up to the target's lowering request.
2216   Register DstReg = MI.getOperand(0).getReg();
2217 
2218   unsigned NewBits = WideTy.getScalarSizeInBits();
2219   unsigned SHLAmount = NewBits - MRI.getType(DstReg).getScalarSizeInBits();
2220 
2221   // Shifts must zero-extend the RHS to preserve the unsigned quantity, and
2222   // must not left shift the RHS to preserve the shift amount.
2223   auto LHS = MIRBuilder.buildAnyExt(WideTy, MI.getOperand(1));
2224   auto RHS = IsShift ? MIRBuilder.buildZExt(WideTy, MI.getOperand(2))
2225                      : MIRBuilder.buildAnyExt(WideTy, MI.getOperand(2));
2226   auto ShiftK = MIRBuilder.buildConstant(WideTy, SHLAmount);
2227   auto ShiftL = MIRBuilder.buildShl(WideTy, LHS, ShiftK);
2228   auto ShiftR = IsShift ? RHS : MIRBuilder.buildShl(WideTy, RHS, ShiftK);
2229 
2230   auto WideInst = MIRBuilder.buildInstr(MI.getOpcode(), {WideTy},
2231                                         {ShiftL, ShiftR}, MI.getFlags());
2232 
2233   // Use a shift that will preserve the number of sign bits when the trunc is
2234   // folded away.
2235   auto Result = IsSigned ? MIRBuilder.buildAShr(WideTy, WideInst, ShiftK)
2236                          : MIRBuilder.buildLShr(WideTy, WideInst, ShiftK);
2237 
2238   MIRBuilder.buildTrunc(DstReg, Result);
2239   MI.eraseFromParent();
2240   return Legalized;
2241 }
2242 
2243 LegalizerHelper::LegalizeResult
2244 LegalizerHelper::widenScalarMulo(MachineInstr &MI, unsigned TypeIdx,
2245                                  LLT WideTy) {
2246   if (TypeIdx == 1) {
2247     Observer.changingInstr(MI);
2248     widenScalarDst(MI, WideTy, 1);
2249     Observer.changedInstr(MI);
2250     return Legalized;
2251   }
2252 
2253   bool IsSigned = MI.getOpcode() == TargetOpcode::G_SMULO;
2254   auto [Result, OriginalOverflow, LHS, RHS] = MI.getFirst4Regs();
2255   LLT SrcTy = MRI.getType(LHS);
2256   LLT OverflowTy = MRI.getType(OriginalOverflow);
2257   unsigned SrcBitWidth = SrcTy.getScalarSizeInBits();
2258 
2259   // To determine if the result overflowed in the larger type, we extend the
2260   // input to the larger type, do the multiply (checking if it overflows),
2261   // then also check the high bits of the result to see if overflow happened
2262   // there.
2263   unsigned ExtOp = IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;
2264   auto LeftOperand = MIRBuilder.buildInstr(ExtOp, {WideTy}, {LHS});
2265   auto RightOperand = MIRBuilder.buildInstr(ExtOp, {WideTy}, {RHS});
2266 
2267   // Multiplication cannot overflow if the WideTy is >= 2 * original width,
2268   // so we don't need to check the overflow result of larger type Mulo.
2269   bool WideMulCanOverflow = WideTy.getScalarSizeInBits() < 2 * SrcBitWidth;
2270 
2271   unsigned MulOpc =
2272       WideMulCanOverflow ? MI.getOpcode() : (unsigned)TargetOpcode::G_MUL;
2273 
2274   MachineInstrBuilder Mulo;
2275   if (WideMulCanOverflow)
2276     Mulo = MIRBuilder.buildInstr(MulOpc, {WideTy, OverflowTy},
2277                                  {LeftOperand, RightOperand});
2278   else
2279     Mulo = MIRBuilder.buildInstr(MulOpc, {WideTy}, {LeftOperand, RightOperand});
2280 
2281   auto Mul = Mulo->getOperand(0);
2282   MIRBuilder.buildTrunc(Result, Mul);
2283 
2284   MachineInstrBuilder ExtResult;
2285   // Overflow occurred if it occurred in the larger type, or if the high part
2286   // of the result does not zero/sign-extend the low part.  Check this second
2287   // possibility first.
2288   if (IsSigned) {
2289     // For signed, overflow occurred when the high part does not sign-extend
2290     // the low part.
2291     ExtResult = MIRBuilder.buildSExtInReg(WideTy, Mul, SrcBitWidth);
2292   } else {
2293     // Unsigned overflow occurred when the high part does not zero-extend the
2294     // low part.
2295     ExtResult = MIRBuilder.buildZExtInReg(WideTy, Mul, SrcBitWidth);
2296   }
2297 
2298   if (WideMulCanOverflow) {
2299     auto Overflow =
2300         MIRBuilder.buildICmp(CmpInst::ICMP_NE, OverflowTy, Mul, ExtResult);
2301     // Finally check if the multiplication in the larger type itself overflowed.
2302     MIRBuilder.buildOr(OriginalOverflow, Mulo->getOperand(1), Overflow);
2303   } else {
2304     MIRBuilder.buildICmp(CmpInst::ICMP_NE, OriginalOverflow, Mul, ExtResult);
2305   }
2306   MI.eraseFromParent();
2307   return Legalized;
2308 }
2309 
2310 LegalizerHelper::LegalizeResult
2311 LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
2312   switch (MI.getOpcode()) {
2313   default:
2314     return UnableToLegalize;
2315   case TargetOpcode::G_ATOMICRMW_XCHG:
2316   case TargetOpcode::G_ATOMICRMW_ADD:
2317   case TargetOpcode::G_ATOMICRMW_SUB:
2318   case TargetOpcode::G_ATOMICRMW_AND:
2319   case TargetOpcode::G_ATOMICRMW_OR:
2320   case TargetOpcode::G_ATOMICRMW_XOR:
2321   case TargetOpcode::G_ATOMICRMW_MIN:
2322   case TargetOpcode::G_ATOMICRMW_MAX:
2323   case TargetOpcode::G_ATOMICRMW_UMIN:
2324   case TargetOpcode::G_ATOMICRMW_UMAX:
2325     assert(TypeIdx == 0 && "atomicrmw with second scalar type");
2326     Observer.changingInstr(MI);
2327     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
2328     widenScalarDst(MI, WideTy, 0);
2329     Observer.changedInstr(MI);
2330     return Legalized;
2331   case TargetOpcode::G_ATOMIC_CMPXCHG:
2332     assert(TypeIdx == 0 && "G_ATOMIC_CMPXCHG with second scalar type");
2333     Observer.changingInstr(MI);
2334     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
2335     widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ANYEXT);
2336     widenScalarDst(MI, WideTy, 0);
2337     Observer.changedInstr(MI);
2338     return Legalized;
2339   case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS:
2340     if (TypeIdx == 0) {
2341       Observer.changingInstr(MI);
2342       widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ANYEXT);
2343       widenScalarSrc(MI, WideTy, 4, TargetOpcode::G_ANYEXT);
2344       widenScalarDst(MI, WideTy, 0);
2345       Observer.changedInstr(MI);
2346       return Legalized;
2347     }
2348     assert(TypeIdx == 1 &&
2349            "G_ATOMIC_CMPXCHG_WITH_SUCCESS with third scalar type");
2350     Observer.changingInstr(MI);
2351     widenScalarDst(MI, WideTy, 1);
2352     Observer.changedInstr(MI);
2353     return Legalized;
2354   case TargetOpcode::G_EXTRACT:
2355     return widenScalarExtract(MI, TypeIdx, WideTy);
2356   case TargetOpcode::G_INSERT:
2357     return widenScalarInsert(MI, TypeIdx, WideTy);
2358   case TargetOpcode::G_MERGE_VALUES:
2359     return widenScalarMergeValues(MI, TypeIdx, WideTy);
2360   case TargetOpcode::G_UNMERGE_VALUES:
2361     return widenScalarUnmergeValues(MI, TypeIdx, WideTy);
2362   case TargetOpcode::G_SADDO:
2363   case TargetOpcode::G_SSUBO:
2364   case TargetOpcode::G_UADDO:
2365   case TargetOpcode::G_USUBO:
2366   case TargetOpcode::G_SADDE:
2367   case TargetOpcode::G_SSUBE:
2368   case TargetOpcode::G_UADDE:
2369   case TargetOpcode::G_USUBE:
2370     return widenScalarAddSubOverflow(MI, TypeIdx, WideTy);
2371   case TargetOpcode::G_UMULO:
2372   case TargetOpcode::G_SMULO:
2373     return widenScalarMulo(MI, TypeIdx, WideTy);
2374   case TargetOpcode::G_SADDSAT:
2375   case TargetOpcode::G_SSUBSAT:
2376   case TargetOpcode::G_SSHLSAT:
2377   case TargetOpcode::G_UADDSAT:
2378   case TargetOpcode::G_USUBSAT:
2379   case TargetOpcode::G_USHLSAT:
2380     return widenScalarAddSubShlSat(MI, TypeIdx, WideTy);
2381   case TargetOpcode::G_CTTZ:
2382   case TargetOpcode::G_CTTZ_ZERO_UNDEF:
2383   case TargetOpcode::G_CTLZ:
2384   case TargetOpcode::G_CTLZ_ZERO_UNDEF:
2385   case TargetOpcode::G_CTPOP: {
2386     if (TypeIdx == 0) {
2387       Observer.changingInstr(MI);
2388       widenScalarDst(MI, WideTy, 0);
2389       Observer.changedInstr(MI);
2390       return Legalized;
2391     }
2392 
2393     Register SrcReg = MI.getOperand(1).getReg();
2394 
2395     // First extend the input.
2396     unsigned ExtOpc = MI.getOpcode() == TargetOpcode::G_CTTZ ||
2397                               MI.getOpcode() == TargetOpcode::G_CTTZ_ZERO_UNDEF
2398                           ? TargetOpcode::G_ANYEXT
2399                           : TargetOpcode::G_ZEXT;
2400     auto MIBSrc = MIRBuilder.buildInstr(ExtOpc, {WideTy}, {SrcReg});
2401     LLT CurTy = MRI.getType(SrcReg);
2402     unsigned NewOpc = MI.getOpcode();
2403     if (NewOpc == TargetOpcode::G_CTTZ) {
2404       // The count is the same in the larger type except if the original
2405       // value was zero.  This can be handled by setting the bit just off
2406       // the top of the original type.
2407       auto TopBit =
2408           APInt::getOneBitSet(WideTy.getSizeInBits(), CurTy.getSizeInBits());
2409       MIBSrc = MIRBuilder.buildOr(
2410         WideTy, MIBSrc, MIRBuilder.buildConstant(WideTy, TopBit));
2411       // Now we know the operand is non-zero, use the more relaxed opcode.
2412       NewOpc = TargetOpcode::G_CTTZ_ZERO_UNDEF;
2413     }
2414 
2415     // Perform the operation at the larger size.
2416     auto MIBNewOp = MIRBuilder.buildInstr(NewOpc, {WideTy}, {MIBSrc});
2417     // This is already the correct result for CTPOP and CTTZs
2418     if (MI.getOpcode() == TargetOpcode::G_CTLZ ||
2419         MI.getOpcode() == TargetOpcode::G_CTLZ_ZERO_UNDEF) {
2420       // The correct result is NewOp - (Difference in widety and current ty).
2421       unsigned SizeDiff = WideTy.getSizeInBits() - CurTy.getSizeInBits();
2422       MIBNewOp = MIRBuilder.buildSub(
2423           WideTy, MIBNewOp, MIRBuilder.buildConstant(WideTy, SizeDiff));
2424     }
2425 
2426     MIRBuilder.buildZExtOrTrunc(MI.getOperand(0), MIBNewOp);
2427     MI.eraseFromParent();
2428     return Legalized;
2429   }
2430   case TargetOpcode::G_BSWAP: {
2431     Observer.changingInstr(MI);
2432     Register DstReg = MI.getOperand(0).getReg();
2433 
2434     Register ShrReg = MRI.createGenericVirtualRegister(WideTy);
2435     Register DstExt = MRI.createGenericVirtualRegister(WideTy);
2436     Register ShiftAmtReg = MRI.createGenericVirtualRegister(WideTy);
2437     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2438 
2439     MI.getOperand(0).setReg(DstExt);
2440 
2441     MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
2442 
2443     LLT Ty = MRI.getType(DstReg);
2444     unsigned DiffBits = WideTy.getScalarSizeInBits() - Ty.getScalarSizeInBits();
2445     MIRBuilder.buildConstant(ShiftAmtReg, DiffBits);
2446     MIRBuilder.buildLShr(ShrReg, DstExt, ShiftAmtReg);
2447 
2448     MIRBuilder.buildTrunc(DstReg, ShrReg);
2449     Observer.changedInstr(MI);
2450     return Legalized;
2451   }
2452   case TargetOpcode::G_BITREVERSE: {
2453     Observer.changingInstr(MI);
2454 
2455     Register DstReg = MI.getOperand(0).getReg();
2456     LLT Ty = MRI.getType(DstReg);
2457     unsigned DiffBits = WideTy.getScalarSizeInBits() - Ty.getScalarSizeInBits();
2458 
2459     Register DstExt = MRI.createGenericVirtualRegister(WideTy);
2460     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2461     MI.getOperand(0).setReg(DstExt);
2462     MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
2463 
2464     auto ShiftAmt = MIRBuilder.buildConstant(WideTy, DiffBits);
2465     auto Shift = MIRBuilder.buildLShr(WideTy, DstExt, ShiftAmt);
2466     MIRBuilder.buildTrunc(DstReg, Shift);
2467     Observer.changedInstr(MI);
2468     return Legalized;
2469   }
2470   case TargetOpcode::G_FREEZE:
2471     Observer.changingInstr(MI);
2472     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2473     widenScalarDst(MI, WideTy);
2474     Observer.changedInstr(MI);
2475     return Legalized;
2476 
2477   case TargetOpcode::G_ABS:
2478     Observer.changingInstr(MI);
2479     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_SEXT);
2480     widenScalarDst(MI, WideTy);
2481     Observer.changedInstr(MI);
2482     return Legalized;
2483 
2484   case TargetOpcode::G_ADD:
2485   case TargetOpcode::G_AND:
2486   case TargetOpcode::G_MUL:
2487   case TargetOpcode::G_OR:
2488   case TargetOpcode::G_XOR:
2489   case TargetOpcode::G_SUB:
2490     // Perform operation at larger width (any extension is fines here, high bits
2491     // don't affect the result) and then truncate the result back to the
2492     // original type.
2493     Observer.changingInstr(MI);
2494     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2495     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
2496     widenScalarDst(MI, WideTy);
2497     Observer.changedInstr(MI);
2498     return Legalized;
2499 
2500   case TargetOpcode::G_SBFX:
2501   case TargetOpcode::G_UBFX:
2502     Observer.changingInstr(MI);
2503 
2504     if (TypeIdx == 0) {
2505       widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2506       widenScalarDst(MI, WideTy);
2507     } else {
2508       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2509       widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ZEXT);
2510     }
2511 
2512     Observer.changedInstr(MI);
2513     return Legalized;
2514 
2515   case TargetOpcode::G_SHL:
2516     Observer.changingInstr(MI);
2517 
2518     if (TypeIdx == 0) {
2519       widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2520       widenScalarDst(MI, WideTy);
2521     } else {
2522       assert(TypeIdx == 1);
2523       // The "number of bits to shift" operand must preserve its value as an
2524       // unsigned integer:
2525       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2526     }
2527 
2528     Observer.changedInstr(MI);
2529     return Legalized;
2530 
2531   case TargetOpcode::G_ROTR:
2532   case TargetOpcode::G_ROTL:
2533     if (TypeIdx != 1)
2534       return UnableToLegalize;
2535 
2536     Observer.changingInstr(MI);
2537     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2538     Observer.changedInstr(MI);
2539     return Legalized;
2540 
2541   case TargetOpcode::G_SDIV:
2542   case TargetOpcode::G_SREM:
2543   case TargetOpcode::G_SMIN:
2544   case TargetOpcode::G_SMAX:
2545     Observer.changingInstr(MI);
2546     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_SEXT);
2547     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
2548     widenScalarDst(MI, WideTy);
2549     Observer.changedInstr(MI);
2550     return Legalized;
2551 
2552   case TargetOpcode::G_SDIVREM:
2553     Observer.changingInstr(MI);
2554     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
2555     widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_SEXT);
2556     widenScalarDst(MI, WideTy);
2557     widenScalarDst(MI, WideTy, 1);
2558     Observer.changedInstr(MI);
2559     return Legalized;
2560 
2561   case TargetOpcode::G_ASHR:
2562   case TargetOpcode::G_LSHR:
2563     Observer.changingInstr(MI);
2564 
2565     if (TypeIdx == 0) {
2566       unsigned CvtOp = MI.getOpcode() == TargetOpcode::G_ASHR ?
2567         TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;
2568 
2569       widenScalarSrc(MI, WideTy, 1, CvtOp);
2570       widenScalarDst(MI, WideTy);
2571     } else {
2572       assert(TypeIdx == 1);
2573       // The "number of bits to shift" operand must preserve its value as an
2574       // unsigned integer:
2575       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2576     }
2577 
2578     Observer.changedInstr(MI);
2579     return Legalized;
2580   case TargetOpcode::G_UDIV:
2581   case TargetOpcode::G_UREM:
2582   case TargetOpcode::G_UMIN:
2583   case TargetOpcode::G_UMAX:
2584     Observer.changingInstr(MI);
2585     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ZEXT);
2586     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2587     widenScalarDst(MI, WideTy);
2588     Observer.changedInstr(MI);
2589     return Legalized;
2590 
2591   case TargetOpcode::G_UDIVREM:
2592     Observer.changingInstr(MI);
2593     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2594     widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ZEXT);
2595     widenScalarDst(MI, WideTy);
2596     widenScalarDst(MI, WideTy, 1);
2597     Observer.changedInstr(MI);
2598     return Legalized;
2599 
2600   case TargetOpcode::G_SELECT:
2601     Observer.changingInstr(MI);
2602     if (TypeIdx == 0) {
2603       // Perform operation at larger width (any extension is fine here, high
2604       // bits don't affect the result) and then truncate the result back to the
2605       // original type.
2606       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
2607       widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_ANYEXT);
2608       widenScalarDst(MI, WideTy);
2609     } else {
2610       bool IsVec = MRI.getType(MI.getOperand(1).getReg()).isVector();
2611       // Explicit extension is required here since high bits affect the result.
2612       widenScalarSrc(MI, WideTy, 1, MIRBuilder.getBoolExtOp(IsVec, false));
2613     }
2614     Observer.changedInstr(MI);
2615     return Legalized;
2616 
2617   case TargetOpcode::G_FPTOSI:
2618   case TargetOpcode::G_FPTOUI:
2619   case TargetOpcode::G_IS_FPCLASS:
2620     Observer.changingInstr(MI);
2621 
2622     if (TypeIdx == 0)
2623       widenScalarDst(MI, WideTy);
2624     else
2625       widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_FPEXT);
2626 
2627     Observer.changedInstr(MI);
2628     return Legalized;
2629   case TargetOpcode::G_SITOFP:
2630     Observer.changingInstr(MI);
2631 
2632     if (TypeIdx == 0)
2633       widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
2634     else
2635       widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_SEXT);
2636 
2637     Observer.changedInstr(MI);
2638     return Legalized;
2639   case TargetOpcode::G_UITOFP:
2640     Observer.changingInstr(MI);
2641 
2642     if (TypeIdx == 0)
2643       widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
2644     else
2645       widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ZEXT);
2646 
2647     Observer.changedInstr(MI);
2648     return Legalized;
2649   case TargetOpcode::G_LOAD:
2650   case TargetOpcode::G_SEXTLOAD:
2651   case TargetOpcode::G_ZEXTLOAD:
2652     Observer.changingInstr(MI);
2653     widenScalarDst(MI, WideTy);
2654     Observer.changedInstr(MI);
2655     return Legalized;
2656 
2657   case TargetOpcode::G_STORE: {
2658     if (TypeIdx != 0)
2659       return UnableToLegalize;
2660 
2661     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
2662     if (!Ty.isScalar())
2663       return UnableToLegalize;
2664 
2665     Observer.changingInstr(MI);
2666 
2667     unsigned ExtType = Ty.getScalarSizeInBits() == 1 ?
2668       TargetOpcode::G_ZEXT : TargetOpcode::G_ANYEXT;
2669     widenScalarSrc(MI, WideTy, 0, ExtType);
2670 
2671     Observer.changedInstr(MI);
2672     return Legalized;
2673   }
2674   case TargetOpcode::G_CONSTANT: {
2675     MachineOperand &SrcMO = MI.getOperand(1);
2676     LLVMContext &Ctx = MIRBuilder.getMF().getFunction().getContext();
2677     unsigned ExtOpc = LI.getExtOpcodeForWideningConstant(
2678         MRI.getType(MI.getOperand(0).getReg()));
2679     assert((ExtOpc == TargetOpcode::G_ZEXT || ExtOpc == TargetOpcode::G_SEXT ||
2680             ExtOpc == TargetOpcode::G_ANYEXT) &&
2681            "Illegal Extend");
2682     const APInt &SrcVal = SrcMO.getCImm()->getValue();
2683     const APInt &Val = (ExtOpc == TargetOpcode::G_SEXT)
2684                            ? SrcVal.sext(WideTy.getSizeInBits())
2685                            : SrcVal.zext(WideTy.getSizeInBits());
2686     Observer.changingInstr(MI);
2687     SrcMO.setCImm(ConstantInt::get(Ctx, Val));
2688 
2689     widenScalarDst(MI, WideTy);
2690     Observer.changedInstr(MI);
2691     return Legalized;
2692   }
2693   case TargetOpcode::G_FCONSTANT: {
2694     // To avoid changing the bits of the constant due to extension to a larger
2695     // type and then using G_FPTRUNC, we simply convert to a G_CONSTANT.
2696     MachineOperand &SrcMO = MI.getOperand(1);
2697     APInt Val = SrcMO.getFPImm()->getValueAPF().bitcastToAPInt();
2698     MIRBuilder.setInstrAndDebugLoc(MI);
2699     auto IntCst = MIRBuilder.buildConstant(MI.getOperand(0).getReg(), Val);
2700     widenScalarDst(*IntCst, WideTy, 0, TargetOpcode::G_TRUNC);
2701     MI.eraseFromParent();
2702     return Legalized;
2703   }
2704   case TargetOpcode::G_IMPLICIT_DEF: {
2705     Observer.changingInstr(MI);
2706     widenScalarDst(MI, WideTy);
2707     Observer.changedInstr(MI);
2708     return Legalized;
2709   }
2710   case TargetOpcode::G_BRCOND:
2711     Observer.changingInstr(MI);
2712     widenScalarSrc(MI, WideTy, 0, MIRBuilder.getBoolExtOp(false, false));
2713     Observer.changedInstr(MI);
2714     return Legalized;
2715 
2716   case TargetOpcode::G_FCMP:
2717     Observer.changingInstr(MI);
2718     if (TypeIdx == 0)
2719       widenScalarDst(MI, WideTy);
2720     else {
2721       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_FPEXT);
2722       widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_FPEXT);
2723     }
2724     Observer.changedInstr(MI);
2725     return Legalized;
2726 
2727   case TargetOpcode::G_ICMP:
2728     Observer.changingInstr(MI);
2729     if (TypeIdx == 0)
2730       widenScalarDst(MI, WideTy);
2731     else {
2732       unsigned ExtOpcode = CmpInst::isSigned(static_cast<CmpInst::Predicate>(
2733                                MI.getOperand(1).getPredicate()))
2734                                ? TargetOpcode::G_SEXT
2735                                : TargetOpcode::G_ZEXT;
2736       widenScalarSrc(MI, WideTy, 2, ExtOpcode);
2737       widenScalarSrc(MI, WideTy, 3, ExtOpcode);
2738     }
2739     Observer.changedInstr(MI);
2740     return Legalized;
2741 
2742   case TargetOpcode::G_PTR_ADD:
2743     assert(TypeIdx == 1 && "unable to legalize pointer of G_PTR_ADD");
2744     Observer.changingInstr(MI);
2745     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
2746     Observer.changedInstr(MI);
2747     return Legalized;
2748 
2749   case TargetOpcode::G_PHI: {
2750     assert(TypeIdx == 0 && "Expecting only Idx 0");
2751 
2752     Observer.changingInstr(MI);
2753     for (unsigned I = 1; I < MI.getNumOperands(); I += 2) {
2754       MachineBasicBlock &OpMBB = *MI.getOperand(I + 1).getMBB();
2755       MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminatorForward());
2756       widenScalarSrc(MI, WideTy, I, TargetOpcode::G_ANYEXT);
2757     }
2758 
2759     MachineBasicBlock &MBB = *MI.getParent();
2760     MIRBuilder.setInsertPt(MBB, --MBB.getFirstNonPHI());
2761     widenScalarDst(MI, WideTy);
2762     Observer.changedInstr(MI);
2763     return Legalized;
2764   }
2765   case TargetOpcode::G_EXTRACT_VECTOR_ELT: {
2766     if (TypeIdx == 0) {
2767       Register VecReg = MI.getOperand(1).getReg();
2768       LLT VecTy = MRI.getType(VecReg);
2769       Observer.changingInstr(MI);
2770 
2771       widenScalarSrc(
2772           MI, LLT::vector(VecTy.getElementCount(), WideTy.getSizeInBits()), 1,
2773           TargetOpcode::G_ANYEXT);
2774 
2775       widenScalarDst(MI, WideTy, 0);
2776       Observer.changedInstr(MI);
2777       return Legalized;
2778     }
2779 
2780     if (TypeIdx != 2)
2781       return UnableToLegalize;
2782     Observer.changingInstr(MI);
2783     // TODO: Probably should be zext
2784     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
2785     Observer.changedInstr(MI);
2786     return Legalized;
2787   }
2788   case TargetOpcode::G_INSERT_VECTOR_ELT: {
2789     if (TypeIdx == 0) {
2790       Observer.changingInstr(MI);
2791       const LLT WideEltTy = WideTy.getElementType();
2792 
2793       widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2794       widenScalarSrc(MI, WideEltTy, 2, TargetOpcode::G_ANYEXT);
2795       widenScalarDst(MI, WideTy, 0);
2796       Observer.changedInstr(MI);
2797       return Legalized;
2798     }
2799 
2800     if (TypeIdx == 1) {
2801       Observer.changingInstr(MI);
2802 
2803       Register VecReg = MI.getOperand(1).getReg();
2804       LLT VecTy = MRI.getType(VecReg);
2805       LLT WideVecTy = LLT::vector(VecTy.getElementCount(), WideTy);
2806 
2807       widenScalarSrc(MI, WideVecTy, 1, TargetOpcode::G_ANYEXT);
2808       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ANYEXT);
2809       widenScalarDst(MI, WideVecTy, 0);
2810       Observer.changedInstr(MI);
2811       return Legalized;
2812     }
2813 
2814     if (TypeIdx == 2) {
2815       Observer.changingInstr(MI);
2816       // TODO: Probably should be zext
2817       widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_SEXT);
2818       Observer.changedInstr(MI);
2819       return Legalized;
2820     }
2821 
2822     return UnableToLegalize;
2823   }
2824   case TargetOpcode::G_FADD:
2825   case TargetOpcode::G_FMUL:
2826   case TargetOpcode::G_FSUB:
2827   case TargetOpcode::G_FMA:
2828   case TargetOpcode::G_FMAD:
2829   case TargetOpcode::G_FNEG:
2830   case TargetOpcode::G_FABS:
2831   case TargetOpcode::G_FCANONICALIZE:
2832   case TargetOpcode::G_FMINNUM:
2833   case TargetOpcode::G_FMAXNUM:
2834   case TargetOpcode::G_FMINNUM_IEEE:
2835   case TargetOpcode::G_FMAXNUM_IEEE:
2836   case TargetOpcode::G_FMINIMUM:
2837   case TargetOpcode::G_FMAXIMUM:
2838   case TargetOpcode::G_FDIV:
2839   case TargetOpcode::G_FREM:
2840   case TargetOpcode::G_FCEIL:
2841   case TargetOpcode::G_FFLOOR:
2842   case TargetOpcode::G_FCOS:
2843   case TargetOpcode::G_FSIN:
2844   case TargetOpcode::G_FLOG10:
2845   case TargetOpcode::G_FLOG:
2846   case TargetOpcode::G_FLOG2:
2847   case TargetOpcode::G_FRINT:
2848   case TargetOpcode::G_FNEARBYINT:
2849   case TargetOpcode::G_FSQRT:
2850   case TargetOpcode::G_FEXP:
2851   case TargetOpcode::G_FEXP2:
2852   case TargetOpcode::G_FEXP10:
2853   case TargetOpcode::G_FPOW:
2854   case TargetOpcode::G_INTRINSIC_TRUNC:
2855   case TargetOpcode::G_INTRINSIC_ROUND:
2856   case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
2857     assert(TypeIdx == 0);
2858     Observer.changingInstr(MI);
2859 
2860     for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I)
2861       widenScalarSrc(MI, WideTy, I, TargetOpcode::G_FPEXT);
2862 
2863     widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
2864     Observer.changedInstr(MI);
2865     return Legalized;
2866   case TargetOpcode::G_FPOWI:
2867   case TargetOpcode::G_FLDEXP:
2868   case TargetOpcode::G_STRICT_FLDEXP: {
2869     if (TypeIdx == 0) {
2870       if (MI.getOpcode() == TargetOpcode::G_STRICT_FLDEXP)
2871         return UnableToLegalize;
2872 
2873       Observer.changingInstr(MI);
2874       widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_FPEXT);
2875       widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
2876       Observer.changedInstr(MI);
2877       return Legalized;
2878     }
2879 
2880     if (TypeIdx == 1) {
2881       // For some reason SelectionDAG tries to promote to a libcall without
2882       // actually changing the integer type for promotion.
2883       Observer.changingInstr(MI);
2884       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_SEXT);
2885       Observer.changedInstr(MI);
2886       return Legalized;
2887     }
2888 
2889     return UnableToLegalize;
2890   }
2891   case TargetOpcode::G_FFREXP: {
2892     Observer.changingInstr(MI);
2893 
2894     if (TypeIdx == 0) {
2895       widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_FPEXT);
2896       widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
2897     } else {
2898       widenScalarDst(MI, WideTy, 1);
2899     }
2900 
2901     Observer.changedInstr(MI);
2902     return Legalized;
2903   }
2904   case TargetOpcode::G_INTTOPTR:
2905     if (TypeIdx != 1)
2906       return UnableToLegalize;
2907 
2908     Observer.changingInstr(MI);
2909     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ZEXT);
2910     Observer.changedInstr(MI);
2911     return Legalized;
2912   case TargetOpcode::G_PTRTOINT:
2913     if (TypeIdx != 0)
2914       return UnableToLegalize;
2915 
2916     Observer.changingInstr(MI);
2917     widenScalarDst(MI, WideTy, 0);
2918     Observer.changedInstr(MI);
2919     return Legalized;
2920   case TargetOpcode::G_BUILD_VECTOR: {
2921     Observer.changingInstr(MI);
2922 
2923     const LLT WideEltTy = TypeIdx == 1 ? WideTy : WideTy.getElementType();
2924     for (int I = 1, E = MI.getNumOperands(); I != E; ++I)
2925       widenScalarSrc(MI, WideEltTy, I, TargetOpcode::G_ANYEXT);
2926 
2927     // Avoid changing the result vector type if the source element type was
2928     // requested.
2929     if (TypeIdx == 1) {
2930       MI.setDesc(MIRBuilder.getTII().get(TargetOpcode::G_BUILD_VECTOR_TRUNC));
2931     } else {
2932       widenScalarDst(MI, WideTy, 0);
2933     }
2934 
2935     Observer.changedInstr(MI);
2936     return Legalized;
2937   }
2938   case TargetOpcode::G_SEXT_INREG:
2939     if (TypeIdx != 0)
2940       return UnableToLegalize;
2941 
2942     Observer.changingInstr(MI);
2943     widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
2944     widenScalarDst(MI, WideTy, 0, TargetOpcode::G_TRUNC);
2945     Observer.changedInstr(MI);
2946     return Legalized;
2947   case TargetOpcode::G_PTRMASK: {
2948     if (TypeIdx != 1)
2949       return UnableToLegalize;
2950     Observer.changingInstr(MI);
2951     widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
2952     Observer.changedInstr(MI);
2953     return Legalized;
2954   }
2955   case TargetOpcode::G_VECREDUCE_FADD:
2956   case TargetOpcode::G_VECREDUCE_FMUL:
2957   case TargetOpcode::G_VECREDUCE_FMIN:
2958   case TargetOpcode::G_VECREDUCE_FMAX:
2959   case TargetOpcode::G_VECREDUCE_FMINIMUM:
2960   case TargetOpcode::G_VECREDUCE_FMAXIMUM:
2961     if (TypeIdx != 0)
2962       return UnableToLegalize;
2963     Observer.changingInstr(MI);
2964     Register VecReg = MI.getOperand(1).getReg();
2965     LLT VecTy = MRI.getType(VecReg);
2966     LLT WideVecTy = VecTy.isVector()
2967                         ? LLT::vector(VecTy.getElementCount(), WideTy)
2968                         : WideTy;
2969     widenScalarSrc(MI, WideVecTy, 1, TargetOpcode::G_FPEXT);
2970     widenScalarDst(MI, WideTy, 0, TargetOpcode::G_FPTRUNC);
2971     Observer.changedInstr(MI);
2972     return Legalized;
2973   }
2974 }
2975 
2976 static void getUnmergePieces(SmallVectorImpl<Register> &Pieces,
2977                              MachineIRBuilder &B, Register Src, LLT Ty) {
2978   auto Unmerge = B.buildUnmerge(Ty, Src);
2979   for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
2980     Pieces.push_back(Unmerge.getReg(I));
2981 }
2982 
2983 LegalizerHelper::LegalizeResult
2984 LegalizerHelper::lowerFConstant(MachineInstr &MI) {
2985   Register Dst = MI.getOperand(0).getReg();
2986 
2987   MachineFunction &MF = MIRBuilder.getMF();
2988   const DataLayout &DL = MIRBuilder.getDataLayout();
2989 
2990   unsigned AddrSpace = DL.getDefaultGlobalsAddressSpace();
2991   LLT AddrPtrTy = LLT::pointer(AddrSpace, DL.getPointerSizeInBits(AddrSpace));
2992   Align Alignment = Align(DL.getABITypeAlign(
2993       getFloatTypeForLLT(MF.getFunction().getContext(), MRI.getType(Dst))));
2994 
2995   auto Addr = MIRBuilder.buildConstantPool(
2996       AddrPtrTy, MF.getConstantPool()->getConstantPoolIndex(
2997                      MI.getOperand(1).getFPImm(), Alignment));
2998 
2999   MachineMemOperand *MMO = MF.getMachineMemOperand(
3000       MachinePointerInfo::getConstantPool(MF), MachineMemOperand::MOLoad,
3001       MRI.getType(Dst), Alignment);
3002 
3003   MIRBuilder.buildLoadInstr(TargetOpcode::G_LOAD, Dst, Addr, *MMO);
3004   MI.eraseFromParent();
3005 
3006   return Legalized;
3007 }
3008 
3009 LegalizerHelper::LegalizeResult
3010 LegalizerHelper::lowerBitcast(MachineInstr &MI) {
3011   auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
3012   if (SrcTy.isVector()) {
3013     LLT SrcEltTy = SrcTy.getElementType();
3014     SmallVector<Register, 8> SrcRegs;
3015 
3016     if (DstTy.isVector()) {
3017       int NumDstElt = DstTy.getNumElements();
3018       int NumSrcElt = SrcTy.getNumElements();
3019 
3020       LLT DstEltTy = DstTy.getElementType();
3021       LLT DstCastTy = DstEltTy; // Intermediate bitcast result type
3022       LLT SrcPartTy = SrcEltTy; // Original unmerge result type.
3023 
3024       // If there's an element size mismatch, insert intermediate casts to match
3025       // the result element type.
3026       if (NumSrcElt < NumDstElt) { // Source element type is larger.
3027         // %1:_(<4 x s8>) = G_BITCAST %0:_(<2 x s16>)
3028         //
3029         // =>
3030         //
3031         // %2:_(s16), %3:_(s16) = G_UNMERGE_VALUES %0
3032         // %3:_(<2 x s8>) = G_BITCAST %2
3033         // %4:_(<2 x s8>) = G_BITCAST %3
3034         // %1:_(<4 x s16>) = G_CONCAT_VECTORS %3, %4
3035         DstCastTy = LLT::fixed_vector(NumDstElt / NumSrcElt, DstEltTy);
3036         SrcPartTy = SrcEltTy;
3037       } else if (NumSrcElt > NumDstElt) { // Source element type is smaller.
3038         //
3039         // %1:_(<2 x s16>) = G_BITCAST %0:_(<4 x s8>)
3040         //
3041         // =>
3042         //
3043         // %2:_(<2 x s8>), %3:_(<2 x s8>) = G_UNMERGE_VALUES %0
3044         // %3:_(s16) = G_BITCAST %2
3045         // %4:_(s16) = G_BITCAST %3
3046         // %1:_(<2 x s16>) = G_BUILD_VECTOR %3, %4
3047         SrcPartTy = LLT::fixed_vector(NumSrcElt / NumDstElt, SrcEltTy);
3048         DstCastTy = DstEltTy;
3049       }
3050 
3051       getUnmergePieces(SrcRegs, MIRBuilder, Src, SrcPartTy);
3052       for (Register &SrcReg : SrcRegs)
3053         SrcReg = MIRBuilder.buildBitcast(DstCastTy, SrcReg).getReg(0);
3054     } else
3055       getUnmergePieces(SrcRegs, MIRBuilder, Src, SrcEltTy);
3056 
3057     MIRBuilder.buildMergeLikeInstr(Dst, SrcRegs);
3058     MI.eraseFromParent();
3059     return Legalized;
3060   }
3061 
3062   if (DstTy.isVector()) {
3063     SmallVector<Register, 8> SrcRegs;
3064     getUnmergePieces(SrcRegs, MIRBuilder, Src, DstTy.getElementType());
3065     MIRBuilder.buildMergeLikeInstr(Dst, SrcRegs);
3066     MI.eraseFromParent();
3067     return Legalized;
3068   }
3069 
3070   return UnableToLegalize;
3071 }
3072 
3073 /// Figure out the bit offset into a register when coercing a vector index for
3074 /// the wide element type. This is only for the case when promoting vector to
3075 /// one with larger elements.
3076 //
3077 ///
3078 /// %offset_idx = G_AND %idx, ~(-1 << Log2(DstEltSize / SrcEltSize))
3079 /// %offset_bits = G_SHL %offset_idx, Log2(SrcEltSize)
3080 static Register getBitcastWiderVectorElementOffset(MachineIRBuilder &B,
3081                                                    Register Idx,
3082                                                    unsigned NewEltSize,
3083                                                    unsigned OldEltSize) {
3084   const unsigned Log2EltRatio = Log2_32(NewEltSize / OldEltSize);
3085   LLT IdxTy = B.getMRI()->getType(Idx);
3086 
3087   // Now figure out the amount we need to shift to get the target bits.
3088   auto OffsetMask = B.buildConstant(
3089       IdxTy, ~(APInt::getAllOnes(IdxTy.getSizeInBits()) << Log2EltRatio));
3090   auto OffsetIdx = B.buildAnd(IdxTy, Idx, OffsetMask);
3091   return B.buildShl(IdxTy, OffsetIdx,
3092                     B.buildConstant(IdxTy, Log2_32(OldEltSize))).getReg(0);
3093 }
3094 
3095 /// Perform a G_EXTRACT_VECTOR_ELT in a different sized vector element. If this
3096 /// is casting to a vector with a smaller element size, perform multiple element
3097 /// extracts and merge the results. If this is coercing to a vector with larger
3098 /// elements, index the bitcasted vector and extract the target element with bit
3099 /// operations. This is intended to force the indexing in the native register
3100 /// size for architectures that can dynamically index the register file.
3101 LegalizerHelper::LegalizeResult
3102 LegalizerHelper::bitcastExtractVectorElt(MachineInstr &MI, unsigned TypeIdx,
3103                                          LLT CastTy) {
3104   if (TypeIdx != 1)
3105     return UnableToLegalize;
3106 
3107   auto [Dst, DstTy, SrcVec, SrcVecTy, Idx, IdxTy] = MI.getFirst3RegLLTs();
3108 
3109   LLT SrcEltTy = SrcVecTy.getElementType();
3110   unsigned NewNumElts = CastTy.isVector() ? CastTy.getNumElements() : 1;
3111   unsigned OldNumElts = SrcVecTy.getNumElements();
3112 
3113   LLT NewEltTy = CastTy.isVector() ? CastTy.getElementType() : CastTy;
3114   Register CastVec = MIRBuilder.buildBitcast(CastTy, SrcVec).getReg(0);
3115 
3116   const unsigned NewEltSize = NewEltTy.getSizeInBits();
3117   const unsigned OldEltSize = SrcEltTy.getSizeInBits();
3118   if (NewNumElts > OldNumElts) {
3119     // Decreasing the vector element size
3120     //
3121     // e.g. i64 = extract_vector_elt x:v2i64, y:i32
3122     //  =>
3123     //  v4i32:castx = bitcast x:v2i64
3124     //
3125     // i64 = bitcast
3126     //   (v2i32 build_vector (i32 (extract_vector_elt castx, (2 * y))),
3127     //                       (i32 (extract_vector_elt castx, (2 * y + 1)))
3128     //
3129     if (NewNumElts % OldNumElts != 0)
3130       return UnableToLegalize;
3131 
3132     // Type of the intermediate result vector.
3133     const unsigned NewEltsPerOldElt = NewNumElts / OldNumElts;
3134     LLT MidTy =
3135         LLT::scalarOrVector(ElementCount::getFixed(NewEltsPerOldElt), NewEltTy);
3136 
3137     auto NewEltsPerOldEltK = MIRBuilder.buildConstant(IdxTy, NewEltsPerOldElt);
3138 
3139     SmallVector<Register, 8> NewOps(NewEltsPerOldElt);
3140     auto NewBaseIdx = MIRBuilder.buildMul(IdxTy, Idx, NewEltsPerOldEltK);
3141 
3142     for (unsigned I = 0; I < NewEltsPerOldElt; ++I) {
3143       auto IdxOffset = MIRBuilder.buildConstant(IdxTy, I);
3144       auto TmpIdx = MIRBuilder.buildAdd(IdxTy, NewBaseIdx, IdxOffset);
3145       auto Elt = MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec, TmpIdx);
3146       NewOps[I] = Elt.getReg(0);
3147     }
3148 
3149     auto NewVec = MIRBuilder.buildBuildVector(MidTy, NewOps);
3150     MIRBuilder.buildBitcast(Dst, NewVec);
3151     MI.eraseFromParent();
3152     return Legalized;
3153   }
3154 
3155   if (NewNumElts < OldNumElts) {
3156     if (NewEltSize % OldEltSize != 0)
3157       return UnableToLegalize;
3158 
3159     // This only depends on powers of 2 because we use bit tricks to figure out
3160     // the bit offset we need to shift to get the target element. A general
3161     // expansion could emit division/multiply.
3162     if (!isPowerOf2_32(NewEltSize / OldEltSize))
3163       return UnableToLegalize;
3164 
3165     // Increasing the vector element size.
3166     // %elt:_(small_elt) = G_EXTRACT_VECTOR_ELT %vec:_(<N x small_elt>), %idx
3167     //
3168     //   =>
3169     //
3170     // %cast = G_BITCAST %vec
3171     // %scaled_idx = G_LSHR %idx, Log2(DstEltSize / SrcEltSize)
3172     // %wide_elt  = G_EXTRACT_VECTOR_ELT %cast, %scaled_idx
3173     // %offset_idx = G_AND %idx, ~(-1 << Log2(DstEltSize / SrcEltSize))
3174     // %offset_bits = G_SHL %offset_idx, Log2(SrcEltSize)
3175     // %elt_bits = G_LSHR %wide_elt, %offset_bits
3176     // %elt = G_TRUNC %elt_bits
3177 
3178     const unsigned Log2EltRatio = Log2_32(NewEltSize / OldEltSize);
3179     auto Log2Ratio = MIRBuilder.buildConstant(IdxTy, Log2EltRatio);
3180 
3181     // Divide to get the index in the wider element type.
3182     auto ScaledIdx = MIRBuilder.buildLShr(IdxTy, Idx, Log2Ratio);
3183 
3184     Register WideElt = CastVec;
3185     if (CastTy.isVector()) {
3186       WideElt = MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec,
3187                                                      ScaledIdx).getReg(0);
3188     }
3189 
3190     // Compute the bit offset into the register of the target element.
3191     Register OffsetBits = getBitcastWiderVectorElementOffset(
3192       MIRBuilder, Idx, NewEltSize, OldEltSize);
3193 
3194     // Shift the wide element to get the target element.
3195     auto ExtractedBits = MIRBuilder.buildLShr(NewEltTy, WideElt, OffsetBits);
3196     MIRBuilder.buildTrunc(Dst, ExtractedBits);
3197     MI.eraseFromParent();
3198     return Legalized;
3199   }
3200 
3201   return UnableToLegalize;
3202 }
3203 
3204 /// Emit code to insert \p InsertReg into \p TargetRet at \p OffsetBits in \p
3205 /// TargetReg, while preserving other bits in \p TargetReg.
3206 ///
3207 /// (InsertReg << Offset) | (TargetReg & ~(-1 >> InsertReg.size()) << Offset)
3208 static Register buildBitFieldInsert(MachineIRBuilder &B,
3209                                     Register TargetReg, Register InsertReg,
3210                                     Register OffsetBits) {
3211   LLT TargetTy = B.getMRI()->getType(TargetReg);
3212   LLT InsertTy = B.getMRI()->getType(InsertReg);
3213   auto ZextVal = B.buildZExt(TargetTy, InsertReg);
3214   auto ShiftedInsertVal = B.buildShl(TargetTy, ZextVal, OffsetBits);
3215 
3216   // Produce a bitmask of the value to insert
3217   auto EltMask = B.buildConstant(
3218     TargetTy, APInt::getLowBitsSet(TargetTy.getSizeInBits(),
3219                                    InsertTy.getSizeInBits()));
3220   // Shift it into position
3221   auto ShiftedMask = B.buildShl(TargetTy, EltMask, OffsetBits);
3222   auto InvShiftedMask = B.buildNot(TargetTy, ShiftedMask);
3223 
3224   // Clear out the bits in the wide element
3225   auto MaskedOldElt = B.buildAnd(TargetTy, TargetReg, InvShiftedMask);
3226 
3227   // The value to insert has all zeros already, so stick it into the masked
3228   // wide element.
3229   return B.buildOr(TargetTy, MaskedOldElt, ShiftedInsertVal).getReg(0);
3230 }
3231 
3232 /// Perform a G_INSERT_VECTOR_ELT in a different sized vector element. If this
3233 /// is increasing the element size, perform the indexing in the target element
3234 /// type, and use bit operations to insert at the element position. This is
3235 /// intended for architectures that can dynamically index the register file and
3236 /// want to force indexing in the native register size.
3237 LegalizerHelper::LegalizeResult
3238 LegalizerHelper::bitcastInsertVectorElt(MachineInstr &MI, unsigned TypeIdx,
3239                                         LLT CastTy) {
3240   if (TypeIdx != 0)
3241     return UnableToLegalize;
3242 
3243   auto [Dst, DstTy, SrcVec, SrcVecTy, Val, ValTy, Idx, IdxTy] =
3244       MI.getFirst4RegLLTs();
3245   LLT VecTy = DstTy;
3246 
3247   LLT VecEltTy = VecTy.getElementType();
3248   LLT NewEltTy = CastTy.isVector() ? CastTy.getElementType() : CastTy;
3249   const unsigned NewEltSize = NewEltTy.getSizeInBits();
3250   const unsigned OldEltSize = VecEltTy.getSizeInBits();
3251 
3252   unsigned NewNumElts = CastTy.isVector() ? CastTy.getNumElements() : 1;
3253   unsigned OldNumElts = VecTy.getNumElements();
3254 
3255   Register CastVec = MIRBuilder.buildBitcast(CastTy, SrcVec).getReg(0);
3256   if (NewNumElts < OldNumElts) {
3257     if (NewEltSize % OldEltSize != 0)
3258       return UnableToLegalize;
3259 
3260     // This only depends on powers of 2 because we use bit tricks to figure out
3261     // the bit offset we need to shift to get the target element. A general
3262     // expansion could emit division/multiply.
3263     if (!isPowerOf2_32(NewEltSize / OldEltSize))
3264       return UnableToLegalize;
3265 
3266     const unsigned Log2EltRatio = Log2_32(NewEltSize / OldEltSize);
3267     auto Log2Ratio = MIRBuilder.buildConstant(IdxTy, Log2EltRatio);
3268 
3269     // Divide to get the index in the wider element type.
3270     auto ScaledIdx = MIRBuilder.buildLShr(IdxTy, Idx, Log2Ratio);
3271 
3272     Register ExtractedElt = CastVec;
3273     if (CastTy.isVector()) {
3274       ExtractedElt = MIRBuilder.buildExtractVectorElement(NewEltTy, CastVec,
3275                                                           ScaledIdx).getReg(0);
3276     }
3277 
3278     // Compute the bit offset into the register of the target element.
3279     Register OffsetBits = getBitcastWiderVectorElementOffset(
3280       MIRBuilder, Idx, NewEltSize, OldEltSize);
3281 
3282     Register InsertedElt = buildBitFieldInsert(MIRBuilder, ExtractedElt,
3283                                                Val, OffsetBits);
3284     if (CastTy.isVector()) {
3285       InsertedElt = MIRBuilder.buildInsertVectorElement(
3286         CastTy, CastVec, InsertedElt, ScaledIdx).getReg(0);
3287     }
3288 
3289     MIRBuilder.buildBitcast(Dst, InsertedElt);
3290     MI.eraseFromParent();
3291     return Legalized;
3292   }
3293 
3294   return UnableToLegalize;
3295 }
3296 
3297 LegalizerHelper::LegalizeResult LegalizerHelper::lowerLoad(GAnyLoad &LoadMI) {
3298   // Lower to a memory-width G_LOAD and a G_SEXT/G_ZEXT/G_ANYEXT
3299   Register DstReg = LoadMI.getDstReg();
3300   Register PtrReg = LoadMI.getPointerReg();
3301   LLT DstTy = MRI.getType(DstReg);
3302   MachineMemOperand &MMO = LoadMI.getMMO();
3303   LLT MemTy = MMO.getMemoryType();
3304   MachineFunction &MF = MIRBuilder.getMF();
3305 
3306   unsigned MemSizeInBits = MemTy.getSizeInBits();
3307   unsigned MemStoreSizeInBits = 8 * MemTy.getSizeInBytes();
3308 
3309   if (MemSizeInBits != MemStoreSizeInBits) {
3310     if (MemTy.isVector())
3311       return UnableToLegalize;
3312 
3313     // Promote to a byte-sized load if not loading an integral number of
3314     // bytes.  For example, promote EXTLOAD:i20 -> EXTLOAD:i24.
3315     LLT WideMemTy = LLT::scalar(MemStoreSizeInBits);
3316     MachineMemOperand *NewMMO =
3317         MF.getMachineMemOperand(&MMO, MMO.getPointerInfo(), WideMemTy);
3318 
3319     Register LoadReg = DstReg;
3320     LLT LoadTy = DstTy;
3321 
3322     // If this wasn't already an extending load, we need to widen the result
3323     // register to avoid creating a load with a narrower result than the source.
3324     if (MemStoreSizeInBits > DstTy.getSizeInBits()) {
3325       LoadTy = WideMemTy;
3326       LoadReg = MRI.createGenericVirtualRegister(WideMemTy);
3327     }
3328 
3329     if (isa<GSExtLoad>(LoadMI)) {
3330       auto NewLoad = MIRBuilder.buildLoad(LoadTy, PtrReg, *NewMMO);
3331       MIRBuilder.buildSExtInReg(LoadReg, NewLoad, MemSizeInBits);
3332     } else if (isa<GZExtLoad>(LoadMI) || WideMemTy == LoadTy) {
3333       auto NewLoad = MIRBuilder.buildLoad(LoadTy, PtrReg, *NewMMO);
3334       // The extra bits are guaranteed to be zero, since we stored them that
3335       // way.  A zext load from Wide thus automatically gives zext from MemVT.
3336       MIRBuilder.buildAssertZExt(LoadReg, NewLoad, MemSizeInBits);
3337     } else {
3338       MIRBuilder.buildLoad(LoadReg, PtrReg, *NewMMO);
3339     }
3340 
3341     if (DstTy != LoadTy)
3342       MIRBuilder.buildTrunc(DstReg, LoadReg);
3343 
3344     LoadMI.eraseFromParent();
3345     return Legalized;
3346   }
3347 
3348   // Big endian lowering not implemented.
3349   if (MIRBuilder.getDataLayout().isBigEndian())
3350     return UnableToLegalize;
3351 
3352   // This load needs splitting into power of 2 sized loads.
3353   //
3354   // Our strategy here is to generate anyextending loads for the smaller
3355   // types up to next power-2 result type, and then combine the two larger
3356   // result values together, before truncating back down to the non-pow-2
3357   // type.
3358   // E.g. v1 = i24 load =>
3359   // v2 = i32 zextload (2 byte)
3360   // v3 = i32 load (1 byte)
3361   // v4 = i32 shl v3, 16
3362   // v5 = i32 or v4, v2
3363   // v1 = i24 trunc v5
3364   // By doing this we generate the correct truncate which should get
3365   // combined away as an artifact with a matching extend.
3366 
3367   uint64_t LargeSplitSize, SmallSplitSize;
3368 
3369   if (!isPowerOf2_32(MemSizeInBits)) {
3370     // This load needs splitting into power of 2 sized loads.
3371     LargeSplitSize = llvm::bit_floor(MemSizeInBits);
3372     SmallSplitSize = MemSizeInBits - LargeSplitSize;
3373   } else {
3374     // This is already a power of 2, but we still need to split this in half.
3375     //
3376     // Assume we're being asked to decompose an unaligned load.
3377     // TODO: If this requires multiple splits, handle them all at once.
3378     auto &Ctx = MF.getFunction().getContext();
3379     if (TLI.allowsMemoryAccess(Ctx, MIRBuilder.getDataLayout(), MemTy, MMO))
3380       return UnableToLegalize;
3381 
3382     SmallSplitSize = LargeSplitSize = MemSizeInBits / 2;
3383   }
3384 
3385   if (MemTy.isVector()) {
3386     // TODO: Handle vector extloads
3387     if (MemTy != DstTy)
3388       return UnableToLegalize;
3389 
3390     // TODO: We can do better than scalarizing the vector and at least split it
3391     // in half.
3392     return reduceLoadStoreWidth(LoadMI, 0, DstTy.getElementType());
3393   }
3394 
3395   MachineMemOperand *LargeMMO =
3396       MF.getMachineMemOperand(&MMO, 0, LargeSplitSize / 8);
3397   MachineMemOperand *SmallMMO =
3398       MF.getMachineMemOperand(&MMO, LargeSplitSize / 8, SmallSplitSize / 8);
3399 
3400   LLT PtrTy = MRI.getType(PtrReg);
3401   unsigned AnyExtSize = PowerOf2Ceil(DstTy.getSizeInBits());
3402   LLT AnyExtTy = LLT::scalar(AnyExtSize);
3403   auto LargeLoad = MIRBuilder.buildLoadInstr(TargetOpcode::G_ZEXTLOAD, AnyExtTy,
3404                                              PtrReg, *LargeMMO);
3405 
3406   auto OffsetCst = MIRBuilder.buildConstant(LLT::scalar(PtrTy.getSizeInBits()),
3407                                             LargeSplitSize / 8);
3408   Register PtrAddReg = MRI.createGenericVirtualRegister(PtrTy);
3409   auto SmallPtr = MIRBuilder.buildPtrAdd(PtrAddReg, PtrReg, OffsetCst);
3410   auto SmallLoad = MIRBuilder.buildLoadInstr(LoadMI.getOpcode(), AnyExtTy,
3411                                              SmallPtr, *SmallMMO);
3412 
3413   auto ShiftAmt = MIRBuilder.buildConstant(AnyExtTy, LargeSplitSize);
3414   auto Shift = MIRBuilder.buildShl(AnyExtTy, SmallLoad, ShiftAmt);
3415 
3416   if (AnyExtTy == DstTy)
3417     MIRBuilder.buildOr(DstReg, Shift, LargeLoad);
3418   else if (AnyExtTy.getSizeInBits() != DstTy.getSizeInBits()) {
3419     auto Or = MIRBuilder.buildOr(AnyExtTy, Shift, LargeLoad);
3420     MIRBuilder.buildTrunc(DstReg, {Or});
3421   } else {
3422     assert(DstTy.isPointer() && "expected pointer");
3423     auto Or = MIRBuilder.buildOr(AnyExtTy, Shift, LargeLoad);
3424 
3425     // FIXME: We currently consider this to be illegal for non-integral address
3426     // spaces, but we need still need a way to reinterpret the bits.
3427     MIRBuilder.buildIntToPtr(DstReg, Or);
3428   }
3429 
3430   LoadMI.eraseFromParent();
3431   return Legalized;
3432 }
3433 
3434 LegalizerHelper::LegalizeResult LegalizerHelper::lowerStore(GStore &StoreMI) {
3435   // Lower a non-power of 2 store into multiple pow-2 stores.
3436   // E.g. split an i24 store into an i16 store + i8 store.
3437   // We do this by first extending the stored value to the next largest power
3438   // of 2 type, and then using truncating stores to store the components.
3439   // By doing this, likewise with G_LOAD, generate an extend that can be
3440   // artifact-combined away instead of leaving behind extracts.
3441   Register SrcReg = StoreMI.getValueReg();
3442   Register PtrReg = StoreMI.getPointerReg();
3443   LLT SrcTy = MRI.getType(SrcReg);
3444   MachineFunction &MF = MIRBuilder.getMF();
3445   MachineMemOperand &MMO = **StoreMI.memoperands_begin();
3446   LLT MemTy = MMO.getMemoryType();
3447 
3448   unsigned StoreWidth = MemTy.getSizeInBits();
3449   unsigned StoreSizeInBits = 8 * MemTy.getSizeInBytes();
3450 
3451   if (StoreWidth != StoreSizeInBits) {
3452     if (SrcTy.isVector())
3453       return UnableToLegalize;
3454 
3455     // Promote to a byte-sized store with upper bits zero if not
3456     // storing an integral number of bytes.  For example, promote
3457     // TRUNCSTORE:i1 X -> TRUNCSTORE:i8 (and X, 1)
3458     LLT WideTy = LLT::scalar(StoreSizeInBits);
3459 
3460     if (StoreSizeInBits > SrcTy.getSizeInBits()) {
3461       // Avoid creating a store with a narrower source than result.
3462       SrcReg = MIRBuilder.buildAnyExt(WideTy, SrcReg).getReg(0);
3463       SrcTy = WideTy;
3464     }
3465 
3466     auto ZextInReg = MIRBuilder.buildZExtInReg(SrcTy, SrcReg, StoreWidth);
3467 
3468     MachineMemOperand *NewMMO =
3469         MF.getMachineMemOperand(&MMO, MMO.getPointerInfo(), WideTy);
3470     MIRBuilder.buildStore(ZextInReg, PtrReg, *NewMMO);
3471     StoreMI.eraseFromParent();
3472     return Legalized;
3473   }
3474 
3475   if (MemTy.isVector()) {
3476     // TODO: Handle vector trunc stores
3477     if (MemTy != SrcTy)
3478       return UnableToLegalize;
3479 
3480     // TODO: We can do better than scalarizing the vector and at least split it
3481     // in half.
3482     return reduceLoadStoreWidth(StoreMI, 0, SrcTy.getElementType());
3483   }
3484 
3485   unsigned MemSizeInBits = MemTy.getSizeInBits();
3486   uint64_t LargeSplitSize, SmallSplitSize;
3487 
3488   if (!isPowerOf2_32(MemSizeInBits)) {
3489     LargeSplitSize = llvm::bit_floor<uint64_t>(MemTy.getSizeInBits());
3490     SmallSplitSize = MemTy.getSizeInBits() - LargeSplitSize;
3491   } else {
3492     auto &Ctx = MF.getFunction().getContext();
3493     if (TLI.allowsMemoryAccess(Ctx, MIRBuilder.getDataLayout(), MemTy, MMO))
3494       return UnableToLegalize; // Don't know what we're being asked to do.
3495 
3496     SmallSplitSize = LargeSplitSize = MemSizeInBits / 2;
3497   }
3498 
3499   // Extend to the next pow-2. If this store was itself the result of lowering,
3500   // e.g. an s56 store being broken into s32 + s24, we might have a stored type
3501   // that's wider than the stored size.
3502   unsigned AnyExtSize = PowerOf2Ceil(MemTy.getSizeInBits());
3503   const LLT NewSrcTy = LLT::scalar(AnyExtSize);
3504 
3505   if (SrcTy.isPointer()) {
3506     const LLT IntPtrTy = LLT::scalar(SrcTy.getSizeInBits());
3507     SrcReg = MIRBuilder.buildPtrToInt(IntPtrTy, SrcReg).getReg(0);
3508   }
3509 
3510   auto ExtVal = MIRBuilder.buildAnyExtOrTrunc(NewSrcTy, SrcReg);
3511 
3512   // Obtain the smaller value by shifting away the larger value.
3513   auto ShiftAmt = MIRBuilder.buildConstant(NewSrcTy, LargeSplitSize);
3514   auto SmallVal = MIRBuilder.buildLShr(NewSrcTy, ExtVal, ShiftAmt);
3515 
3516   // Generate the PtrAdd and truncating stores.
3517   LLT PtrTy = MRI.getType(PtrReg);
3518   auto OffsetCst = MIRBuilder.buildConstant(
3519     LLT::scalar(PtrTy.getSizeInBits()), LargeSplitSize / 8);
3520   auto SmallPtr =
3521     MIRBuilder.buildPtrAdd(PtrTy, PtrReg, OffsetCst);
3522 
3523   MachineMemOperand *LargeMMO =
3524     MF.getMachineMemOperand(&MMO, 0, LargeSplitSize / 8);
3525   MachineMemOperand *SmallMMO =
3526     MF.getMachineMemOperand(&MMO, LargeSplitSize / 8, SmallSplitSize / 8);
3527   MIRBuilder.buildStore(ExtVal, PtrReg, *LargeMMO);
3528   MIRBuilder.buildStore(SmallVal, SmallPtr, *SmallMMO);
3529   StoreMI.eraseFromParent();
3530   return Legalized;
3531 }
3532 
3533 LegalizerHelper::LegalizeResult
3534 LegalizerHelper::bitcast(MachineInstr &MI, unsigned TypeIdx, LLT CastTy) {
3535   switch (MI.getOpcode()) {
3536   case TargetOpcode::G_LOAD: {
3537     if (TypeIdx != 0)
3538       return UnableToLegalize;
3539     MachineMemOperand &MMO = **MI.memoperands_begin();
3540 
3541     // Not sure how to interpret a bitcast of an extending load.
3542     if (MMO.getMemoryType().getSizeInBits() != CastTy.getSizeInBits())
3543       return UnableToLegalize;
3544 
3545     Observer.changingInstr(MI);
3546     bitcastDst(MI, CastTy, 0);
3547     MMO.setType(CastTy);
3548     Observer.changedInstr(MI);
3549     return Legalized;
3550   }
3551   case TargetOpcode::G_STORE: {
3552     if (TypeIdx != 0)
3553       return UnableToLegalize;
3554 
3555     MachineMemOperand &MMO = **MI.memoperands_begin();
3556 
3557     // Not sure how to interpret a bitcast of a truncating store.
3558     if (MMO.getMemoryType().getSizeInBits() != CastTy.getSizeInBits())
3559       return UnableToLegalize;
3560 
3561     Observer.changingInstr(MI);
3562     bitcastSrc(MI, CastTy, 0);
3563     MMO.setType(CastTy);
3564     Observer.changedInstr(MI);
3565     return Legalized;
3566   }
3567   case TargetOpcode::G_SELECT: {
3568     if (TypeIdx != 0)
3569       return UnableToLegalize;
3570 
3571     if (MRI.getType(MI.getOperand(1).getReg()).isVector()) {
3572       LLVM_DEBUG(
3573           dbgs() << "bitcast action not implemented for vector select\n");
3574       return UnableToLegalize;
3575     }
3576 
3577     Observer.changingInstr(MI);
3578     bitcastSrc(MI, CastTy, 2);
3579     bitcastSrc(MI, CastTy, 3);
3580     bitcastDst(MI, CastTy, 0);
3581     Observer.changedInstr(MI);
3582     return Legalized;
3583   }
3584   case TargetOpcode::G_AND:
3585   case TargetOpcode::G_OR:
3586   case TargetOpcode::G_XOR: {
3587     Observer.changingInstr(MI);
3588     bitcastSrc(MI, CastTy, 1);
3589     bitcastSrc(MI, CastTy, 2);
3590     bitcastDst(MI, CastTy, 0);
3591     Observer.changedInstr(MI);
3592     return Legalized;
3593   }
3594   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
3595     return bitcastExtractVectorElt(MI, TypeIdx, CastTy);
3596   case TargetOpcode::G_INSERT_VECTOR_ELT:
3597     return bitcastInsertVectorElt(MI, TypeIdx, CastTy);
3598   default:
3599     return UnableToLegalize;
3600   }
3601 }
3602 
3603 // Legalize an instruction by changing the opcode in place.
3604 void LegalizerHelper::changeOpcode(MachineInstr &MI, unsigned NewOpcode) {
3605     Observer.changingInstr(MI);
3606     MI.setDesc(MIRBuilder.getTII().get(NewOpcode));
3607     Observer.changedInstr(MI);
3608 }
3609 
3610 LegalizerHelper::LegalizeResult
3611 LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT LowerHintTy) {
3612   using namespace TargetOpcode;
3613 
3614   switch(MI.getOpcode()) {
3615   default:
3616     return UnableToLegalize;
3617   case TargetOpcode::G_FCONSTANT:
3618     return lowerFConstant(MI);
3619   case TargetOpcode::G_BITCAST:
3620     return lowerBitcast(MI);
3621   case TargetOpcode::G_SREM:
3622   case TargetOpcode::G_UREM: {
3623     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3624     auto Quot =
3625         MIRBuilder.buildInstr(MI.getOpcode() == G_SREM ? G_SDIV : G_UDIV, {Ty},
3626                               {MI.getOperand(1), MI.getOperand(2)});
3627 
3628     auto Prod = MIRBuilder.buildMul(Ty, Quot, MI.getOperand(2));
3629     MIRBuilder.buildSub(MI.getOperand(0), MI.getOperand(1), Prod);
3630     MI.eraseFromParent();
3631     return Legalized;
3632   }
3633   case TargetOpcode::G_SADDO:
3634   case TargetOpcode::G_SSUBO:
3635     return lowerSADDO_SSUBO(MI);
3636   case TargetOpcode::G_UMULH:
3637   case TargetOpcode::G_SMULH:
3638     return lowerSMULH_UMULH(MI);
3639   case TargetOpcode::G_SMULO:
3640   case TargetOpcode::G_UMULO: {
3641     // Generate G_UMULH/G_SMULH to check for overflow and a normal G_MUL for the
3642     // result.
3643     auto [Res, Overflow, LHS, RHS] = MI.getFirst4Regs();
3644     LLT Ty = MRI.getType(Res);
3645 
3646     unsigned Opcode = MI.getOpcode() == TargetOpcode::G_SMULO
3647                           ? TargetOpcode::G_SMULH
3648                           : TargetOpcode::G_UMULH;
3649 
3650     Observer.changingInstr(MI);
3651     const auto &TII = MIRBuilder.getTII();
3652     MI.setDesc(TII.get(TargetOpcode::G_MUL));
3653     MI.removeOperand(1);
3654     Observer.changedInstr(MI);
3655 
3656     auto HiPart = MIRBuilder.buildInstr(Opcode, {Ty}, {LHS, RHS});
3657     auto Zero = MIRBuilder.buildConstant(Ty, 0);
3658 
3659     // Move insert point forward so we can use the Res register if needed.
3660     MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
3661 
3662     // For *signed* multiply, overflow is detected by checking:
3663     // (hi != (lo >> bitwidth-1))
3664     if (Opcode == TargetOpcode::G_SMULH) {
3665       auto ShiftAmt = MIRBuilder.buildConstant(Ty, Ty.getSizeInBits() - 1);
3666       auto Shifted = MIRBuilder.buildAShr(Ty, Res, ShiftAmt);
3667       MIRBuilder.buildICmp(CmpInst::ICMP_NE, Overflow, HiPart, Shifted);
3668     } else {
3669       MIRBuilder.buildICmp(CmpInst::ICMP_NE, Overflow, HiPart, Zero);
3670     }
3671     return Legalized;
3672   }
3673   case TargetOpcode::G_FNEG: {
3674     auto [Res, SubByReg] = MI.getFirst2Regs();
3675     LLT Ty = MRI.getType(Res);
3676 
3677     // TODO: Handle vector types once we are able to
3678     // represent them.
3679     if (Ty.isVector())
3680       return UnableToLegalize;
3681     auto SignMask =
3682         MIRBuilder.buildConstant(Ty, APInt::getSignMask(Ty.getSizeInBits()));
3683     MIRBuilder.buildXor(Res, SubByReg, SignMask);
3684     MI.eraseFromParent();
3685     return Legalized;
3686   }
3687   case TargetOpcode::G_FSUB:
3688   case TargetOpcode::G_STRICT_FSUB: {
3689     auto [Res, LHS, RHS] = MI.getFirst3Regs();
3690     LLT Ty = MRI.getType(Res);
3691 
3692     // Lower (G_FSUB LHS, RHS) to (G_FADD LHS, (G_FNEG RHS)).
3693     auto Neg = MIRBuilder.buildFNeg(Ty, RHS);
3694 
3695     if (MI.getOpcode() == TargetOpcode::G_STRICT_FSUB)
3696       MIRBuilder.buildStrictFAdd(Res, LHS, Neg, MI.getFlags());
3697     else
3698       MIRBuilder.buildFAdd(Res, LHS, Neg, MI.getFlags());
3699 
3700     MI.eraseFromParent();
3701     return Legalized;
3702   }
3703   case TargetOpcode::G_FMAD:
3704     return lowerFMad(MI);
3705   case TargetOpcode::G_FFLOOR:
3706     return lowerFFloor(MI);
3707   case TargetOpcode::G_INTRINSIC_ROUND:
3708     return lowerIntrinsicRound(MI);
3709   case TargetOpcode::G_FRINT: {
3710     // Since round even is the assumed rounding mode for unconstrained FP
3711     // operations, rint and roundeven are the same operation.
3712     changeOpcode(MI, TargetOpcode::G_INTRINSIC_ROUNDEVEN);
3713     return Legalized;
3714   }
3715   case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: {
3716     auto [OldValRes, SuccessRes, Addr, CmpVal, NewVal] = MI.getFirst5Regs();
3717     MIRBuilder.buildAtomicCmpXchg(OldValRes, Addr, CmpVal, NewVal,
3718                                   **MI.memoperands_begin());
3719     MIRBuilder.buildICmp(CmpInst::ICMP_EQ, SuccessRes, OldValRes, CmpVal);
3720     MI.eraseFromParent();
3721     return Legalized;
3722   }
3723   case TargetOpcode::G_LOAD:
3724   case TargetOpcode::G_SEXTLOAD:
3725   case TargetOpcode::G_ZEXTLOAD:
3726     return lowerLoad(cast<GAnyLoad>(MI));
3727   case TargetOpcode::G_STORE:
3728     return lowerStore(cast<GStore>(MI));
3729   case TargetOpcode::G_CTLZ_ZERO_UNDEF:
3730   case TargetOpcode::G_CTTZ_ZERO_UNDEF:
3731   case TargetOpcode::G_CTLZ:
3732   case TargetOpcode::G_CTTZ:
3733   case TargetOpcode::G_CTPOP:
3734     return lowerBitCount(MI);
3735   case G_UADDO: {
3736     auto [Res, CarryOut, LHS, RHS] = MI.getFirst4Regs();
3737 
3738     MIRBuilder.buildAdd(Res, LHS, RHS);
3739     MIRBuilder.buildICmp(CmpInst::ICMP_ULT, CarryOut, Res, RHS);
3740 
3741     MI.eraseFromParent();
3742     return Legalized;
3743   }
3744   case G_UADDE: {
3745     auto [Res, CarryOut, LHS, RHS, CarryIn] = MI.getFirst5Regs();
3746     const LLT CondTy = MRI.getType(CarryOut);
3747     const LLT Ty = MRI.getType(Res);
3748 
3749     // Initial add of the two operands.
3750     auto TmpRes = MIRBuilder.buildAdd(Ty, LHS, RHS);
3751 
3752     // Initial check for carry.
3753     auto Carry = MIRBuilder.buildICmp(CmpInst::ICMP_ULT, CondTy, TmpRes, LHS);
3754 
3755     // Add the sum and the carry.
3756     auto ZExtCarryIn = MIRBuilder.buildZExt(Ty, CarryIn);
3757     MIRBuilder.buildAdd(Res, TmpRes, ZExtCarryIn);
3758 
3759     // Second check for carry. We can only carry if the initial sum is all 1s
3760     // and the carry is set, resulting in a new sum of 0.
3761     auto Zero = MIRBuilder.buildConstant(Ty, 0);
3762     auto ResEqZero = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, CondTy, Res, Zero);
3763     auto Carry2 = MIRBuilder.buildAnd(CondTy, ResEqZero, CarryIn);
3764     MIRBuilder.buildOr(CarryOut, Carry, Carry2);
3765 
3766     MI.eraseFromParent();
3767     return Legalized;
3768   }
3769   case G_USUBO: {
3770     auto [Res, BorrowOut, LHS, RHS] = MI.getFirst4Regs();
3771 
3772     MIRBuilder.buildSub(Res, LHS, RHS);
3773     MIRBuilder.buildICmp(CmpInst::ICMP_ULT, BorrowOut, LHS, RHS);
3774 
3775     MI.eraseFromParent();
3776     return Legalized;
3777   }
3778   case G_USUBE: {
3779     auto [Res, BorrowOut, LHS, RHS, BorrowIn] = MI.getFirst5Regs();
3780     const LLT CondTy = MRI.getType(BorrowOut);
3781     const LLT Ty = MRI.getType(Res);
3782 
3783     // Initial subtract of the two operands.
3784     auto TmpRes = MIRBuilder.buildSub(Ty, LHS, RHS);
3785 
3786     // Initial check for borrow.
3787     auto Borrow = MIRBuilder.buildICmp(CmpInst::ICMP_UGT, CondTy, TmpRes, LHS);
3788 
3789     // Subtract the borrow from the first subtract.
3790     auto ZExtBorrowIn = MIRBuilder.buildZExt(Ty, BorrowIn);
3791     MIRBuilder.buildSub(Res, TmpRes, ZExtBorrowIn);
3792 
3793     // Second check for borrow. We can only borrow if the initial difference is
3794     // 0 and the borrow is set, resulting in a new difference of all 1s.
3795     auto Zero = MIRBuilder.buildConstant(Ty, 0);
3796     auto TmpResEqZero =
3797         MIRBuilder.buildICmp(CmpInst::ICMP_EQ, CondTy, TmpRes, Zero);
3798     auto Borrow2 = MIRBuilder.buildAnd(CondTy, TmpResEqZero, BorrowIn);
3799     MIRBuilder.buildOr(BorrowOut, Borrow, Borrow2);
3800 
3801     MI.eraseFromParent();
3802     return Legalized;
3803   }
3804   case G_UITOFP:
3805     return lowerUITOFP(MI);
3806   case G_SITOFP:
3807     return lowerSITOFP(MI);
3808   case G_FPTOUI:
3809     return lowerFPTOUI(MI);
3810   case G_FPTOSI:
3811     return lowerFPTOSI(MI);
3812   case G_FPTRUNC:
3813     return lowerFPTRUNC(MI);
3814   case G_FPOWI:
3815     return lowerFPOWI(MI);
3816   case G_SMIN:
3817   case G_SMAX:
3818   case G_UMIN:
3819   case G_UMAX:
3820     return lowerMinMax(MI);
3821   case G_FCOPYSIGN:
3822     return lowerFCopySign(MI);
3823   case G_FMINNUM:
3824   case G_FMAXNUM:
3825     return lowerFMinNumMaxNum(MI);
3826   case G_MERGE_VALUES:
3827     return lowerMergeValues(MI);
3828   case G_UNMERGE_VALUES:
3829     return lowerUnmergeValues(MI);
3830   case TargetOpcode::G_SEXT_INREG: {
3831     assert(MI.getOperand(2).isImm() && "Expected immediate");
3832     int64_t SizeInBits = MI.getOperand(2).getImm();
3833 
3834     auto [DstReg, SrcReg] = MI.getFirst2Regs();
3835     LLT DstTy = MRI.getType(DstReg);
3836     Register TmpRes = MRI.createGenericVirtualRegister(DstTy);
3837 
3838     auto MIBSz = MIRBuilder.buildConstant(DstTy, DstTy.getScalarSizeInBits() - SizeInBits);
3839     MIRBuilder.buildShl(TmpRes, SrcReg, MIBSz->getOperand(0));
3840     MIRBuilder.buildAShr(DstReg, TmpRes, MIBSz->getOperand(0));
3841     MI.eraseFromParent();
3842     return Legalized;
3843   }
3844   case G_EXTRACT_VECTOR_ELT:
3845   case G_INSERT_VECTOR_ELT:
3846     return lowerExtractInsertVectorElt(MI);
3847   case G_SHUFFLE_VECTOR:
3848     return lowerShuffleVector(MI);
3849   case G_DYN_STACKALLOC:
3850     return lowerDynStackAlloc(MI);
3851   case G_STACKSAVE:
3852     return lowerStackSave(MI);
3853   case G_STACKRESTORE:
3854     return lowerStackRestore(MI);
3855   case G_EXTRACT:
3856     return lowerExtract(MI);
3857   case G_INSERT:
3858     return lowerInsert(MI);
3859   case G_BSWAP:
3860     return lowerBswap(MI);
3861   case G_BITREVERSE:
3862     return lowerBitreverse(MI);
3863   case G_READ_REGISTER:
3864   case G_WRITE_REGISTER:
3865     return lowerReadWriteRegister(MI);
3866   case G_UADDSAT:
3867   case G_USUBSAT: {
3868     // Try to make a reasonable guess about which lowering strategy to use. The
3869     // target can override this with custom lowering and calling the
3870     // implementation functions.
3871     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3872     if (LI.isLegalOrCustom({G_UMIN, Ty}))
3873       return lowerAddSubSatToMinMax(MI);
3874     return lowerAddSubSatToAddoSubo(MI);
3875   }
3876   case G_SADDSAT:
3877   case G_SSUBSAT: {
3878     LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3879 
3880     // FIXME: It would probably make more sense to see if G_SADDO is preferred,
3881     // since it's a shorter expansion. However, we would need to figure out the
3882     // preferred boolean type for the carry out for the query.
3883     if (LI.isLegalOrCustom({G_SMIN, Ty}) && LI.isLegalOrCustom({G_SMAX, Ty}))
3884       return lowerAddSubSatToMinMax(MI);
3885     return lowerAddSubSatToAddoSubo(MI);
3886   }
3887   case G_SSHLSAT:
3888   case G_USHLSAT:
3889     return lowerShlSat(MI);
3890   case G_ABS:
3891     return lowerAbsToAddXor(MI);
3892   case G_SELECT:
3893     return lowerSelect(MI);
3894   case G_IS_FPCLASS:
3895     return lowerISFPCLASS(MI);
3896   case G_SDIVREM:
3897   case G_UDIVREM:
3898     return lowerDIVREM(MI);
3899   case G_FSHL:
3900   case G_FSHR:
3901     return lowerFunnelShift(MI);
3902   case G_ROTL:
3903   case G_ROTR:
3904     return lowerRotate(MI);
3905   case G_MEMSET:
3906   case G_MEMCPY:
3907   case G_MEMMOVE:
3908     return lowerMemCpyFamily(MI);
3909   case G_MEMCPY_INLINE:
3910     return lowerMemcpyInline(MI);
3911   case G_ZEXT:
3912   case G_SEXT:
3913   case G_ANYEXT:
3914     return lowerEXT(MI);
3915   case G_TRUNC:
3916     return lowerTRUNC(MI);
3917   GISEL_VECREDUCE_CASES_NONSEQ
3918     return lowerVectorReduction(MI);
3919   case G_VAARG:
3920     return lowerVAArg(MI);
3921   }
3922 }
3923 
3924 Align LegalizerHelper::getStackTemporaryAlignment(LLT Ty,
3925                                                   Align MinAlign) const {
3926   // FIXME: We're missing a way to go back from LLT to llvm::Type to query the
3927   // datalayout for the preferred alignment. Also there should be a target hook
3928   // for this to allow targets to reduce the alignment and ignore the
3929   // datalayout. e.g. AMDGPU should always use a 4-byte alignment, regardless of
3930   // the type.
3931   return std::max(Align(PowerOf2Ceil(Ty.getSizeInBytes())), MinAlign);
3932 }
3933 
3934 MachineInstrBuilder
3935 LegalizerHelper::createStackTemporary(TypeSize Bytes, Align Alignment,
3936                                       MachinePointerInfo &PtrInfo) {
3937   MachineFunction &MF = MIRBuilder.getMF();
3938   const DataLayout &DL = MIRBuilder.getDataLayout();
3939   int FrameIdx = MF.getFrameInfo().CreateStackObject(Bytes, Alignment, false);
3940 
3941   unsigned AddrSpace = DL.getAllocaAddrSpace();
3942   LLT FramePtrTy = LLT::pointer(AddrSpace, DL.getPointerSizeInBits(AddrSpace));
3943 
3944   PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIdx);
3945   return MIRBuilder.buildFrameIndex(FramePtrTy, FrameIdx);
3946 }
3947 
3948 static Register clampDynamicVectorIndex(MachineIRBuilder &B, Register IdxReg,
3949                                         LLT VecTy) {
3950   int64_t IdxVal;
3951   if (mi_match(IdxReg, *B.getMRI(), m_ICst(IdxVal)))
3952     return IdxReg;
3953 
3954   LLT IdxTy = B.getMRI()->getType(IdxReg);
3955   unsigned NElts = VecTy.getNumElements();
3956   if (isPowerOf2_32(NElts)) {
3957     APInt Imm = APInt::getLowBitsSet(IdxTy.getSizeInBits(), Log2_32(NElts));
3958     return B.buildAnd(IdxTy, IdxReg, B.buildConstant(IdxTy, Imm)).getReg(0);
3959   }
3960 
3961   return B.buildUMin(IdxTy, IdxReg, B.buildConstant(IdxTy, NElts - 1))
3962       .getReg(0);
3963 }
3964 
3965 Register LegalizerHelper::getVectorElementPointer(Register VecPtr, LLT VecTy,
3966                                                   Register Index) {
3967   LLT EltTy = VecTy.getElementType();
3968 
3969   // Calculate the element offset and add it to the pointer.
3970   unsigned EltSize = EltTy.getSizeInBits() / 8; // FIXME: should be ABI size.
3971   assert(EltSize * 8 == EltTy.getSizeInBits() &&
3972          "Converting bits to bytes lost precision");
3973 
3974   Index = clampDynamicVectorIndex(MIRBuilder, Index, VecTy);
3975 
3976   LLT IdxTy = MRI.getType(Index);
3977   auto Mul = MIRBuilder.buildMul(IdxTy, Index,
3978                                  MIRBuilder.buildConstant(IdxTy, EltSize));
3979 
3980   LLT PtrTy = MRI.getType(VecPtr);
3981   return MIRBuilder.buildPtrAdd(PtrTy, VecPtr, Mul).getReg(0);
3982 }
3983 
3984 #ifndef NDEBUG
3985 /// Check that all vector operands have same number of elements. Other operands
3986 /// should be listed in NonVecOp.
3987 static bool hasSameNumEltsOnAllVectorOperands(
3988     GenericMachineInstr &MI, MachineRegisterInfo &MRI,
3989     std::initializer_list<unsigned> NonVecOpIndices) {
3990   if (MI.getNumMemOperands() != 0)
3991     return false;
3992 
3993   LLT VecTy = MRI.getType(MI.getReg(0));
3994   if (!VecTy.isVector())
3995     return false;
3996   unsigned NumElts = VecTy.getNumElements();
3997 
3998   for (unsigned OpIdx = 1; OpIdx < MI.getNumOperands(); ++OpIdx) {
3999     MachineOperand &Op = MI.getOperand(OpIdx);
4000     if (!Op.isReg()) {
4001       if (!is_contained(NonVecOpIndices, OpIdx))
4002         return false;
4003       continue;
4004     }
4005 
4006     LLT Ty = MRI.getType(Op.getReg());
4007     if (!Ty.isVector()) {
4008       if (!is_contained(NonVecOpIndices, OpIdx))
4009         return false;
4010       continue;
4011     }
4012 
4013     if (Ty.getNumElements() != NumElts)
4014       return false;
4015   }
4016 
4017   return true;
4018 }
4019 #endif
4020 
4021 /// Fill \p DstOps with DstOps that have same number of elements combined as
4022 /// the Ty. These DstOps have either scalar type when \p NumElts = 1 or are
4023 /// vectors with \p NumElts elements. When Ty.getNumElements() is not multiple
4024 /// of \p NumElts last DstOp (leftover) has fewer then \p NumElts elements.
4025 static void makeDstOps(SmallVectorImpl<DstOp> &DstOps, LLT Ty,
4026                        unsigned NumElts) {
4027   LLT LeftoverTy;
4028   assert(Ty.isVector() && "Expected vector type");
4029   LLT EltTy = Ty.getElementType();
4030   LLT NarrowTy = (NumElts == 1) ? EltTy : LLT::fixed_vector(NumElts, EltTy);
4031   int NumParts, NumLeftover;
4032   std::tie(NumParts, NumLeftover) =
4033       getNarrowTypeBreakDown(Ty, NarrowTy, LeftoverTy);
4034 
4035   assert(NumParts > 0 && "Error in getNarrowTypeBreakDown");
4036   for (int i = 0; i < NumParts; ++i) {
4037     DstOps.push_back(NarrowTy);
4038   }
4039 
4040   if (LeftoverTy.isValid()) {
4041     assert(NumLeftover == 1 && "expected exactly one leftover");
4042     DstOps.push_back(LeftoverTy);
4043   }
4044 }
4045 
4046 /// Operand \p Op is used on \p N sub-instructions. Fill \p Ops with \p N SrcOps
4047 /// made from \p Op depending on operand type.
4048 static void broadcastSrcOp(SmallVectorImpl<SrcOp> &Ops, unsigned N,
4049                            MachineOperand &Op) {
4050   for (unsigned i = 0; i < N; ++i) {
4051     if (Op.isReg())
4052       Ops.push_back(Op.getReg());
4053     else if (Op.isImm())
4054       Ops.push_back(Op.getImm());
4055     else if (Op.isPredicate())
4056       Ops.push_back(static_cast<CmpInst::Predicate>(Op.getPredicate()));
4057     else
4058       llvm_unreachable("Unsupported type");
4059   }
4060 }
4061 
4062 // Handle splitting vector operations which need to have the same number of
4063 // elements in each type index, but each type index may have a different element
4064 // type.
4065 //
4066 // e.g.  <4 x s64> = G_SHL <4 x s64>, <4 x s32> ->
4067 //       <2 x s64> = G_SHL <2 x s64>, <2 x s32>
4068 //       <2 x s64> = G_SHL <2 x s64>, <2 x s32>
4069 //
4070 // Also handles some irregular breakdown cases, e.g.
4071 // e.g.  <3 x s64> = G_SHL <3 x s64>, <3 x s32> ->
4072 //       <2 x s64> = G_SHL <2 x s64>, <2 x s32>
4073 //             s64 = G_SHL s64, s32
4074 LegalizerHelper::LegalizeResult
4075 LegalizerHelper::fewerElementsVectorMultiEltType(
4076     GenericMachineInstr &MI, unsigned NumElts,
4077     std::initializer_list<unsigned> NonVecOpIndices) {
4078   assert(hasSameNumEltsOnAllVectorOperands(MI, MRI, NonVecOpIndices) &&
4079          "Non-compatible opcode or not specified non-vector operands");
4080   unsigned OrigNumElts = MRI.getType(MI.getReg(0)).getNumElements();
4081 
4082   unsigned NumInputs = MI.getNumOperands() - MI.getNumDefs();
4083   unsigned NumDefs = MI.getNumDefs();
4084 
4085   // Create DstOps (sub-vectors with NumElts elts + Leftover) for each output.
4086   // Build instructions with DstOps to use instruction found by CSE directly.
4087   // CSE copies found instruction into given vreg when building with vreg dest.
4088   SmallVector<SmallVector<DstOp, 8>, 2> OutputOpsPieces(NumDefs);
4089   // Output registers will be taken from created instructions.
4090   SmallVector<SmallVector<Register, 8>, 2> OutputRegs(NumDefs);
4091   for (unsigned i = 0; i < NumDefs; ++i) {
4092     makeDstOps(OutputOpsPieces[i], MRI.getType(MI.getReg(i)), NumElts);
4093   }
4094 
4095   // Split vector input operands into sub-vectors with NumElts elts + Leftover.
4096   // Operands listed in NonVecOpIndices will be used as is without splitting;
4097   // examples: compare predicate in icmp and fcmp (op 1), vector select with i1
4098   // scalar condition (op 1), immediate in sext_inreg (op 2).
4099   SmallVector<SmallVector<SrcOp, 8>, 3> InputOpsPieces(NumInputs);
4100   for (unsigned UseIdx = NumDefs, UseNo = 0; UseIdx < MI.getNumOperands();
4101        ++UseIdx, ++UseNo) {
4102     if (is_contained(NonVecOpIndices, UseIdx)) {
4103       broadcastSrcOp(InputOpsPieces[UseNo], OutputOpsPieces[0].size(),
4104                      MI.getOperand(UseIdx));
4105     } else {
4106       SmallVector<Register, 8> SplitPieces;
4107       extractVectorParts(MI.getReg(UseIdx), NumElts, SplitPieces, MIRBuilder,
4108                          MRI);
4109       for (auto Reg : SplitPieces)
4110         InputOpsPieces[UseNo].push_back(Reg);
4111     }
4112   }
4113 
4114   unsigned NumLeftovers = OrigNumElts % NumElts ? 1 : 0;
4115 
4116   // Take i-th piece of each input operand split and build sub-vector/scalar
4117   // instruction. Set i-th DstOp(s) from OutputOpsPieces as destination(s).
4118   for (unsigned i = 0; i < OrigNumElts / NumElts + NumLeftovers; ++i) {
4119     SmallVector<DstOp, 2> Defs;
4120     for (unsigned DstNo = 0; DstNo < NumDefs; ++DstNo)
4121       Defs.push_back(OutputOpsPieces[DstNo][i]);
4122 
4123     SmallVector<SrcOp, 3> Uses;
4124     for (unsigned InputNo = 0; InputNo < NumInputs; ++InputNo)
4125       Uses.push_back(InputOpsPieces[InputNo][i]);
4126 
4127     auto I = MIRBuilder.buildInstr(MI.getOpcode(), Defs, Uses, MI.getFlags());
4128     for (unsigned DstNo = 0; DstNo < NumDefs; ++DstNo)
4129       OutputRegs[DstNo].push_back(I.getReg(DstNo));
4130   }
4131 
4132   // Merge small outputs into MI's output for each def operand.
4133   if (NumLeftovers) {
4134     for (unsigned i = 0; i < NumDefs; ++i)
4135       mergeMixedSubvectors(MI.getReg(i), OutputRegs[i]);
4136   } else {
4137     for (unsigned i = 0; i < NumDefs; ++i)
4138       MIRBuilder.buildMergeLikeInstr(MI.getReg(i), OutputRegs[i]);
4139   }
4140 
4141   MI.eraseFromParent();
4142   return Legalized;
4143 }
4144 
4145 LegalizerHelper::LegalizeResult
4146 LegalizerHelper::fewerElementsVectorPhi(GenericMachineInstr &MI,
4147                                         unsigned NumElts) {
4148   unsigned OrigNumElts = MRI.getType(MI.getReg(0)).getNumElements();
4149 
4150   unsigned NumInputs = MI.getNumOperands() - MI.getNumDefs();
4151   unsigned NumDefs = MI.getNumDefs();
4152 
4153   SmallVector<DstOp, 8> OutputOpsPieces;
4154   SmallVector<Register, 8> OutputRegs;
4155   makeDstOps(OutputOpsPieces, MRI.getType(MI.getReg(0)), NumElts);
4156 
4157   // Instructions that perform register split will be inserted in basic block
4158   // where register is defined (basic block is in the next operand).
4159   SmallVector<SmallVector<Register, 8>, 3> InputOpsPieces(NumInputs / 2);
4160   for (unsigned UseIdx = NumDefs, UseNo = 0; UseIdx < MI.getNumOperands();
4161        UseIdx += 2, ++UseNo) {
4162     MachineBasicBlock &OpMBB = *MI.getOperand(UseIdx + 1).getMBB();
4163     MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminatorForward());
4164     extractVectorParts(MI.getReg(UseIdx), NumElts, InputOpsPieces[UseNo],
4165                        MIRBuilder, MRI);
4166   }
4167 
4168   // Build PHIs with fewer elements.
4169   unsigned NumLeftovers = OrigNumElts % NumElts ? 1 : 0;
4170   MIRBuilder.setInsertPt(*MI.getParent(), MI);
4171   for (unsigned i = 0; i < OrigNumElts / NumElts + NumLeftovers; ++i) {
4172     auto Phi = MIRBuilder.buildInstr(TargetOpcode::G_PHI);
4173     Phi.addDef(
4174         MRI.createGenericVirtualRegister(OutputOpsPieces[i].getLLTTy(MRI)));
4175     OutputRegs.push_back(Phi.getReg(0));
4176 
4177     for (unsigned j = 0; j < NumInputs / 2; ++j) {
4178       Phi.addUse(InputOpsPieces[j][i]);
4179       Phi.add(MI.getOperand(1 + j * 2 + 1));
4180     }
4181   }
4182 
4183   // Merge small outputs into MI's def.
4184   if (NumLeftovers) {
4185     mergeMixedSubvectors(MI.getReg(0), OutputRegs);
4186   } else {
4187     MIRBuilder.buildMergeLikeInstr(MI.getReg(0), OutputRegs);
4188   }
4189 
4190   MI.eraseFromParent();
4191   return Legalized;
4192 }
4193 
4194 LegalizerHelper::LegalizeResult
4195 LegalizerHelper::fewerElementsVectorUnmergeValues(MachineInstr &MI,
4196                                                   unsigned TypeIdx,
4197                                                   LLT NarrowTy) {
4198   const int NumDst = MI.getNumOperands() - 1;
4199   const Register SrcReg = MI.getOperand(NumDst).getReg();
4200   LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
4201   LLT SrcTy = MRI.getType(SrcReg);
4202 
4203   if (TypeIdx != 1 || NarrowTy == DstTy)
4204     return UnableToLegalize;
4205 
4206   // Requires compatible types. Otherwise SrcReg should have been defined by
4207   // merge-like instruction that would get artifact combined. Most likely
4208   // instruction that defines SrcReg has to perform more/fewer elements
4209   // legalization compatible with NarrowTy.
4210   assert(SrcTy.isVector() && NarrowTy.isVector() && "Expected vector types");
4211   assert((SrcTy.getScalarType() == NarrowTy.getScalarType()) && "bad type");
4212 
4213   if ((SrcTy.getSizeInBits() % NarrowTy.getSizeInBits() != 0) ||
4214       (NarrowTy.getSizeInBits() % DstTy.getSizeInBits() != 0))
4215     return UnableToLegalize;
4216 
4217   // This is most likely DstTy (smaller then register size) packed in SrcTy
4218   // (larger then register size) and since unmerge was not combined it will be
4219   // lowered to bit sequence extracts from register. Unpack SrcTy to NarrowTy
4220   // (register size) pieces first. Then unpack each of NarrowTy pieces to DstTy.
4221 
4222   // %1:_(DstTy), %2, %3, %4 = G_UNMERGE_VALUES %0:_(SrcTy)
4223   //
4224   // %5:_(NarrowTy), %6 = G_UNMERGE_VALUES %0:_(SrcTy) - reg sequence
4225   // %1:_(DstTy), %2 = G_UNMERGE_VALUES %5:_(NarrowTy) - sequence of bits in reg
4226   // %3:_(DstTy), %4 = G_UNMERGE_VALUES %6:_(NarrowTy)
4227   auto Unmerge = MIRBuilder.buildUnmerge(NarrowTy, SrcReg);
4228   const int NumUnmerge = Unmerge->getNumOperands() - 1;
4229   const int PartsPerUnmerge = NumDst / NumUnmerge;
4230 
4231   for (int I = 0; I != NumUnmerge; ++I) {
4232     auto MIB = MIRBuilder.buildInstr(TargetOpcode::G_UNMERGE_VALUES);
4233 
4234     for (int J = 0; J != PartsPerUnmerge; ++J)
4235       MIB.addDef(MI.getOperand(I * PartsPerUnmerge + J).getReg());
4236     MIB.addUse(Unmerge.getReg(I));
4237   }
4238 
4239   MI.eraseFromParent();
4240   return Legalized;
4241 }
4242 
4243 LegalizerHelper::LegalizeResult
4244 LegalizerHelper::fewerElementsVectorMerge(MachineInstr &MI, unsigned TypeIdx,
4245                                           LLT NarrowTy) {
4246   auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
4247   // Requires compatible types. Otherwise user of DstReg did not perform unmerge
4248   // that should have been artifact combined. Most likely instruction that uses
4249   // DstReg has to do more/fewer elements legalization compatible with NarrowTy.
4250   assert(DstTy.isVector() && NarrowTy.isVector() && "Expected vector types");
4251   assert((DstTy.getScalarType() == NarrowTy.getScalarType()) && "bad type");
4252   if (NarrowTy == SrcTy)
4253     return UnableToLegalize;
4254 
4255   // This attempts to lower part of LCMTy merge/unmerge sequence. Intended use
4256   // is for old mir tests. Since the changes to more/fewer elements it should no
4257   // longer be possible to generate MIR like this when starting from llvm-ir
4258   // because LCMTy approach was replaced with merge/unmerge to vector elements.
4259   if (TypeIdx == 1) {
4260     assert(SrcTy.isVector() && "Expected vector types");
4261     assert((SrcTy.getScalarType() == NarrowTy.getScalarType()) && "bad type");
4262     if ((DstTy.getSizeInBits() % NarrowTy.getSizeInBits() != 0) ||
4263         (NarrowTy.getNumElements() >= SrcTy.getNumElements()))
4264       return UnableToLegalize;
4265     // %2:_(DstTy) = G_CONCAT_VECTORS %0:_(SrcTy), %1:_(SrcTy)
4266     //
4267     // %3:_(EltTy), %4, %5 = G_UNMERGE_VALUES %0:_(SrcTy)
4268     // %6:_(EltTy), %7, %8 = G_UNMERGE_VALUES %1:_(SrcTy)
4269     // %9:_(NarrowTy) = G_BUILD_VECTOR %3:_(EltTy), %4
4270     // %10:_(NarrowTy) = G_BUILD_VECTOR %5:_(EltTy), %6
4271     // %11:_(NarrowTy) = G_BUILD_VECTOR %7:_(EltTy), %8
4272     // %2:_(DstTy) = G_CONCAT_VECTORS %9:_(NarrowTy), %10, %11
4273 
4274     SmallVector<Register, 8> Elts;
4275     LLT EltTy = MRI.getType(MI.getOperand(1).getReg()).getScalarType();
4276     for (unsigned i = 1; i < MI.getNumOperands(); ++i) {
4277       auto Unmerge = MIRBuilder.buildUnmerge(EltTy, MI.getOperand(i).getReg());
4278       for (unsigned j = 0; j < Unmerge->getNumDefs(); ++j)
4279         Elts.push_back(Unmerge.getReg(j));
4280     }
4281 
4282     SmallVector<Register, 8> NarrowTyElts;
4283     unsigned NumNarrowTyElts = NarrowTy.getNumElements();
4284     unsigned NumNarrowTyPieces = DstTy.getNumElements() / NumNarrowTyElts;
4285     for (unsigned i = 0, Offset = 0; i < NumNarrowTyPieces;
4286          ++i, Offset += NumNarrowTyElts) {
4287       ArrayRef<Register> Pieces(&Elts[Offset], NumNarrowTyElts);
4288       NarrowTyElts.push_back(
4289           MIRBuilder.buildMergeLikeInstr(NarrowTy, Pieces).getReg(0));
4290     }
4291 
4292     MIRBuilder.buildMergeLikeInstr(DstReg, NarrowTyElts);
4293     MI.eraseFromParent();
4294     return Legalized;
4295   }
4296 
4297   assert(TypeIdx == 0 && "Bad type index");
4298   if ((NarrowTy.getSizeInBits() % SrcTy.getSizeInBits() != 0) ||
4299       (DstTy.getSizeInBits() % NarrowTy.getSizeInBits() != 0))
4300     return UnableToLegalize;
4301 
4302   // This is most likely SrcTy (smaller then register size) packed in DstTy
4303   // (larger then register size) and since merge was not combined it will be
4304   // lowered to bit sequence packing into register. Merge SrcTy to NarrowTy
4305   // (register size) pieces first. Then merge each of NarrowTy pieces to DstTy.
4306 
4307   // %0:_(DstTy) = G_MERGE_VALUES %1:_(SrcTy), %2, %3, %4
4308   //
4309   // %5:_(NarrowTy) = G_MERGE_VALUES %1:_(SrcTy), %2 - sequence of bits in reg
4310   // %6:_(NarrowTy) = G_MERGE_VALUES %3:_(SrcTy), %4
4311   // %0:_(DstTy)  = G_MERGE_VALUES %5:_(NarrowTy), %6 - reg sequence
4312   SmallVector<Register, 8> NarrowTyElts;
4313   unsigned NumParts = DstTy.getNumElements() / NarrowTy.getNumElements();
4314   unsigned NumSrcElts = SrcTy.isVector() ? SrcTy.getNumElements() : 1;
4315   unsigned NumElts = NarrowTy.getNumElements() / NumSrcElts;
4316   for (unsigned i = 0; i < NumParts; ++i) {
4317     SmallVector<Register, 8> Sources;
4318     for (unsigned j = 0; j < NumElts; ++j)
4319       Sources.push_back(MI.getOperand(1 + i * NumElts + j).getReg());
4320     NarrowTyElts.push_back(
4321         MIRBuilder.buildMergeLikeInstr(NarrowTy, Sources).getReg(0));
4322   }
4323 
4324   MIRBuilder.buildMergeLikeInstr(DstReg, NarrowTyElts);
4325   MI.eraseFromParent();
4326   return Legalized;
4327 }
4328 
4329 LegalizerHelper::LegalizeResult
4330 LegalizerHelper::fewerElementsVectorExtractInsertVectorElt(MachineInstr &MI,
4331                                                            unsigned TypeIdx,
4332                                                            LLT NarrowVecTy) {
4333   auto [DstReg, SrcVec] = MI.getFirst2Regs();
4334   Register InsertVal;
4335   bool IsInsert = MI.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT;
4336 
4337   assert((IsInsert ? TypeIdx == 0 : TypeIdx == 1) && "not a vector type index");
4338   if (IsInsert)
4339     InsertVal = MI.getOperand(2).getReg();
4340 
4341   Register Idx = MI.getOperand(MI.getNumOperands() - 1).getReg();
4342 
4343   // TODO: Handle total scalarization case.
4344   if (!NarrowVecTy.isVector())
4345     return UnableToLegalize;
4346 
4347   LLT VecTy = MRI.getType(SrcVec);
4348 
4349   // If the index is a constant, we can really break this down as you would
4350   // expect, and index into the target size pieces.
4351   int64_t IdxVal;
4352   auto MaybeCst = getIConstantVRegValWithLookThrough(Idx, MRI);
4353   if (MaybeCst) {
4354     IdxVal = MaybeCst->Value.getSExtValue();
4355     // Avoid out of bounds indexing the pieces.
4356     if (IdxVal >= VecTy.getNumElements()) {
4357       MIRBuilder.buildUndef(DstReg);
4358       MI.eraseFromParent();
4359       return Legalized;
4360     }
4361 
4362     SmallVector<Register, 8> VecParts;
4363     LLT GCDTy = extractGCDType(VecParts, VecTy, NarrowVecTy, SrcVec);
4364 
4365     // Build a sequence of NarrowTy pieces in VecParts for this operand.
4366     LLT LCMTy = buildLCMMergePieces(VecTy, NarrowVecTy, GCDTy, VecParts,
4367                                     TargetOpcode::G_ANYEXT);
4368 
4369     unsigned NewNumElts = NarrowVecTy.getNumElements();
4370 
4371     LLT IdxTy = MRI.getType(Idx);
4372     int64_t PartIdx = IdxVal / NewNumElts;
4373     auto NewIdx =
4374         MIRBuilder.buildConstant(IdxTy, IdxVal - NewNumElts * PartIdx);
4375 
4376     if (IsInsert) {
4377       LLT PartTy = MRI.getType(VecParts[PartIdx]);
4378 
4379       // Use the adjusted index to insert into one of the subvectors.
4380       auto InsertPart = MIRBuilder.buildInsertVectorElement(
4381           PartTy, VecParts[PartIdx], InsertVal, NewIdx);
4382       VecParts[PartIdx] = InsertPart.getReg(0);
4383 
4384       // Recombine the inserted subvector with the others to reform the result
4385       // vector.
4386       buildWidenedRemergeToDst(DstReg, LCMTy, VecParts);
4387     } else {
4388       MIRBuilder.buildExtractVectorElement(DstReg, VecParts[PartIdx], NewIdx);
4389     }
4390 
4391     MI.eraseFromParent();
4392     return Legalized;
4393   }
4394 
4395   // With a variable index, we can't perform the operation in a smaller type, so
4396   // we're forced to expand this.
4397   //
4398   // TODO: We could emit a chain of compare/select to figure out which piece to
4399   // index.
4400   return lowerExtractInsertVectorElt(MI);
4401 }
4402 
4403 LegalizerHelper::LegalizeResult
4404 LegalizerHelper::reduceLoadStoreWidth(GLoadStore &LdStMI, unsigned TypeIdx,
4405                                       LLT NarrowTy) {
4406   // FIXME: Don't know how to handle secondary types yet.
4407   if (TypeIdx != 0)
4408     return UnableToLegalize;
4409 
4410   // This implementation doesn't work for atomics. Give up instead of doing
4411   // something invalid.
4412   if (LdStMI.isAtomic())
4413     return UnableToLegalize;
4414 
4415   bool IsLoad = isa<GLoad>(LdStMI);
4416   Register ValReg = LdStMI.getReg(0);
4417   Register AddrReg = LdStMI.getPointerReg();
4418   LLT ValTy = MRI.getType(ValReg);
4419 
4420   // FIXME: Do we need a distinct NarrowMemory legalize action?
4421   if (ValTy.getSizeInBits() != 8 * LdStMI.getMemSize()) {
4422     LLVM_DEBUG(dbgs() << "Can't narrow extload/truncstore\n");
4423     return UnableToLegalize;
4424   }
4425 
4426   int NumParts = -1;
4427   int NumLeftover = -1;
4428   LLT LeftoverTy;
4429   SmallVector<Register, 8> NarrowRegs, NarrowLeftoverRegs;
4430   if (IsLoad) {
4431     std::tie(NumParts, NumLeftover) = getNarrowTypeBreakDown(ValTy, NarrowTy, LeftoverTy);
4432   } else {
4433     if (extractParts(ValReg, ValTy, NarrowTy, LeftoverTy, NarrowRegs,
4434                      NarrowLeftoverRegs, MIRBuilder, MRI)) {
4435       NumParts = NarrowRegs.size();
4436       NumLeftover = NarrowLeftoverRegs.size();
4437     }
4438   }
4439 
4440   if (NumParts == -1)
4441     return UnableToLegalize;
4442 
4443   LLT PtrTy = MRI.getType(AddrReg);
4444   const LLT OffsetTy = LLT::scalar(PtrTy.getSizeInBits());
4445 
4446   unsigned TotalSize = ValTy.getSizeInBits();
4447 
4448   // Split the load/store into PartTy sized pieces starting at Offset. If this
4449   // is a load, return the new registers in ValRegs. For a store, each elements
4450   // of ValRegs should be PartTy. Returns the next offset that needs to be
4451   // handled.
4452   bool isBigEndian = MIRBuilder.getDataLayout().isBigEndian();
4453   auto MMO = LdStMI.getMMO();
4454   auto splitTypePieces = [=](LLT PartTy, SmallVectorImpl<Register> &ValRegs,
4455                              unsigned NumParts, unsigned Offset) -> unsigned {
4456     MachineFunction &MF = MIRBuilder.getMF();
4457     unsigned PartSize = PartTy.getSizeInBits();
4458     for (unsigned Idx = 0, E = NumParts; Idx != E && Offset < TotalSize;
4459          ++Idx) {
4460       unsigned ByteOffset = Offset / 8;
4461       Register NewAddrReg;
4462 
4463       MIRBuilder.materializePtrAdd(NewAddrReg, AddrReg, OffsetTy, ByteOffset);
4464 
4465       MachineMemOperand *NewMMO =
4466           MF.getMachineMemOperand(&MMO, ByteOffset, PartTy);
4467 
4468       if (IsLoad) {
4469         Register Dst = MRI.createGenericVirtualRegister(PartTy);
4470         ValRegs.push_back(Dst);
4471         MIRBuilder.buildLoad(Dst, NewAddrReg, *NewMMO);
4472       } else {
4473         MIRBuilder.buildStore(ValRegs[Idx], NewAddrReg, *NewMMO);
4474       }
4475       Offset = isBigEndian ? Offset - PartSize : Offset + PartSize;
4476     }
4477 
4478     return Offset;
4479   };
4480 
4481   unsigned Offset = isBigEndian ? TotalSize - NarrowTy.getSizeInBits() : 0;
4482   unsigned HandledOffset =
4483       splitTypePieces(NarrowTy, NarrowRegs, NumParts, Offset);
4484 
4485   // Handle the rest of the register if this isn't an even type breakdown.
4486   if (LeftoverTy.isValid())
4487     splitTypePieces(LeftoverTy, NarrowLeftoverRegs, NumLeftover, HandledOffset);
4488 
4489   if (IsLoad) {
4490     insertParts(ValReg, ValTy, NarrowTy, NarrowRegs,
4491                 LeftoverTy, NarrowLeftoverRegs);
4492   }
4493 
4494   LdStMI.eraseFromParent();
4495   return Legalized;
4496 }
4497 
4498 LegalizerHelper::LegalizeResult
4499 LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx,
4500                                      LLT NarrowTy) {
4501   using namespace TargetOpcode;
4502   GenericMachineInstr &GMI = cast<GenericMachineInstr>(MI);
4503   unsigned NumElts = NarrowTy.isVector() ? NarrowTy.getNumElements() : 1;
4504 
4505   switch (MI.getOpcode()) {
4506   case G_IMPLICIT_DEF:
4507   case G_TRUNC:
4508   case G_AND:
4509   case G_OR:
4510   case G_XOR:
4511   case G_ADD:
4512   case G_SUB:
4513   case G_MUL:
4514   case G_PTR_ADD:
4515   case G_SMULH:
4516   case G_UMULH:
4517   case G_FADD:
4518   case G_FMUL:
4519   case G_FSUB:
4520   case G_FNEG:
4521   case G_FABS:
4522   case G_FCANONICALIZE:
4523   case G_FDIV:
4524   case G_FREM:
4525   case G_FMA:
4526   case G_FMAD:
4527   case G_FPOW:
4528   case G_FEXP:
4529   case G_FEXP2:
4530   case G_FEXP10:
4531   case G_FLOG:
4532   case G_FLOG2:
4533   case G_FLOG10:
4534   case G_FLDEXP:
4535   case G_FNEARBYINT:
4536   case G_FCEIL:
4537   case G_FFLOOR:
4538   case G_FRINT:
4539   case G_INTRINSIC_ROUND:
4540   case G_INTRINSIC_ROUNDEVEN:
4541   case G_INTRINSIC_TRUNC:
4542   case G_FCOS:
4543   case G_FSIN:
4544   case G_FSQRT:
4545   case G_BSWAP:
4546   case G_BITREVERSE:
4547   case G_SDIV:
4548   case G_UDIV:
4549   case G_SREM:
4550   case G_UREM:
4551   case G_SDIVREM:
4552   case G_UDIVREM:
4553   case G_SMIN:
4554   case G_SMAX:
4555   case G_UMIN:
4556   case G_UMAX:
4557   case G_ABS:
4558   case G_FMINNUM:
4559   case G_FMAXNUM:
4560   case G_FMINNUM_IEEE:
4561   case G_FMAXNUM_IEEE:
4562   case G_FMINIMUM:
4563   case G_FMAXIMUM:
4564   case G_FSHL:
4565   case G_FSHR:
4566   case G_ROTL:
4567   case G_ROTR:
4568   case G_FREEZE:
4569   case G_SADDSAT:
4570   case G_SSUBSAT:
4571   case G_UADDSAT:
4572   case G_USUBSAT:
4573   case G_UMULO:
4574   case G_SMULO:
4575   case G_SHL:
4576   case G_LSHR:
4577   case G_ASHR:
4578   case G_SSHLSAT:
4579   case G_USHLSAT:
4580   case G_CTLZ:
4581   case G_CTLZ_ZERO_UNDEF:
4582   case G_CTTZ:
4583   case G_CTTZ_ZERO_UNDEF:
4584   case G_CTPOP:
4585   case G_FCOPYSIGN:
4586   case G_ZEXT:
4587   case G_SEXT:
4588   case G_ANYEXT:
4589   case G_FPEXT:
4590   case G_FPTRUNC:
4591   case G_SITOFP:
4592   case G_UITOFP:
4593   case G_FPTOSI:
4594   case G_FPTOUI:
4595   case G_INTTOPTR:
4596   case G_PTRTOINT:
4597   case G_ADDRSPACE_CAST:
4598   case G_UADDO:
4599   case G_USUBO:
4600   case G_UADDE:
4601   case G_USUBE:
4602   case G_SADDO:
4603   case G_SSUBO:
4604   case G_SADDE:
4605   case G_SSUBE:
4606   case G_STRICT_FADD:
4607   case G_STRICT_FSUB:
4608   case G_STRICT_FMUL:
4609   case G_STRICT_FMA:
4610   case G_STRICT_FLDEXP:
4611   case G_FFREXP:
4612     return fewerElementsVectorMultiEltType(GMI, NumElts);
4613   case G_ICMP:
4614   case G_FCMP:
4615     return fewerElementsVectorMultiEltType(GMI, NumElts, {1 /*cpm predicate*/});
4616   case G_IS_FPCLASS:
4617     return fewerElementsVectorMultiEltType(GMI, NumElts, {2, 3 /*mask,fpsem*/});
4618   case G_SELECT:
4619     if (MRI.getType(MI.getOperand(1).getReg()).isVector())
4620       return fewerElementsVectorMultiEltType(GMI, NumElts);
4621     return fewerElementsVectorMultiEltType(GMI, NumElts, {1 /*scalar cond*/});
4622   case G_PHI:
4623     return fewerElementsVectorPhi(GMI, NumElts);
4624   case G_UNMERGE_VALUES:
4625     return fewerElementsVectorUnmergeValues(MI, TypeIdx, NarrowTy);
4626   case G_BUILD_VECTOR:
4627     assert(TypeIdx == 0 && "not a vector type index");
4628     return fewerElementsVectorMerge(MI, TypeIdx, NarrowTy);
4629   case G_CONCAT_VECTORS:
4630     if (TypeIdx != 1) // TODO: This probably does work as expected already.
4631       return UnableToLegalize;
4632     return fewerElementsVectorMerge(MI, TypeIdx, NarrowTy);
4633   case G_EXTRACT_VECTOR_ELT:
4634   case G_INSERT_VECTOR_ELT:
4635     return fewerElementsVectorExtractInsertVectorElt(MI, TypeIdx, NarrowTy);
4636   case G_LOAD:
4637   case G_STORE:
4638     return reduceLoadStoreWidth(cast<GLoadStore>(MI), TypeIdx, NarrowTy);
4639   case G_SEXT_INREG:
4640     return fewerElementsVectorMultiEltType(GMI, NumElts, {2 /*imm*/});
4641   GISEL_VECREDUCE_CASES_NONSEQ
4642     return fewerElementsVectorReductions(MI, TypeIdx, NarrowTy);
4643   case TargetOpcode::G_VECREDUCE_SEQ_FADD:
4644   case TargetOpcode::G_VECREDUCE_SEQ_FMUL:
4645     return fewerElementsVectorSeqReductions(MI, TypeIdx, NarrowTy);
4646   case G_SHUFFLE_VECTOR:
4647     return fewerElementsVectorShuffle(MI, TypeIdx, NarrowTy);
4648   case G_FPOWI:
4649     return fewerElementsVectorMultiEltType(GMI, NumElts, {2 /*pow*/});
4650   default:
4651     return UnableToLegalize;
4652   }
4653 }
4654 
4655 LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorShuffle(
4656     MachineInstr &MI, unsigned int TypeIdx, LLT NarrowTy) {
4657   assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
4658   if (TypeIdx != 0)
4659     return UnableToLegalize;
4660 
4661   auto [DstReg, DstTy, Src1Reg, Src1Ty, Src2Reg, Src2Ty] =
4662       MI.getFirst3RegLLTs();
4663   ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
4664   // The shuffle should be canonicalized by now.
4665   if (DstTy != Src1Ty)
4666     return UnableToLegalize;
4667   if (DstTy != Src2Ty)
4668     return UnableToLegalize;
4669 
4670   if (!isPowerOf2_32(DstTy.getNumElements()))
4671     return UnableToLegalize;
4672 
4673   // We only support splitting a shuffle into 2, so adjust NarrowTy accordingly.
4674   // Further legalization attempts will be needed to do split further.
4675   NarrowTy =
4676       DstTy.changeElementCount(DstTy.getElementCount().divideCoefficientBy(2));
4677   unsigned NewElts = NarrowTy.getNumElements();
4678 
4679   SmallVector<Register> SplitSrc1Regs, SplitSrc2Regs;
4680   extractParts(Src1Reg, NarrowTy, 2, SplitSrc1Regs, MIRBuilder, MRI);
4681   extractParts(Src2Reg, NarrowTy, 2, SplitSrc2Regs, MIRBuilder, MRI);
4682   Register Inputs[4] = {SplitSrc1Regs[0], SplitSrc1Regs[1], SplitSrc2Regs[0],
4683                         SplitSrc2Regs[1]};
4684 
4685   Register Hi, Lo;
4686 
4687   // If Lo or Hi uses elements from at most two of the four input vectors, then
4688   // express it as a vector shuffle of those two inputs.  Otherwise extract the
4689   // input elements by hand and construct the Lo/Hi output using a BUILD_VECTOR.
4690   SmallVector<int, 16> Ops;
4691   for (unsigned High = 0; High < 2; ++High) {
4692     Register &Output = High ? Hi : Lo;
4693 
4694     // Build a shuffle mask for the output, discovering on the fly which
4695     // input vectors to use as shuffle operands (recorded in InputUsed).
4696     // If building a suitable shuffle vector proves too hard, then bail
4697     // out with useBuildVector set.
4698     unsigned InputUsed[2] = {-1U, -1U}; // Not yet discovered.
4699     unsigned FirstMaskIdx = High * NewElts;
4700     bool UseBuildVector = false;
4701     for (unsigned MaskOffset = 0; MaskOffset < NewElts; ++MaskOffset) {
4702       // The mask element.  This indexes into the input.
4703       int Idx = Mask[FirstMaskIdx + MaskOffset];
4704 
4705       // The input vector this mask element indexes into.
4706       unsigned Input = (unsigned)Idx / NewElts;
4707 
4708       if (Input >= std::size(Inputs)) {
4709         // The mask element does not index into any input vector.
4710         Ops.push_back(-1);
4711         continue;
4712       }
4713 
4714       // Turn the index into an offset from the start of the input vector.
4715       Idx -= Input * NewElts;
4716 
4717       // Find or create a shuffle vector operand to hold this input.
4718       unsigned OpNo;
4719       for (OpNo = 0; OpNo < std::size(InputUsed); ++OpNo) {
4720         if (InputUsed[OpNo] == Input) {
4721           // This input vector is already an operand.
4722           break;
4723         } else if (InputUsed[OpNo] == -1U) {
4724           // Create a new operand for this input vector.
4725           InputUsed[OpNo] = Input;
4726           break;
4727         }
4728       }
4729 
4730       if (OpNo >= std::size(InputUsed)) {
4731         // More than two input vectors used!  Give up on trying to create a
4732         // shuffle vector.  Insert all elements into a BUILD_VECTOR instead.
4733         UseBuildVector = true;
4734         break;
4735       }
4736 
4737       // Add the mask index for the new shuffle vector.
4738       Ops.push_back(Idx + OpNo * NewElts);
4739     }
4740 
4741     if (UseBuildVector) {
4742       LLT EltTy = NarrowTy.getElementType();
4743       SmallVector<Register, 16> SVOps;
4744 
4745       // Extract the input elements by hand.
4746       for (unsigned MaskOffset = 0; MaskOffset < NewElts; ++MaskOffset) {
4747         // The mask element.  This indexes into the input.
4748         int Idx = Mask[FirstMaskIdx + MaskOffset];
4749 
4750         // The input vector this mask element indexes into.
4751         unsigned Input = (unsigned)Idx / NewElts;
4752 
4753         if (Input >= std::size(Inputs)) {
4754           // The mask element is "undef" or indexes off the end of the input.
4755           SVOps.push_back(MIRBuilder.buildUndef(EltTy).getReg(0));
4756           continue;
4757         }
4758 
4759         // Turn the index into an offset from the start of the input vector.
4760         Idx -= Input * NewElts;
4761 
4762         // Extract the vector element by hand.
4763         SVOps.push_back(MIRBuilder
4764                             .buildExtractVectorElement(
4765                                 EltTy, Inputs[Input],
4766                                 MIRBuilder.buildConstant(LLT::scalar(32), Idx))
4767                             .getReg(0));
4768       }
4769 
4770       // Construct the Lo/Hi output using a G_BUILD_VECTOR.
4771       Output = MIRBuilder.buildBuildVector(NarrowTy, SVOps).getReg(0);
4772     } else if (InputUsed[0] == -1U) {
4773       // No input vectors were used! The result is undefined.
4774       Output = MIRBuilder.buildUndef(NarrowTy).getReg(0);
4775     } else {
4776       Register Op0 = Inputs[InputUsed[0]];
4777       // If only one input was used, use an undefined vector for the other.
4778       Register Op1 = InputUsed[1] == -1U
4779                          ? MIRBuilder.buildUndef(NarrowTy).getReg(0)
4780                          : Inputs[InputUsed[1]];
4781       // At least one input vector was used. Create a new shuffle vector.
4782       Output = MIRBuilder.buildShuffleVector(NarrowTy, Op0, Op1, Ops).getReg(0);
4783     }
4784 
4785     Ops.clear();
4786   }
4787 
4788   MIRBuilder.buildConcatVectors(DstReg, {Lo, Hi});
4789   MI.eraseFromParent();
4790   return Legalized;
4791 }
4792 
4793 LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorReductions(
4794     MachineInstr &MI, unsigned int TypeIdx, LLT NarrowTy) {
4795   auto &RdxMI = cast<GVecReduce>(MI);
4796 
4797   if (TypeIdx != 1)
4798     return UnableToLegalize;
4799 
4800   // The semantics of the normal non-sequential reductions allow us to freely
4801   // re-associate the operation.
4802   auto [DstReg, DstTy, SrcReg, SrcTy] = RdxMI.getFirst2RegLLTs();
4803 
4804   if (NarrowTy.isVector() &&
4805       (SrcTy.getNumElements() % NarrowTy.getNumElements() != 0))
4806     return UnableToLegalize;
4807 
4808   unsigned ScalarOpc = RdxMI.getScalarOpcForReduction();
4809   SmallVector<Register> SplitSrcs;
4810   // If NarrowTy is a scalar then we're being asked to scalarize.
4811   const unsigned NumParts =
4812       NarrowTy.isVector() ? SrcTy.getNumElements() / NarrowTy.getNumElements()
4813                           : SrcTy.getNumElements();
4814 
4815   extractParts(SrcReg, NarrowTy, NumParts, SplitSrcs, MIRBuilder, MRI);
4816   if (NarrowTy.isScalar()) {
4817     if (DstTy != NarrowTy)
4818       return UnableToLegalize; // FIXME: handle implicit extensions.
4819 
4820     if (isPowerOf2_32(NumParts)) {
4821       // Generate a tree of scalar operations to reduce the critical path.
4822       SmallVector<Register> PartialResults;
4823       unsigned NumPartsLeft = NumParts;
4824       while (NumPartsLeft > 1) {
4825         for (unsigned Idx = 0; Idx < NumPartsLeft - 1; Idx += 2) {
4826           PartialResults.emplace_back(
4827               MIRBuilder
4828                   .buildInstr(ScalarOpc, {NarrowTy},
4829                               {SplitSrcs[Idx], SplitSrcs[Idx + 1]})
4830                   .getReg(0));
4831         }
4832         SplitSrcs = PartialResults;
4833         PartialResults.clear();
4834         NumPartsLeft = SplitSrcs.size();
4835       }
4836       assert(SplitSrcs.size() == 1);
4837       MIRBuilder.buildCopy(DstReg, SplitSrcs[0]);
4838       MI.eraseFromParent();
4839       return Legalized;
4840     }
4841     // If we can't generate a tree, then just do sequential operations.
4842     Register Acc = SplitSrcs[0];
4843     for (unsigned Idx = 1; Idx < NumParts; ++Idx)
4844       Acc = MIRBuilder.buildInstr(ScalarOpc, {NarrowTy}, {Acc, SplitSrcs[Idx]})
4845                 .getReg(0);
4846     MIRBuilder.buildCopy(DstReg, Acc);
4847     MI.eraseFromParent();
4848     return Legalized;
4849   }
4850   SmallVector<Register> PartialReductions;
4851   for (unsigned Part = 0; Part < NumParts; ++Part) {
4852     PartialReductions.push_back(
4853         MIRBuilder.buildInstr(RdxMI.getOpcode(), {DstTy}, {SplitSrcs[Part]})
4854             .getReg(0));
4855   }
4856 
4857   // If the types involved are powers of 2, we can generate intermediate vector
4858   // ops, before generating a final reduction operation.
4859   if (isPowerOf2_32(SrcTy.getNumElements()) &&
4860       isPowerOf2_32(NarrowTy.getNumElements())) {
4861     return tryNarrowPow2Reduction(MI, SrcReg, SrcTy, NarrowTy, ScalarOpc);
4862   }
4863 
4864   Register Acc = PartialReductions[0];
4865   for (unsigned Part = 1; Part < NumParts; ++Part) {
4866     if (Part == NumParts - 1) {
4867       MIRBuilder.buildInstr(ScalarOpc, {DstReg},
4868                             {Acc, PartialReductions[Part]});
4869     } else {
4870       Acc = MIRBuilder
4871                 .buildInstr(ScalarOpc, {DstTy}, {Acc, PartialReductions[Part]})
4872                 .getReg(0);
4873     }
4874   }
4875   MI.eraseFromParent();
4876   return Legalized;
4877 }
4878 
4879 LegalizerHelper::LegalizeResult
4880 LegalizerHelper::fewerElementsVectorSeqReductions(MachineInstr &MI,
4881                                                   unsigned int TypeIdx,
4882                                                   LLT NarrowTy) {
4883   auto [DstReg, DstTy, ScalarReg, ScalarTy, SrcReg, SrcTy] =
4884       MI.getFirst3RegLLTs();
4885   if (!NarrowTy.isScalar() || TypeIdx != 2 || DstTy != ScalarTy ||
4886       DstTy != NarrowTy)
4887     return UnableToLegalize;
4888 
4889   assert((MI.getOpcode() == TargetOpcode::G_VECREDUCE_SEQ_FADD ||
4890           MI.getOpcode() == TargetOpcode::G_VECREDUCE_SEQ_FMUL) &&
4891          "Unexpected vecreduce opcode");
4892   unsigned ScalarOpc = MI.getOpcode() == TargetOpcode::G_VECREDUCE_SEQ_FADD
4893                            ? TargetOpcode::G_FADD
4894                            : TargetOpcode::G_FMUL;
4895 
4896   SmallVector<Register> SplitSrcs;
4897   unsigned NumParts = SrcTy.getNumElements();
4898   extractParts(SrcReg, NarrowTy, NumParts, SplitSrcs, MIRBuilder, MRI);
4899   Register Acc = ScalarReg;
4900   for (unsigned i = 0; i < NumParts; i++)
4901     Acc = MIRBuilder.buildInstr(ScalarOpc, {NarrowTy}, {Acc, SplitSrcs[i]})
4902               .getReg(0);
4903 
4904   MIRBuilder.buildCopy(DstReg, Acc);
4905   MI.eraseFromParent();
4906   return Legalized;
4907 }
4908 
4909 LegalizerHelper::LegalizeResult
4910 LegalizerHelper::tryNarrowPow2Reduction(MachineInstr &MI, Register SrcReg,
4911                                         LLT SrcTy, LLT NarrowTy,
4912                                         unsigned ScalarOpc) {
4913   SmallVector<Register> SplitSrcs;
4914   // Split the sources into NarrowTy size pieces.
4915   extractParts(SrcReg, NarrowTy,
4916                SrcTy.getNumElements() / NarrowTy.getNumElements(), SplitSrcs,
4917                MIRBuilder, MRI);
4918   // We're going to do a tree reduction using vector operations until we have
4919   // one NarrowTy size value left.
4920   while (SplitSrcs.size() > 1) {
4921     SmallVector<Register> PartialRdxs;
4922     for (unsigned Idx = 0; Idx < SplitSrcs.size()-1; Idx += 2) {
4923       Register LHS = SplitSrcs[Idx];
4924       Register RHS = SplitSrcs[Idx + 1];
4925       // Create the intermediate vector op.
4926       Register Res =
4927           MIRBuilder.buildInstr(ScalarOpc, {NarrowTy}, {LHS, RHS}).getReg(0);
4928       PartialRdxs.push_back(Res);
4929     }
4930     SplitSrcs = std::move(PartialRdxs);
4931   }
4932   // Finally generate the requested NarrowTy based reduction.
4933   Observer.changingInstr(MI);
4934   MI.getOperand(1).setReg(SplitSrcs[0]);
4935   Observer.changedInstr(MI);
4936   return Legalized;
4937 }
4938 
4939 LegalizerHelper::LegalizeResult
4940 LegalizerHelper::narrowScalarShiftByConstant(MachineInstr &MI, const APInt &Amt,
4941                                              const LLT HalfTy, const LLT AmtTy) {
4942 
4943   Register InL = MRI.createGenericVirtualRegister(HalfTy);
4944   Register InH = MRI.createGenericVirtualRegister(HalfTy);
4945   MIRBuilder.buildUnmerge({InL, InH}, MI.getOperand(1));
4946 
4947   if (Amt.isZero()) {
4948     MIRBuilder.buildMergeLikeInstr(MI.getOperand(0), {InL, InH});
4949     MI.eraseFromParent();
4950     return Legalized;
4951   }
4952 
4953   LLT NVT = HalfTy;
4954   unsigned NVTBits = HalfTy.getSizeInBits();
4955   unsigned VTBits = 2 * NVTBits;
4956 
4957   SrcOp Lo(Register(0)), Hi(Register(0));
4958   if (MI.getOpcode() == TargetOpcode::G_SHL) {
4959     if (Amt.ugt(VTBits)) {
4960       Lo = Hi = MIRBuilder.buildConstant(NVT, 0);
4961     } else if (Amt.ugt(NVTBits)) {
4962       Lo = MIRBuilder.buildConstant(NVT, 0);
4963       Hi = MIRBuilder.buildShl(NVT, InL,
4964                                MIRBuilder.buildConstant(AmtTy, Amt - NVTBits));
4965     } else if (Amt == NVTBits) {
4966       Lo = MIRBuilder.buildConstant(NVT, 0);
4967       Hi = InL;
4968     } else {
4969       Lo = MIRBuilder.buildShl(NVT, InL, MIRBuilder.buildConstant(AmtTy, Amt));
4970       auto OrLHS =
4971           MIRBuilder.buildShl(NVT, InH, MIRBuilder.buildConstant(AmtTy, Amt));
4972       auto OrRHS = MIRBuilder.buildLShr(
4973           NVT, InL, MIRBuilder.buildConstant(AmtTy, -Amt + NVTBits));
4974       Hi = MIRBuilder.buildOr(NVT, OrLHS, OrRHS);
4975     }
4976   } else if (MI.getOpcode() == TargetOpcode::G_LSHR) {
4977     if (Amt.ugt(VTBits)) {
4978       Lo = Hi = MIRBuilder.buildConstant(NVT, 0);
4979     } else if (Amt.ugt(NVTBits)) {
4980       Lo = MIRBuilder.buildLShr(NVT, InH,
4981                                 MIRBuilder.buildConstant(AmtTy, Amt - NVTBits));
4982       Hi = MIRBuilder.buildConstant(NVT, 0);
4983     } else if (Amt == NVTBits) {
4984       Lo = InH;
4985       Hi = MIRBuilder.buildConstant(NVT, 0);
4986     } else {
4987       auto ShiftAmtConst = MIRBuilder.buildConstant(AmtTy, Amt);
4988 
4989       auto OrLHS = MIRBuilder.buildLShr(NVT, InL, ShiftAmtConst);
4990       auto OrRHS = MIRBuilder.buildShl(
4991           NVT, InH, MIRBuilder.buildConstant(AmtTy, -Amt + NVTBits));
4992 
4993       Lo = MIRBuilder.buildOr(NVT, OrLHS, OrRHS);
4994       Hi = MIRBuilder.buildLShr(NVT, InH, ShiftAmtConst);
4995     }
4996   } else {
4997     if (Amt.ugt(VTBits)) {
4998       Hi = Lo = MIRBuilder.buildAShr(
4999           NVT, InH, MIRBuilder.buildConstant(AmtTy, NVTBits - 1));
5000     } else if (Amt.ugt(NVTBits)) {
5001       Lo = MIRBuilder.buildAShr(NVT, InH,
5002                                 MIRBuilder.buildConstant(AmtTy, Amt - NVTBits));
5003       Hi = MIRBuilder.buildAShr(NVT, InH,
5004                                 MIRBuilder.buildConstant(AmtTy, NVTBits - 1));
5005     } else if (Amt == NVTBits) {
5006       Lo = InH;
5007       Hi = MIRBuilder.buildAShr(NVT, InH,
5008                                 MIRBuilder.buildConstant(AmtTy, NVTBits - 1));
5009     } else {
5010       auto ShiftAmtConst = MIRBuilder.buildConstant(AmtTy, Amt);
5011 
5012       auto OrLHS = MIRBuilder.buildLShr(NVT, InL, ShiftAmtConst);
5013       auto OrRHS = MIRBuilder.buildShl(
5014           NVT, InH, MIRBuilder.buildConstant(AmtTy, -Amt + NVTBits));
5015 
5016       Lo = MIRBuilder.buildOr(NVT, OrLHS, OrRHS);
5017       Hi = MIRBuilder.buildAShr(NVT, InH, ShiftAmtConst);
5018     }
5019   }
5020 
5021   MIRBuilder.buildMergeLikeInstr(MI.getOperand(0), {Lo, Hi});
5022   MI.eraseFromParent();
5023 
5024   return Legalized;
5025 }
5026 
5027 // TODO: Optimize if constant shift amount.
5028 LegalizerHelper::LegalizeResult
5029 LegalizerHelper::narrowScalarShift(MachineInstr &MI, unsigned TypeIdx,
5030                                    LLT RequestedTy) {
5031   if (TypeIdx == 1) {
5032     Observer.changingInstr(MI);
5033     narrowScalarSrc(MI, RequestedTy, 2);
5034     Observer.changedInstr(MI);
5035     return Legalized;
5036   }
5037 
5038   Register DstReg = MI.getOperand(0).getReg();
5039   LLT DstTy = MRI.getType(DstReg);
5040   if (DstTy.isVector())
5041     return UnableToLegalize;
5042 
5043   Register Amt = MI.getOperand(2).getReg();
5044   LLT ShiftAmtTy = MRI.getType(Amt);
5045   const unsigned DstEltSize = DstTy.getScalarSizeInBits();
5046   if (DstEltSize % 2 != 0)
5047     return UnableToLegalize;
5048 
5049   // Ignore the input type. We can only go to exactly half the size of the
5050   // input. If that isn't small enough, the resulting pieces will be further
5051   // legalized.
5052   const unsigned NewBitSize = DstEltSize / 2;
5053   const LLT HalfTy = LLT::scalar(NewBitSize);
5054   const LLT CondTy = LLT::scalar(1);
5055 
5056   if (auto VRegAndVal = getIConstantVRegValWithLookThrough(Amt, MRI)) {
5057     return narrowScalarShiftByConstant(MI, VRegAndVal->Value, HalfTy,
5058                                        ShiftAmtTy);
5059   }
5060 
5061   // TODO: Expand with known bits.
5062 
5063   // Handle the fully general expansion by an unknown amount.
5064   auto NewBits = MIRBuilder.buildConstant(ShiftAmtTy, NewBitSize);
5065 
5066   Register InL = MRI.createGenericVirtualRegister(HalfTy);
5067   Register InH = MRI.createGenericVirtualRegister(HalfTy);
5068   MIRBuilder.buildUnmerge({InL, InH}, MI.getOperand(1));
5069 
5070   auto AmtExcess = MIRBuilder.buildSub(ShiftAmtTy, Amt, NewBits);
5071   auto AmtLack = MIRBuilder.buildSub(ShiftAmtTy, NewBits, Amt);
5072 
5073   auto Zero = MIRBuilder.buildConstant(ShiftAmtTy, 0);
5074   auto IsShort = MIRBuilder.buildICmp(ICmpInst::ICMP_ULT, CondTy, Amt, NewBits);
5075   auto IsZero = MIRBuilder.buildICmp(ICmpInst::ICMP_EQ, CondTy, Amt, Zero);
5076 
5077   Register ResultRegs[2];
5078   switch (MI.getOpcode()) {
5079   case TargetOpcode::G_SHL: {
5080     // Short: ShAmt < NewBitSize
5081     auto LoS = MIRBuilder.buildShl(HalfTy, InL, Amt);
5082 
5083     auto LoOr = MIRBuilder.buildLShr(HalfTy, InL, AmtLack);
5084     auto HiOr = MIRBuilder.buildShl(HalfTy, InH, Amt);
5085     auto HiS = MIRBuilder.buildOr(HalfTy, LoOr, HiOr);
5086 
5087     // Long: ShAmt >= NewBitSize
5088     auto LoL = MIRBuilder.buildConstant(HalfTy, 0);         // Lo part is zero.
5089     auto HiL = MIRBuilder.buildShl(HalfTy, InL, AmtExcess); // Hi from Lo part.
5090 
5091     auto Lo = MIRBuilder.buildSelect(HalfTy, IsShort, LoS, LoL);
5092     auto Hi = MIRBuilder.buildSelect(
5093         HalfTy, IsZero, InH, MIRBuilder.buildSelect(HalfTy, IsShort, HiS, HiL));
5094 
5095     ResultRegs[0] = Lo.getReg(0);
5096     ResultRegs[1] = Hi.getReg(0);
5097     break;
5098   }
5099   case TargetOpcode::G_LSHR:
5100   case TargetOpcode::G_ASHR: {
5101     // Short: ShAmt < NewBitSize
5102     auto HiS = MIRBuilder.buildInstr(MI.getOpcode(), {HalfTy}, {InH, Amt});
5103 
5104     auto LoOr = MIRBuilder.buildLShr(HalfTy, InL, Amt);
5105     auto HiOr = MIRBuilder.buildShl(HalfTy, InH, AmtLack);
5106     auto LoS = MIRBuilder.buildOr(HalfTy, LoOr, HiOr);
5107 
5108     // Long: ShAmt >= NewBitSize
5109     MachineInstrBuilder HiL;
5110     if (MI.getOpcode() == TargetOpcode::G_LSHR) {
5111       HiL = MIRBuilder.buildConstant(HalfTy, 0);            // Hi part is zero.
5112     } else {
5113       auto ShiftAmt = MIRBuilder.buildConstant(ShiftAmtTy, NewBitSize - 1);
5114       HiL = MIRBuilder.buildAShr(HalfTy, InH, ShiftAmt);    // Sign of Hi part.
5115     }
5116     auto LoL = MIRBuilder.buildInstr(MI.getOpcode(), {HalfTy},
5117                                      {InH, AmtExcess});     // Lo from Hi part.
5118 
5119     auto Lo = MIRBuilder.buildSelect(
5120         HalfTy, IsZero, InL, MIRBuilder.buildSelect(HalfTy, IsShort, LoS, LoL));
5121 
5122     auto Hi = MIRBuilder.buildSelect(HalfTy, IsShort, HiS, HiL);
5123 
5124     ResultRegs[0] = Lo.getReg(0);
5125     ResultRegs[1] = Hi.getReg(0);
5126     break;
5127   }
5128   default:
5129     llvm_unreachable("not a shift");
5130   }
5131 
5132   MIRBuilder.buildMergeLikeInstr(DstReg, ResultRegs);
5133   MI.eraseFromParent();
5134   return Legalized;
5135 }
5136 
5137 LegalizerHelper::LegalizeResult
5138 LegalizerHelper::moreElementsVectorPhi(MachineInstr &MI, unsigned TypeIdx,
5139                                        LLT MoreTy) {
5140   assert(TypeIdx == 0 && "Expecting only Idx 0");
5141 
5142   Observer.changingInstr(MI);
5143   for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
5144     MachineBasicBlock &OpMBB = *MI.getOperand(I + 1).getMBB();
5145     MIRBuilder.setInsertPt(OpMBB, OpMBB.getFirstTerminator());
5146     moreElementsVectorSrc(MI, MoreTy, I);
5147   }
5148 
5149   MachineBasicBlock &MBB = *MI.getParent();
5150   MIRBuilder.setInsertPt(MBB, --MBB.getFirstNonPHI());
5151   moreElementsVectorDst(MI, MoreTy, 0);
5152   Observer.changedInstr(MI);
5153   return Legalized;
5154 }
5155 
5156 LegalizerHelper::LegalizeResult
5157 LegalizerHelper::moreElementsVector(MachineInstr &MI, unsigned TypeIdx,
5158                                     LLT MoreTy) {
5159   unsigned Opc = MI.getOpcode();
5160   switch (Opc) {
5161   case TargetOpcode::G_IMPLICIT_DEF:
5162   case TargetOpcode::G_LOAD: {
5163     if (TypeIdx != 0)
5164       return UnableToLegalize;
5165     Observer.changingInstr(MI);
5166     moreElementsVectorDst(MI, MoreTy, 0);
5167     Observer.changedInstr(MI);
5168     return Legalized;
5169   }
5170   case TargetOpcode::G_STORE:
5171     if (TypeIdx != 0)
5172       return UnableToLegalize;
5173     Observer.changingInstr(MI);
5174     moreElementsVectorSrc(MI, MoreTy, 0);
5175     Observer.changedInstr(MI);
5176     return Legalized;
5177   case TargetOpcode::G_AND:
5178   case TargetOpcode::G_OR:
5179   case TargetOpcode::G_XOR:
5180   case TargetOpcode::G_ADD:
5181   case TargetOpcode::G_SUB:
5182   case TargetOpcode::G_MUL:
5183   case TargetOpcode::G_FADD:
5184   case TargetOpcode::G_FSUB:
5185   case TargetOpcode::G_FMUL:
5186   case TargetOpcode::G_FDIV:
5187   case TargetOpcode::G_UADDSAT:
5188   case TargetOpcode::G_USUBSAT:
5189   case TargetOpcode::G_SADDSAT:
5190   case TargetOpcode::G_SSUBSAT:
5191   case TargetOpcode::G_SMIN:
5192   case TargetOpcode::G_SMAX:
5193   case TargetOpcode::G_UMIN:
5194   case TargetOpcode::G_UMAX:
5195   case TargetOpcode::G_FMINNUM:
5196   case TargetOpcode::G_FMAXNUM:
5197   case TargetOpcode::G_FMINNUM_IEEE:
5198   case TargetOpcode::G_FMAXNUM_IEEE:
5199   case TargetOpcode::G_FMINIMUM:
5200   case TargetOpcode::G_FMAXIMUM:
5201   case TargetOpcode::G_STRICT_FADD:
5202   case TargetOpcode::G_STRICT_FSUB:
5203   case TargetOpcode::G_STRICT_FMUL:
5204   case TargetOpcode::G_SHL:
5205   case TargetOpcode::G_ASHR:
5206   case TargetOpcode::G_LSHR: {
5207     Observer.changingInstr(MI);
5208     moreElementsVectorSrc(MI, MoreTy, 1);
5209     moreElementsVectorSrc(MI, MoreTy, 2);
5210     moreElementsVectorDst(MI, MoreTy, 0);
5211     Observer.changedInstr(MI);
5212     return Legalized;
5213   }
5214   case TargetOpcode::G_FMA:
5215   case TargetOpcode::G_STRICT_FMA:
5216   case TargetOpcode::G_FSHR:
5217   case TargetOpcode::G_FSHL: {
5218     Observer.changingInstr(MI);
5219     moreElementsVectorSrc(MI, MoreTy, 1);
5220     moreElementsVectorSrc(MI, MoreTy, 2);
5221     moreElementsVectorSrc(MI, MoreTy, 3);
5222     moreElementsVectorDst(MI, MoreTy, 0);
5223     Observer.changedInstr(MI);
5224     return Legalized;
5225   }
5226   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
5227   case TargetOpcode::G_EXTRACT:
5228     if (TypeIdx != 1)
5229       return UnableToLegalize;
5230     Observer.changingInstr(MI);
5231     moreElementsVectorSrc(MI, MoreTy, 1);
5232     Observer.changedInstr(MI);
5233     return Legalized;
5234   case TargetOpcode::G_INSERT:
5235   case TargetOpcode::G_INSERT_VECTOR_ELT:
5236   case TargetOpcode::G_FREEZE:
5237   case TargetOpcode::G_FNEG:
5238   case TargetOpcode::G_FABS:
5239   case TargetOpcode::G_FSQRT:
5240   case TargetOpcode::G_FCEIL:
5241   case TargetOpcode::G_FFLOOR:
5242   case TargetOpcode::G_FNEARBYINT:
5243   case TargetOpcode::G_FRINT:
5244   case TargetOpcode::G_INTRINSIC_ROUND:
5245   case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
5246   case TargetOpcode::G_INTRINSIC_TRUNC:
5247   case TargetOpcode::G_BSWAP:
5248   case TargetOpcode::G_FCANONICALIZE:
5249   case TargetOpcode::G_SEXT_INREG:
5250     if (TypeIdx != 0)
5251       return UnableToLegalize;
5252     Observer.changingInstr(MI);
5253     moreElementsVectorSrc(MI, MoreTy, 1);
5254     moreElementsVectorDst(MI, MoreTy, 0);
5255     Observer.changedInstr(MI);
5256     return Legalized;
5257   case TargetOpcode::G_SELECT: {
5258     auto [DstReg, DstTy, CondReg, CondTy] = MI.getFirst2RegLLTs();
5259     if (TypeIdx == 1) {
5260       if (!CondTy.isScalar() ||
5261           DstTy.getElementCount() != MoreTy.getElementCount())
5262         return UnableToLegalize;
5263 
5264       // This is turning a scalar select of vectors into a vector
5265       // select. Broadcast the select condition.
5266       auto ShufSplat = MIRBuilder.buildShuffleSplat(MoreTy, CondReg);
5267       Observer.changingInstr(MI);
5268       MI.getOperand(1).setReg(ShufSplat.getReg(0));
5269       Observer.changedInstr(MI);
5270       return Legalized;
5271     }
5272 
5273     if (CondTy.isVector())
5274       return UnableToLegalize;
5275 
5276     Observer.changingInstr(MI);
5277     moreElementsVectorSrc(MI, MoreTy, 2);
5278     moreElementsVectorSrc(MI, MoreTy, 3);
5279     moreElementsVectorDst(MI, MoreTy, 0);
5280     Observer.changedInstr(MI);
5281     return Legalized;
5282   }
5283   case TargetOpcode::G_UNMERGE_VALUES:
5284     return UnableToLegalize;
5285   case TargetOpcode::G_PHI:
5286     return moreElementsVectorPhi(MI, TypeIdx, MoreTy);
5287   case TargetOpcode::G_SHUFFLE_VECTOR:
5288     return moreElementsVectorShuffle(MI, TypeIdx, MoreTy);
5289   case TargetOpcode::G_BUILD_VECTOR: {
5290     SmallVector<SrcOp, 8> Elts;
5291     for (auto Op : MI.uses()) {
5292       Elts.push_back(Op.getReg());
5293     }
5294 
5295     for (unsigned i = Elts.size(); i < MoreTy.getNumElements(); ++i) {
5296       Elts.push_back(MIRBuilder.buildUndef(MoreTy.getScalarType()));
5297     }
5298 
5299     MIRBuilder.buildDeleteTrailingVectorElements(
5300         MI.getOperand(0).getReg(), MIRBuilder.buildInstr(Opc, {MoreTy}, Elts));
5301     MI.eraseFromParent();
5302     return Legalized;
5303   }
5304   case TargetOpcode::G_TRUNC:
5305   case TargetOpcode::G_FPTRUNC:
5306   case TargetOpcode::G_FPEXT:
5307   case TargetOpcode::G_FPTOSI:
5308   case TargetOpcode::G_FPTOUI:
5309   case TargetOpcode::G_SITOFP:
5310   case TargetOpcode::G_UITOFP: {
5311     if (TypeIdx != 0)
5312       return UnableToLegalize;
5313     Observer.changingInstr(MI);
5314     LLT SrcTy = LLT::fixed_vector(
5315         MoreTy.getNumElements(),
5316         MRI.getType(MI.getOperand(1).getReg()).getElementType());
5317     moreElementsVectorSrc(MI, SrcTy, 1);
5318     moreElementsVectorDst(MI, MoreTy, 0);
5319     Observer.changedInstr(MI);
5320     return Legalized;
5321   }
5322   case TargetOpcode::G_ICMP: {
5323     // TODO: the symmetric MoreTy works for targets like, e.g. NEON.
5324     // For targets, like e.g. MVE, the result is a predicated vector (i1).
5325     // This will need some refactoring.
5326     Observer.changingInstr(MI);
5327     moreElementsVectorSrc(MI, MoreTy, 2);
5328     moreElementsVectorSrc(MI, MoreTy, 3);
5329     moreElementsVectorDst(MI, MoreTy, 0);
5330     Observer.changedInstr(MI);
5331     return Legalized;
5332   }
5333   default:
5334     return UnableToLegalize;
5335   }
5336 }
5337 
5338 LegalizerHelper::LegalizeResult
5339 LegalizerHelper::equalizeVectorShuffleLengths(MachineInstr &MI) {
5340   auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
5341   ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
5342   unsigned MaskNumElts = Mask.size();
5343   unsigned SrcNumElts = SrcTy.getNumElements();
5344   LLT DestEltTy = DstTy.getElementType();
5345 
5346   if (MaskNumElts == SrcNumElts)
5347     return Legalized;
5348 
5349   if (MaskNumElts < SrcNumElts) {
5350     // Extend mask to match new destination vector size with
5351     // undef values.
5352     SmallVector<int, 16> NewMask(Mask);
5353     for (unsigned I = MaskNumElts; I < SrcNumElts; ++I)
5354       NewMask.push_back(-1);
5355 
5356     moreElementsVectorDst(MI, SrcTy, 0);
5357     MIRBuilder.setInstrAndDebugLoc(MI);
5358     MIRBuilder.buildShuffleVector(MI.getOperand(0).getReg(),
5359                                   MI.getOperand(1).getReg(),
5360                                   MI.getOperand(2).getReg(), NewMask);
5361     MI.eraseFromParent();
5362 
5363     return Legalized;
5364   }
5365 
5366   unsigned PaddedMaskNumElts = alignTo(MaskNumElts, SrcNumElts);
5367   unsigned NumConcat = PaddedMaskNumElts / SrcNumElts;
5368   LLT PaddedTy = LLT::fixed_vector(PaddedMaskNumElts, DestEltTy);
5369 
5370   // Create new source vectors by concatenating the initial
5371   // source vectors with undefined vectors of the same size.
5372   auto Undef = MIRBuilder.buildUndef(SrcTy);
5373   SmallVector<Register, 8> MOps1(NumConcat, Undef.getReg(0));
5374   SmallVector<Register, 8> MOps2(NumConcat, Undef.getReg(0));
5375   MOps1[0] = MI.getOperand(1).getReg();
5376   MOps2[0] = MI.getOperand(2).getReg();
5377 
5378   auto Src1 = MIRBuilder.buildConcatVectors(PaddedTy, MOps1);
5379   auto Src2 = MIRBuilder.buildConcatVectors(PaddedTy, MOps2);
5380 
5381   // Readjust mask for new input vector length.
5382   SmallVector<int, 8> MappedOps(PaddedMaskNumElts, -1);
5383   for (unsigned I = 0; I != MaskNumElts; ++I) {
5384     int Idx = Mask[I];
5385     if (Idx >= static_cast<int>(SrcNumElts))
5386       Idx += PaddedMaskNumElts - SrcNumElts;
5387     MappedOps[I] = Idx;
5388   }
5389 
5390   // If we got more elements than required, extract subvector.
5391   if (MaskNumElts != PaddedMaskNumElts) {
5392     auto Shuffle =
5393         MIRBuilder.buildShuffleVector(PaddedTy, Src1, Src2, MappedOps);
5394 
5395     SmallVector<Register, 16> Elts(MaskNumElts);
5396     for (unsigned I = 0; I < MaskNumElts; ++I) {
5397       Elts[I] =
5398           MIRBuilder.buildExtractVectorElementConstant(DestEltTy, Shuffle, I)
5399               .getReg(0);
5400     }
5401     MIRBuilder.buildBuildVector(DstReg, Elts);
5402   } else {
5403     MIRBuilder.buildShuffleVector(DstReg, Src1, Src2, MappedOps);
5404   }
5405 
5406   MI.eraseFromParent();
5407   return LegalizerHelper::LegalizeResult::Legalized;
5408 }
5409 
5410 LegalizerHelper::LegalizeResult
5411 LegalizerHelper::moreElementsVectorShuffle(MachineInstr &MI,
5412                                            unsigned int TypeIdx, LLT MoreTy) {
5413   auto [DstTy, Src1Ty, Src2Ty] = MI.getFirst3LLTs();
5414   ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
5415   unsigned NumElts = DstTy.getNumElements();
5416   unsigned WidenNumElts = MoreTy.getNumElements();
5417 
5418   if (DstTy.isVector() && Src1Ty.isVector() &&
5419       DstTy.getNumElements() != Src1Ty.getNumElements()) {
5420     return equalizeVectorShuffleLengths(MI);
5421   }
5422 
5423   if (TypeIdx != 0)
5424     return UnableToLegalize;
5425 
5426   // Expect a canonicalized shuffle.
5427   if (DstTy != Src1Ty || DstTy != Src2Ty)
5428     return UnableToLegalize;
5429 
5430   moreElementsVectorSrc(MI, MoreTy, 1);
5431   moreElementsVectorSrc(MI, MoreTy, 2);
5432 
5433   // Adjust mask based on new input vector length.
5434   SmallVector<int, 16> NewMask;
5435   for (unsigned I = 0; I != NumElts; ++I) {
5436     int Idx = Mask[I];
5437     if (Idx < static_cast<int>(NumElts))
5438       NewMask.push_back(Idx);
5439     else
5440       NewMask.push_back(Idx - NumElts + WidenNumElts);
5441   }
5442   for (unsigned I = NumElts; I != WidenNumElts; ++I)
5443     NewMask.push_back(-1);
5444   moreElementsVectorDst(MI, MoreTy, 0);
5445   MIRBuilder.setInstrAndDebugLoc(MI);
5446   MIRBuilder.buildShuffleVector(MI.getOperand(0).getReg(),
5447                                 MI.getOperand(1).getReg(),
5448                                 MI.getOperand(2).getReg(), NewMask);
5449   MI.eraseFromParent();
5450   return Legalized;
5451 }
5452 
5453 void LegalizerHelper::multiplyRegisters(SmallVectorImpl<Register> &DstRegs,
5454                                         ArrayRef<Register> Src1Regs,
5455                                         ArrayRef<Register> Src2Regs,
5456                                         LLT NarrowTy) {
5457   MachineIRBuilder &B = MIRBuilder;
5458   unsigned SrcParts = Src1Regs.size();
5459   unsigned DstParts = DstRegs.size();
5460 
5461   unsigned DstIdx = 0; // Low bits of the result.
5462   Register FactorSum =
5463       B.buildMul(NarrowTy, Src1Regs[DstIdx], Src2Regs[DstIdx]).getReg(0);
5464   DstRegs[DstIdx] = FactorSum;
5465 
5466   unsigned CarrySumPrevDstIdx;
5467   SmallVector<Register, 4> Factors;
5468 
5469   for (DstIdx = 1; DstIdx < DstParts; DstIdx++) {
5470     // Collect low parts of muls for DstIdx.
5471     for (unsigned i = DstIdx + 1 < SrcParts ? 0 : DstIdx - SrcParts + 1;
5472          i <= std::min(DstIdx, SrcParts - 1); ++i) {
5473       MachineInstrBuilder Mul =
5474           B.buildMul(NarrowTy, Src1Regs[DstIdx - i], Src2Regs[i]);
5475       Factors.push_back(Mul.getReg(0));
5476     }
5477     // Collect high parts of muls from previous DstIdx.
5478     for (unsigned i = DstIdx < SrcParts ? 0 : DstIdx - SrcParts;
5479          i <= std::min(DstIdx - 1, SrcParts - 1); ++i) {
5480       MachineInstrBuilder Umulh =
5481           B.buildUMulH(NarrowTy, Src1Regs[DstIdx - 1 - i], Src2Regs[i]);
5482       Factors.push_back(Umulh.getReg(0));
5483     }
5484     // Add CarrySum from additions calculated for previous DstIdx.
5485     if (DstIdx != 1) {
5486       Factors.push_back(CarrySumPrevDstIdx);
5487     }
5488 
5489     Register CarrySum;
5490     // Add all factors and accumulate all carries into CarrySum.
5491     if (DstIdx != DstParts - 1) {
5492       MachineInstrBuilder Uaddo =
5493           B.buildUAddo(NarrowTy, LLT::scalar(1), Factors[0], Factors[1]);
5494       FactorSum = Uaddo.getReg(0);
5495       CarrySum = B.buildZExt(NarrowTy, Uaddo.getReg(1)).getReg(0);
5496       for (unsigned i = 2; i < Factors.size(); ++i) {
5497         MachineInstrBuilder Uaddo =
5498             B.buildUAddo(NarrowTy, LLT::scalar(1), FactorSum, Factors[i]);
5499         FactorSum = Uaddo.getReg(0);
5500         MachineInstrBuilder Carry = B.buildZExt(NarrowTy, Uaddo.getReg(1));
5501         CarrySum = B.buildAdd(NarrowTy, CarrySum, Carry).getReg(0);
5502       }
5503     } else {
5504       // Since value for the next index is not calculated, neither is CarrySum.
5505       FactorSum = B.buildAdd(NarrowTy, Factors[0], Factors[1]).getReg(0);
5506       for (unsigned i = 2; i < Factors.size(); ++i)
5507         FactorSum = B.buildAdd(NarrowTy, FactorSum, Factors[i]).getReg(0);
5508     }
5509 
5510     CarrySumPrevDstIdx = CarrySum;
5511     DstRegs[DstIdx] = FactorSum;
5512     Factors.clear();
5513   }
5514 }
5515 
5516 LegalizerHelper::LegalizeResult
5517 LegalizerHelper::narrowScalarAddSub(MachineInstr &MI, unsigned TypeIdx,
5518                                     LLT NarrowTy) {
5519   if (TypeIdx != 0)
5520     return UnableToLegalize;
5521 
5522   Register DstReg = MI.getOperand(0).getReg();
5523   LLT DstType = MRI.getType(DstReg);
5524   // FIXME: add support for vector types
5525   if (DstType.isVector())
5526     return UnableToLegalize;
5527 
5528   unsigned Opcode = MI.getOpcode();
5529   unsigned OpO, OpE, OpF;
5530   switch (Opcode) {
5531   case TargetOpcode::G_SADDO:
5532   case TargetOpcode::G_SADDE:
5533   case TargetOpcode::G_UADDO:
5534   case TargetOpcode::G_UADDE:
5535   case TargetOpcode::G_ADD:
5536     OpO = TargetOpcode::G_UADDO;
5537     OpE = TargetOpcode::G_UADDE;
5538     OpF = TargetOpcode::G_UADDE;
5539     if (Opcode == TargetOpcode::G_SADDO || Opcode == TargetOpcode::G_SADDE)
5540       OpF = TargetOpcode::G_SADDE;
5541     break;
5542   case TargetOpcode::G_SSUBO:
5543   case TargetOpcode::G_SSUBE:
5544   case TargetOpcode::G_USUBO:
5545   case TargetOpcode::G_USUBE:
5546   case TargetOpcode::G_SUB:
5547     OpO = TargetOpcode::G_USUBO;
5548     OpE = TargetOpcode::G_USUBE;
5549     OpF = TargetOpcode::G_USUBE;
5550     if (Opcode == TargetOpcode::G_SSUBO || Opcode == TargetOpcode::G_SSUBE)
5551       OpF = TargetOpcode::G_SSUBE;
5552     break;
5553   default:
5554     llvm_unreachable("Unexpected add/sub opcode!");
5555   }
5556 
5557   // 1 for a plain add/sub, 2 if this is an operation with a carry-out.
5558   unsigned NumDefs = MI.getNumExplicitDefs();
5559   Register Src1 = MI.getOperand(NumDefs).getReg();
5560   Register Src2 = MI.getOperand(NumDefs + 1).getReg();
5561   Register CarryDst, CarryIn;
5562   if (NumDefs == 2)
5563     CarryDst = MI.getOperand(1).getReg();
5564   if (MI.getNumOperands() == NumDefs + 3)
5565     CarryIn = MI.getOperand(NumDefs + 2).getReg();
5566 
5567   LLT RegTy = MRI.getType(MI.getOperand(0).getReg());
5568   LLT LeftoverTy, DummyTy;
5569   SmallVector<Register, 2> Src1Regs, Src2Regs, Src1Left, Src2Left, DstRegs;
5570   extractParts(Src1, RegTy, NarrowTy, LeftoverTy, Src1Regs, Src1Left,
5571                MIRBuilder, MRI);
5572   extractParts(Src2, RegTy, NarrowTy, DummyTy, Src2Regs, Src2Left, MIRBuilder,
5573                MRI);
5574 
5575   int NarrowParts = Src1Regs.size();
5576   for (int I = 0, E = Src1Left.size(); I != E; ++I) {
5577     Src1Regs.push_back(Src1Left[I]);
5578     Src2Regs.push_back(Src2Left[I]);
5579   }
5580   DstRegs.reserve(Src1Regs.size());
5581 
5582   for (int i = 0, e = Src1Regs.size(); i != e; ++i) {
5583     Register DstReg =
5584         MRI.createGenericVirtualRegister(MRI.getType(Src1Regs[i]));
5585     Register CarryOut = MRI.createGenericVirtualRegister(LLT::scalar(1));
5586     // Forward the final carry-out to the destination register
5587     if (i == e - 1 && CarryDst)
5588       CarryOut = CarryDst;
5589 
5590     if (!CarryIn) {
5591       MIRBuilder.buildInstr(OpO, {DstReg, CarryOut},
5592                             {Src1Regs[i], Src2Regs[i]});
5593     } else if (i == e - 1) {
5594       MIRBuilder.buildInstr(OpF, {DstReg, CarryOut},
5595                             {Src1Regs[i], Src2Regs[i], CarryIn});
5596     } else {
5597       MIRBuilder.buildInstr(OpE, {DstReg, CarryOut},
5598                             {Src1Regs[i], Src2Regs[i], CarryIn});
5599     }
5600 
5601     DstRegs.push_back(DstReg);
5602     CarryIn = CarryOut;
5603   }
5604   insertParts(MI.getOperand(0).getReg(), RegTy, NarrowTy,
5605               ArrayRef(DstRegs).take_front(NarrowParts), LeftoverTy,
5606               ArrayRef(DstRegs).drop_front(NarrowParts));
5607 
5608   MI.eraseFromParent();
5609   return Legalized;
5610 }
5611 
5612 LegalizerHelper::LegalizeResult
5613 LegalizerHelper::narrowScalarMul(MachineInstr &MI, LLT NarrowTy) {
5614   auto [DstReg, Src1, Src2] = MI.getFirst3Regs();
5615 
5616   LLT Ty = MRI.getType(DstReg);
5617   if (Ty.isVector())
5618     return UnableToLegalize;
5619 
5620   unsigned Size = Ty.getSizeInBits();
5621   unsigned NarrowSize = NarrowTy.getSizeInBits();
5622   if (Size % NarrowSize != 0)
5623     return UnableToLegalize;
5624 
5625   unsigned NumParts = Size / NarrowSize;
5626   bool IsMulHigh = MI.getOpcode() == TargetOpcode::G_UMULH;
5627   unsigned DstTmpParts = NumParts * (IsMulHigh ? 2 : 1);
5628 
5629   SmallVector<Register, 2> Src1Parts, Src2Parts;
5630   SmallVector<Register, 2> DstTmpRegs(DstTmpParts);
5631   extractParts(Src1, NarrowTy, NumParts, Src1Parts, MIRBuilder, MRI);
5632   extractParts(Src2, NarrowTy, NumParts, Src2Parts, MIRBuilder, MRI);
5633   multiplyRegisters(DstTmpRegs, Src1Parts, Src2Parts, NarrowTy);
5634 
5635   // Take only high half of registers if this is high mul.
5636   ArrayRef<Register> DstRegs(&DstTmpRegs[DstTmpParts - NumParts], NumParts);
5637   MIRBuilder.buildMergeLikeInstr(DstReg, DstRegs);
5638   MI.eraseFromParent();
5639   return Legalized;
5640 }
5641 
5642 LegalizerHelper::LegalizeResult
5643 LegalizerHelper::narrowScalarFPTOI(MachineInstr &MI, unsigned TypeIdx,
5644                                    LLT NarrowTy) {
5645   if (TypeIdx != 0)
5646     return UnableToLegalize;
5647 
5648   bool IsSigned = MI.getOpcode() == TargetOpcode::G_FPTOSI;
5649 
5650   Register Src = MI.getOperand(1).getReg();
5651   LLT SrcTy = MRI.getType(Src);
5652 
5653   // If all finite floats fit into the narrowed integer type, we can just swap
5654   // out the result type. This is practically only useful for conversions from
5655   // half to at least 16-bits, so just handle the one case.
5656   if (SrcTy.getScalarType() != LLT::scalar(16) ||
5657       NarrowTy.getScalarSizeInBits() < (IsSigned ? 17u : 16u))
5658     return UnableToLegalize;
5659 
5660   Observer.changingInstr(MI);
5661   narrowScalarDst(MI, NarrowTy, 0,
5662                   IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT);
5663   Observer.changedInstr(MI);
5664   return Legalized;
5665 }
5666 
5667 LegalizerHelper::LegalizeResult
5668 LegalizerHelper::narrowScalarExtract(MachineInstr &MI, unsigned TypeIdx,
5669                                      LLT NarrowTy) {
5670   if (TypeIdx != 1)
5671     return UnableToLegalize;
5672 
5673   uint64_t NarrowSize = NarrowTy.getSizeInBits();
5674 
5675   int64_t SizeOp1 = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
5676   // FIXME: add support for when SizeOp1 isn't an exact multiple of
5677   // NarrowSize.
5678   if (SizeOp1 % NarrowSize != 0)
5679     return UnableToLegalize;
5680   int NumParts = SizeOp1 / NarrowSize;
5681 
5682   SmallVector<Register, 2> SrcRegs, DstRegs;
5683   SmallVector<uint64_t, 2> Indexes;
5684   extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, SrcRegs,
5685                MIRBuilder, MRI);
5686 
5687   Register OpReg = MI.getOperand(0).getReg();
5688   uint64_t OpStart = MI.getOperand(2).getImm();
5689   uint64_t OpSize = MRI.getType(OpReg).getSizeInBits();
5690   for (int i = 0; i < NumParts; ++i) {
5691     unsigned SrcStart = i * NarrowSize;
5692 
5693     if (SrcStart + NarrowSize <= OpStart || SrcStart >= OpStart + OpSize) {
5694       // No part of the extract uses this subregister, ignore it.
5695       continue;
5696     } else if (SrcStart == OpStart && NarrowTy == MRI.getType(OpReg)) {
5697       // The entire subregister is extracted, forward the value.
5698       DstRegs.push_back(SrcRegs[i]);
5699       continue;
5700     }
5701 
5702     // OpSegStart is where this destination segment would start in OpReg if it
5703     // extended infinitely in both directions.
5704     int64_t ExtractOffset;
5705     uint64_t SegSize;
5706     if (OpStart < SrcStart) {
5707       ExtractOffset = 0;
5708       SegSize = std::min(NarrowSize, OpStart + OpSize - SrcStart);
5709     } else {
5710       ExtractOffset = OpStart - SrcStart;
5711       SegSize = std::min(SrcStart + NarrowSize - OpStart, OpSize);
5712     }
5713 
5714     Register SegReg = SrcRegs[i];
5715     if (ExtractOffset != 0 || SegSize != NarrowSize) {
5716       // A genuine extract is needed.
5717       SegReg = MRI.createGenericVirtualRegister(LLT::scalar(SegSize));
5718       MIRBuilder.buildExtract(SegReg, SrcRegs[i], ExtractOffset);
5719     }
5720 
5721     DstRegs.push_back(SegReg);
5722   }
5723 
5724   Register DstReg = MI.getOperand(0).getReg();
5725   if (MRI.getType(DstReg).isVector())
5726     MIRBuilder.buildBuildVector(DstReg, DstRegs);
5727   else if (DstRegs.size() > 1)
5728     MIRBuilder.buildMergeLikeInstr(DstReg, DstRegs);
5729   else
5730     MIRBuilder.buildCopy(DstReg, DstRegs[0]);
5731   MI.eraseFromParent();
5732   return Legalized;
5733 }
5734 
5735 LegalizerHelper::LegalizeResult
5736 LegalizerHelper::narrowScalarInsert(MachineInstr &MI, unsigned TypeIdx,
5737                                     LLT NarrowTy) {
5738   // FIXME: Don't know how to handle secondary types yet.
5739   if (TypeIdx != 0)
5740     return UnableToLegalize;
5741 
5742   SmallVector<Register, 2> SrcRegs, LeftoverRegs, DstRegs;
5743   SmallVector<uint64_t, 2> Indexes;
5744   LLT RegTy = MRI.getType(MI.getOperand(0).getReg());
5745   LLT LeftoverTy;
5746   extractParts(MI.getOperand(1).getReg(), RegTy, NarrowTy, LeftoverTy, SrcRegs,
5747                LeftoverRegs, MIRBuilder, MRI);
5748 
5749   for (Register Reg : LeftoverRegs)
5750     SrcRegs.push_back(Reg);
5751 
5752   uint64_t NarrowSize = NarrowTy.getSizeInBits();
5753   Register OpReg = MI.getOperand(2).getReg();
5754   uint64_t OpStart = MI.getOperand(3).getImm();
5755   uint64_t OpSize = MRI.getType(OpReg).getSizeInBits();
5756   for (int I = 0, E = SrcRegs.size(); I != E; ++I) {
5757     unsigned DstStart = I * NarrowSize;
5758 
5759     if (DstStart == OpStart && NarrowTy == MRI.getType(OpReg)) {
5760       // The entire subregister is defined by this insert, forward the new
5761       // value.
5762       DstRegs.push_back(OpReg);
5763       continue;
5764     }
5765 
5766     Register SrcReg = SrcRegs[I];
5767     if (MRI.getType(SrcRegs[I]) == LeftoverTy) {
5768       // The leftover reg is smaller than NarrowTy, so we need to extend it.
5769       SrcReg = MRI.createGenericVirtualRegister(NarrowTy);
5770       MIRBuilder.buildAnyExt(SrcReg, SrcRegs[I]);
5771     }
5772 
5773     if (DstStart + NarrowSize <= OpStart || DstStart >= OpStart + OpSize) {
5774       // No part of the insert affects this subregister, forward the original.
5775       DstRegs.push_back(SrcReg);
5776       continue;
5777     }
5778 
5779     // OpSegStart is where this destination segment would start in OpReg if it
5780     // extended infinitely in both directions.
5781     int64_t ExtractOffset, InsertOffset;
5782     uint64_t SegSize;
5783     if (OpStart < DstStart) {
5784       InsertOffset = 0;
5785       ExtractOffset = DstStart - OpStart;
5786       SegSize = std::min(NarrowSize, OpStart + OpSize - DstStart);
5787     } else {
5788       InsertOffset = OpStart - DstStart;
5789       ExtractOffset = 0;
5790       SegSize =
5791         std::min(NarrowSize - InsertOffset, OpStart + OpSize - DstStart);
5792     }
5793 
5794     Register SegReg = OpReg;
5795     if (ExtractOffset != 0 || SegSize != OpSize) {
5796       // A genuine extract is needed.
5797       SegReg = MRI.createGenericVirtualRegister(LLT::scalar(SegSize));
5798       MIRBuilder.buildExtract(SegReg, OpReg, ExtractOffset);
5799     }
5800 
5801     Register DstReg = MRI.createGenericVirtualRegister(NarrowTy);
5802     MIRBuilder.buildInsert(DstReg, SrcReg, SegReg, InsertOffset);
5803     DstRegs.push_back(DstReg);
5804   }
5805 
5806   uint64_t WideSize = DstRegs.size() * NarrowSize;
5807   Register DstReg = MI.getOperand(0).getReg();
5808   if (WideSize > RegTy.getSizeInBits()) {
5809     Register MergeReg = MRI.createGenericVirtualRegister(LLT::scalar(WideSize));
5810     MIRBuilder.buildMergeLikeInstr(MergeReg, DstRegs);
5811     MIRBuilder.buildTrunc(DstReg, MergeReg);
5812   } else
5813     MIRBuilder.buildMergeLikeInstr(DstReg, DstRegs);
5814 
5815   MI.eraseFromParent();
5816   return Legalized;
5817 }
5818 
5819 LegalizerHelper::LegalizeResult
5820 LegalizerHelper::narrowScalarBasic(MachineInstr &MI, unsigned TypeIdx,
5821                                    LLT NarrowTy) {
5822   Register DstReg = MI.getOperand(0).getReg();
5823   LLT DstTy = MRI.getType(DstReg);
5824 
5825   assert(MI.getNumOperands() == 3 && TypeIdx == 0);
5826 
5827   SmallVector<Register, 4> DstRegs, DstLeftoverRegs;
5828   SmallVector<Register, 4> Src0Regs, Src0LeftoverRegs;
5829   SmallVector<Register, 4> Src1Regs, Src1LeftoverRegs;
5830   LLT LeftoverTy;
5831   if (!extractParts(MI.getOperand(1).getReg(), DstTy, NarrowTy, LeftoverTy,
5832                     Src0Regs, Src0LeftoverRegs, MIRBuilder, MRI))
5833     return UnableToLegalize;
5834 
5835   LLT Unused;
5836   if (!extractParts(MI.getOperand(2).getReg(), DstTy, NarrowTy, Unused,
5837                     Src1Regs, Src1LeftoverRegs, MIRBuilder, MRI))
5838     llvm_unreachable("inconsistent extractParts result");
5839 
5840   for (unsigned I = 0, E = Src1Regs.size(); I != E; ++I) {
5841     auto Inst = MIRBuilder.buildInstr(MI.getOpcode(), {NarrowTy},
5842                                         {Src0Regs[I], Src1Regs[I]});
5843     DstRegs.push_back(Inst.getReg(0));
5844   }
5845 
5846   for (unsigned I = 0, E = Src1LeftoverRegs.size(); I != E; ++I) {
5847     auto Inst = MIRBuilder.buildInstr(
5848       MI.getOpcode(),
5849       {LeftoverTy}, {Src0LeftoverRegs[I], Src1LeftoverRegs[I]});
5850     DstLeftoverRegs.push_back(Inst.getReg(0));
5851   }
5852 
5853   insertParts(DstReg, DstTy, NarrowTy, DstRegs,
5854               LeftoverTy, DstLeftoverRegs);
5855 
5856   MI.eraseFromParent();
5857   return Legalized;
5858 }
5859 
5860 LegalizerHelper::LegalizeResult
5861 LegalizerHelper::narrowScalarExt(MachineInstr &MI, unsigned TypeIdx,
5862                                  LLT NarrowTy) {
5863   if (TypeIdx != 0)
5864     return UnableToLegalize;
5865 
5866   auto [DstReg, SrcReg] = MI.getFirst2Regs();
5867 
5868   LLT DstTy = MRI.getType(DstReg);
5869   if (DstTy.isVector())
5870     return UnableToLegalize;
5871 
5872   SmallVector<Register, 8> Parts;
5873   LLT GCDTy = extractGCDType(Parts, DstTy, NarrowTy, SrcReg);
5874   LLT LCMTy = buildLCMMergePieces(DstTy, NarrowTy, GCDTy, Parts, MI.getOpcode());
5875   buildWidenedRemergeToDst(DstReg, LCMTy, Parts);
5876 
5877   MI.eraseFromParent();
5878   return Legalized;
5879 }
5880 
5881 LegalizerHelper::LegalizeResult
5882 LegalizerHelper::narrowScalarSelect(MachineInstr &MI, unsigned TypeIdx,
5883                                     LLT NarrowTy) {
5884   if (TypeIdx != 0)
5885     return UnableToLegalize;
5886 
5887   Register CondReg = MI.getOperand(1).getReg();
5888   LLT CondTy = MRI.getType(CondReg);
5889   if (CondTy.isVector()) // TODO: Handle vselect
5890     return UnableToLegalize;
5891 
5892   Register DstReg = MI.getOperand(0).getReg();
5893   LLT DstTy = MRI.getType(DstReg);
5894 
5895   SmallVector<Register, 4> DstRegs, DstLeftoverRegs;
5896   SmallVector<Register, 4> Src1Regs, Src1LeftoverRegs;
5897   SmallVector<Register, 4> Src2Regs, Src2LeftoverRegs;
5898   LLT LeftoverTy;
5899   if (!extractParts(MI.getOperand(2).getReg(), DstTy, NarrowTy, LeftoverTy,
5900                     Src1Regs, Src1LeftoverRegs, MIRBuilder, MRI))
5901     return UnableToLegalize;
5902 
5903   LLT Unused;
5904   if (!extractParts(MI.getOperand(3).getReg(), DstTy, NarrowTy, Unused,
5905                     Src2Regs, Src2LeftoverRegs, MIRBuilder, MRI))
5906     llvm_unreachable("inconsistent extractParts result");
5907 
5908   for (unsigned I = 0, E = Src1Regs.size(); I != E; ++I) {
5909     auto Select = MIRBuilder.buildSelect(NarrowTy,
5910                                          CondReg, Src1Regs[I], Src2Regs[I]);
5911     DstRegs.push_back(Select.getReg(0));
5912   }
5913 
5914   for (unsigned I = 0, E = Src1LeftoverRegs.size(); I != E; ++I) {
5915     auto Select = MIRBuilder.buildSelect(
5916       LeftoverTy, CondReg, Src1LeftoverRegs[I], Src2LeftoverRegs[I]);
5917     DstLeftoverRegs.push_back(Select.getReg(0));
5918   }
5919 
5920   insertParts(DstReg, DstTy, NarrowTy, DstRegs,
5921               LeftoverTy, DstLeftoverRegs);
5922 
5923   MI.eraseFromParent();
5924   return Legalized;
5925 }
5926 
5927 LegalizerHelper::LegalizeResult
5928 LegalizerHelper::narrowScalarCTLZ(MachineInstr &MI, unsigned TypeIdx,
5929                                   LLT NarrowTy) {
5930   if (TypeIdx != 1)
5931     return UnableToLegalize;
5932 
5933   auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
5934   unsigned NarrowSize = NarrowTy.getSizeInBits();
5935 
5936   if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
5937     const bool IsUndef = MI.getOpcode() == TargetOpcode::G_CTLZ_ZERO_UNDEF;
5938 
5939     MachineIRBuilder &B = MIRBuilder;
5940     auto UnmergeSrc = B.buildUnmerge(NarrowTy, SrcReg);
5941     // ctlz(Hi:Lo) -> Hi == 0 ? (NarrowSize + ctlz(Lo)) : ctlz(Hi)
5942     auto C_0 = B.buildConstant(NarrowTy, 0);
5943     auto HiIsZero = B.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1),
5944                                 UnmergeSrc.getReg(1), C_0);
5945     auto LoCTLZ = IsUndef ?
5946       B.buildCTLZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(0)) :
5947       B.buildCTLZ(DstTy, UnmergeSrc.getReg(0));
5948     auto C_NarrowSize = B.buildConstant(DstTy, NarrowSize);
5949     auto HiIsZeroCTLZ = B.buildAdd(DstTy, LoCTLZ, C_NarrowSize);
5950     auto HiCTLZ = B.buildCTLZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(1));
5951     B.buildSelect(DstReg, HiIsZero, HiIsZeroCTLZ, HiCTLZ);
5952 
5953     MI.eraseFromParent();
5954     return Legalized;
5955   }
5956 
5957   return UnableToLegalize;
5958 }
5959 
5960 LegalizerHelper::LegalizeResult
5961 LegalizerHelper::narrowScalarCTTZ(MachineInstr &MI, unsigned TypeIdx,
5962                                   LLT NarrowTy) {
5963   if (TypeIdx != 1)
5964     return UnableToLegalize;
5965 
5966   auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
5967   unsigned NarrowSize = NarrowTy.getSizeInBits();
5968 
5969   if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
5970     const bool IsUndef = MI.getOpcode() == TargetOpcode::G_CTTZ_ZERO_UNDEF;
5971 
5972     MachineIRBuilder &B = MIRBuilder;
5973     auto UnmergeSrc = B.buildUnmerge(NarrowTy, SrcReg);
5974     // cttz(Hi:Lo) -> Lo == 0 ? (cttz(Hi) + NarrowSize) : cttz(Lo)
5975     auto C_0 = B.buildConstant(NarrowTy, 0);
5976     auto LoIsZero = B.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1),
5977                                 UnmergeSrc.getReg(0), C_0);
5978     auto HiCTTZ = IsUndef ?
5979       B.buildCTTZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(1)) :
5980       B.buildCTTZ(DstTy, UnmergeSrc.getReg(1));
5981     auto C_NarrowSize = B.buildConstant(DstTy, NarrowSize);
5982     auto LoIsZeroCTTZ = B.buildAdd(DstTy, HiCTTZ, C_NarrowSize);
5983     auto LoCTTZ = B.buildCTTZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(0));
5984     B.buildSelect(DstReg, LoIsZero, LoIsZeroCTTZ, LoCTTZ);
5985 
5986     MI.eraseFromParent();
5987     return Legalized;
5988   }
5989 
5990   return UnableToLegalize;
5991 }
5992 
5993 LegalizerHelper::LegalizeResult
5994 LegalizerHelper::narrowScalarCTPOP(MachineInstr &MI, unsigned TypeIdx,
5995                                    LLT NarrowTy) {
5996   if (TypeIdx != 1)
5997     return UnableToLegalize;
5998 
5999   auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
6000   unsigned NarrowSize = NarrowTy.getSizeInBits();
6001 
6002   if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
6003     auto UnmergeSrc = MIRBuilder.buildUnmerge(NarrowTy, MI.getOperand(1));
6004 
6005     auto LoCTPOP = MIRBuilder.buildCTPOP(DstTy, UnmergeSrc.getReg(0));
6006     auto HiCTPOP = MIRBuilder.buildCTPOP(DstTy, UnmergeSrc.getReg(1));
6007     MIRBuilder.buildAdd(DstReg, HiCTPOP, LoCTPOP);
6008 
6009     MI.eraseFromParent();
6010     return Legalized;
6011   }
6012 
6013   return UnableToLegalize;
6014 }
6015 
6016 LegalizerHelper::LegalizeResult
6017 LegalizerHelper::narrowScalarFLDEXP(MachineInstr &MI, unsigned TypeIdx,
6018                                     LLT NarrowTy) {
6019   if (TypeIdx != 1)
6020     return UnableToLegalize;
6021 
6022   MachineIRBuilder &B = MIRBuilder;
6023   Register ExpReg = MI.getOperand(2).getReg();
6024   LLT ExpTy = MRI.getType(ExpReg);
6025 
6026   unsigned ClampSize = NarrowTy.getScalarSizeInBits();
6027 
6028   // Clamp the exponent to the range of the target type.
6029   auto MinExp = B.buildConstant(ExpTy, minIntN(ClampSize));
6030   auto ClampMin = B.buildSMax(ExpTy, ExpReg, MinExp);
6031   auto MaxExp = B.buildConstant(ExpTy, maxIntN(ClampSize));
6032   auto Clamp = B.buildSMin(ExpTy, ClampMin, MaxExp);
6033 
6034   auto Trunc = B.buildTrunc(NarrowTy, Clamp);
6035   Observer.changingInstr(MI);
6036   MI.getOperand(2).setReg(Trunc.getReg(0));
6037   Observer.changedInstr(MI);
6038   return Legalized;
6039 }
6040 
6041 LegalizerHelper::LegalizeResult
6042 LegalizerHelper::lowerBitCount(MachineInstr &MI) {
6043   unsigned Opc = MI.getOpcode();
6044   const auto &TII = MIRBuilder.getTII();
6045   auto isSupported = [this](const LegalityQuery &Q) {
6046     auto QAction = LI.getAction(Q).Action;
6047     return QAction == Legal || QAction == Libcall || QAction == Custom;
6048   };
6049   switch (Opc) {
6050   default:
6051     return UnableToLegalize;
6052   case TargetOpcode::G_CTLZ_ZERO_UNDEF: {
6053     // This trivially expands to CTLZ.
6054     Observer.changingInstr(MI);
6055     MI.setDesc(TII.get(TargetOpcode::G_CTLZ));
6056     Observer.changedInstr(MI);
6057     return Legalized;
6058   }
6059   case TargetOpcode::G_CTLZ: {
6060     auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
6061     unsigned Len = SrcTy.getSizeInBits();
6062 
6063     if (isSupported({TargetOpcode::G_CTLZ_ZERO_UNDEF, {DstTy, SrcTy}})) {
6064       // If CTLZ_ZERO_UNDEF is supported, emit that and a select for zero.
6065       auto CtlzZU = MIRBuilder.buildCTLZ_ZERO_UNDEF(DstTy, SrcReg);
6066       auto ZeroSrc = MIRBuilder.buildConstant(SrcTy, 0);
6067       auto ICmp = MIRBuilder.buildICmp(
6068           CmpInst::ICMP_EQ, SrcTy.changeElementSize(1), SrcReg, ZeroSrc);
6069       auto LenConst = MIRBuilder.buildConstant(DstTy, Len);
6070       MIRBuilder.buildSelect(DstReg, ICmp, LenConst, CtlzZU);
6071       MI.eraseFromParent();
6072       return Legalized;
6073     }
6074     // for now, we do this:
6075     // NewLen = NextPowerOf2(Len);
6076     // x = x | (x >> 1);
6077     // x = x | (x >> 2);
6078     // ...
6079     // x = x | (x >>16);
6080     // x = x | (x >>32); // for 64-bit input
6081     // Upto NewLen/2
6082     // return Len - popcount(x);
6083     //
6084     // Ref: "Hacker's Delight" by Henry Warren
6085     Register Op = SrcReg;
6086     unsigned NewLen = PowerOf2Ceil(Len);
6087     for (unsigned i = 0; (1U << i) <= (NewLen / 2); ++i) {
6088       auto MIBShiftAmt = MIRBuilder.buildConstant(SrcTy, 1ULL << i);
6089       auto MIBOp = MIRBuilder.buildOr(
6090           SrcTy, Op, MIRBuilder.buildLShr(SrcTy, Op, MIBShiftAmt));
6091       Op = MIBOp.getReg(0);
6092     }
6093     auto MIBPop = MIRBuilder.buildCTPOP(DstTy, Op);
6094     MIRBuilder.buildSub(MI.getOperand(0), MIRBuilder.buildConstant(DstTy, Len),
6095                         MIBPop);
6096     MI.eraseFromParent();
6097     return Legalized;
6098   }
6099   case TargetOpcode::G_CTTZ_ZERO_UNDEF: {
6100     // This trivially expands to CTTZ.
6101     Observer.changingInstr(MI);
6102     MI.setDesc(TII.get(TargetOpcode::G_CTTZ));
6103     Observer.changedInstr(MI);
6104     return Legalized;
6105   }
6106   case TargetOpcode::G_CTTZ: {
6107     auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
6108 
6109     unsigned Len = SrcTy.getSizeInBits();
6110     if (isSupported({TargetOpcode::G_CTTZ_ZERO_UNDEF, {DstTy, SrcTy}})) {
6111       // If CTTZ_ZERO_UNDEF is legal or custom, emit that and a select with
6112       // zero.
6113       auto CttzZU = MIRBuilder.buildCTTZ_ZERO_UNDEF(DstTy, SrcReg);
6114       auto Zero = MIRBuilder.buildConstant(SrcTy, 0);
6115       auto ICmp = MIRBuilder.buildICmp(
6116           CmpInst::ICMP_EQ, DstTy.changeElementSize(1), SrcReg, Zero);
6117       auto LenConst = MIRBuilder.buildConstant(DstTy, Len);
6118       MIRBuilder.buildSelect(DstReg, ICmp, LenConst, CttzZU);
6119       MI.eraseFromParent();
6120       return Legalized;
6121     }
6122     // for now, we use: { return popcount(~x & (x - 1)); }
6123     // unless the target has ctlz but not ctpop, in which case we use:
6124     // { return 32 - nlz(~x & (x-1)); }
6125     // Ref: "Hacker's Delight" by Henry Warren
6126     auto MIBCstNeg1 = MIRBuilder.buildConstant(SrcTy, -1);
6127     auto MIBNot = MIRBuilder.buildXor(SrcTy, SrcReg, MIBCstNeg1);
6128     auto MIBTmp = MIRBuilder.buildAnd(
6129         SrcTy, MIBNot, MIRBuilder.buildAdd(SrcTy, SrcReg, MIBCstNeg1));
6130     if (!isSupported({TargetOpcode::G_CTPOP, {SrcTy, SrcTy}}) &&
6131         isSupported({TargetOpcode::G_CTLZ, {SrcTy, SrcTy}})) {
6132       auto MIBCstLen = MIRBuilder.buildConstant(SrcTy, Len);
6133       MIRBuilder.buildSub(MI.getOperand(0), MIBCstLen,
6134                           MIRBuilder.buildCTLZ(SrcTy, MIBTmp));
6135       MI.eraseFromParent();
6136       return Legalized;
6137     }
6138     Observer.changingInstr(MI);
6139     MI.setDesc(TII.get(TargetOpcode::G_CTPOP));
6140     MI.getOperand(1).setReg(MIBTmp.getReg(0));
6141     Observer.changedInstr(MI);
6142     return Legalized;
6143   }
6144   case TargetOpcode::G_CTPOP: {
6145     Register SrcReg = MI.getOperand(1).getReg();
6146     LLT Ty = MRI.getType(SrcReg);
6147     unsigned Size = Ty.getSizeInBits();
6148     MachineIRBuilder &B = MIRBuilder;
6149 
6150     // Count set bits in blocks of 2 bits. Default approach would be
6151     // B2Count = { val & 0x55555555 } + { (val >> 1) & 0x55555555 }
6152     // We use following formula instead:
6153     // B2Count = val - { (val >> 1) & 0x55555555 }
6154     // since it gives same result in blocks of 2 with one instruction less.
6155     auto C_1 = B.buildConstant(Ty, 1);
6156     auto B2Set1LoTo1Hi = B.buildLShr(Ty, SrcReg, C_1);
6157     APInt B2Mask1HiTo0 = APInt::getSplat(Size, APInt(8, 0x55));
6158     auto C_B2Mask1HiTo0 = B.buildConstant(Ty, B2Mask1HiTo0);
6159     auto B2Count1Hi = B.buildAnd(Ty, B2Set1LoTo1Hi, C_B2Mask1HiTo0);
6160     auto B2Count = B.buildSub(Ty, SrcReg, B2Count1Hi);
6161 
6162     // In order to get count in blocks of 4 add values from adjacent block of 2.
6163     // B4Count = { B2Count & 0x33333333 } + { (B2Count >> 2) & 0x33333333 }
6164     auto C_2 = B.buildConstant(Ty, 2);
6165     auto B4Set2LoTo2Hi = B.buildLShr(Ty, B2Count, C_2);
6166     APInt B4Mask2HiTo0 = APInt::getSplat(Size, APInt(8, 0x33));
6167     auto C_B4Mask2HiTo0 = B.buildConstant(Ty, B4Mask2HiTo0);
6168     auto B4HiB2Count = B.buildAnd(Ty, B4Set2LoTo2Hi, C_B4Mask2HiTo0);
6169     auto B4LoB2Count = B.buildAnd(Ty, B2Count, C_B4Mask2HiTo0);
6170     auto B4Count = B.buildAdd(Ty, B4HiB2Count, B4LoB2Count);
6171 
6172     // For count in blocks of 8 bits we don't have to mask high 4 bits before
6173     // addition since count value sits in range {0,...,8} and 4 bits are enough
6174     // to hold such binary values. After addition high 4 bits still hold count
6175     // of set bits in high 4 bit block, set them to zero and get 8 bit result.
6176     // B8Count = { B4Count + (B4Count >> 4) } & 0x0F0F0F0F
6177     auto C_4 = B.buildConstant(Ty, 4);
6178     auto B8HiB4Count = B.buildLShr(Ty, B4Count, C_4);
6179     auto B8CountDirty4Hi = B.buildAdd(Ty, B8HiB4Count, B4Count);
6180     APInt B8Mask4HiTo0 = APInt::getSplat(Size, APInt(8, 0x0F));
6181     auto C_B8Mask4HiTo0 = B.buildConstant(Ty, B8Mask4HiTo0);
6182     auto B8Count = B.buildAnd(Ty, B8CountDirty4Hi, C_B8Mask4HiTo0);
6183 
6184     assert(Size<=128 && "Scalar size is too large for CTPOP lower algorithm");
6185     // 8 bits can hold CTPOP result of 128 bit int or smaller. Mul with this
6186     // bitmask will set 8 msb in ResTmp to sum of all B8Counts in 8 bit blocks.
6187     auto MulMask = B.buildConstant(Ty, APInt::getSplat(Size, APInt(8, 0x01)));
6188     auto ResTmp = B.buildMul(Ty, B8Count, MulMask);
6189 
6190     // Shift count result from 8 high bits to low bits.
6191     auto C_SizeM8 = B.buildConstant(Ty, Size - 8);
6192     B.buildLShr(MI.getOperand(0).getReg(), ResTmp, C_SizeM8);
6193 
6194     MI.eraseFromParent();
6195     return Legalized;
6196   }
6197   }
6198 }
6199 
6200 // Check that (every element of) Reg is undef or not an exact multiple of BW.
6201 static bool isNonZeroModBitWidthOrUndef(const MachineRegisterInfo &MRI,
6202                                         Register Reg, unsigned BW) {
6203   return matchUnaryPredicate(
6204       MRI, Reg,
6205       [=](const Constant *C) {
6206         // Null constant here means an undef.
6207         const ConstantInt *CI = dyn_cast_or_null<ConstantInt>(C);
6208         return !CI || CI->getValue().urem(BW) != 0;
6209       },
6210       /*AllowUndefs*/ true);
6211 }
6212 
6213 LegalizerHelper::LegalizeResult
6214 LegalizerHelper::lowerFunnelShiftWithInverse(MachineInstr &MI) {
6215   auto [Dst, X, Y, Z] = MI.getFirst4Regs();
6216   LLT Ty = MRI.getType(Dst);
6217   LLT ShTy = MRI.getType(Z);
6218 
6219   unsigned BW = Ty.getScalarSizeInBits();
6220 
6221   if (!isPowerOf2_32(BW))
6222     return UnableToLegalize;
6223 
6224   const bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;
6225   unsigned RevOpcode = IsFSHL ? TargetOpcode::G_FSHR : TargetOpcode::G_FSHL;
6226 
6227   if (isNonZeroModBitWidthOrUndef(MRI, Z, BW)) {
6228     // fshl X, Y, Z -> fshr X, Y, -Z
6229     // fshr X, Y, Z -> fshl X, Y, -Z
6230     auto Zero = MIRBuilder.buildConstant(ShTy, 0);
6231     Z = MIRBuilder.buildSub(Ty, Zero, Z).getReg(0);
6232   } else {
6233     // fshl X, Y, Z -> fshr (srl X, 1), (fshr X, Y, 1), ~Z
6234     // fshr X, Y, Z -> fshl (fshl X, Y, 1), (shl Y, 1), ~Z
6235     auto One = MIRBuilder.buildConstant(ShTy, 1);
6236     if (IsFSHL) {
6237       Y = MIRBuilder.buildInstr(RevOpcode, {Ty}, {X, Y, One}).getReg(0);
6238       X = MIRBuilder.buildLShr(Ty, X, One).getReg(0);
6239     } else {
6240       X = MIRBuilder.buildInstr(RevOpcode, {Ty}, {X, Y, One}).getReg(0);
6241       Y = MIRBuilder.buildShl(Ty, Y, One).getReg(0);
6242     }
6243 
6244     Z = MIRBuilder.buildNot(ShTy, Z).getReg(0);
6245   }
6246 
6247   MIRBuilder.buildInstr(RevOpcode, {Dst}, {X, Y, Z});
6248   MI.eraseFromParent();
6249   return Legalized;
6250 }
6251 
6252 LegalizerHelper::LegalizeResult
6253 LegalizerHelper::lowerFunnelShiftAsShifts(MachineInstr &MI) {
6254   auto [Dst, X, Y, Z] = MI.getFirst4Regs();
6255   LLT Ty = MRI.getType(Dst);
6256   LLT ShTy = MRI.getType(Z);
6257 
6258   const unsigned BW = Ty.getScalarSizeInBits();
6259   const bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;
6260 
6261   Register ShX, ShY;
6262   Register ShAmt, InvShAmt;
6263 
6264   // FIXME: Emit optimized urem by constant instead of letting it expand later.
6265   if (isNonZeroModBitWidthOrUndef(MRI, Z, BW)) {
6266     // fshl: X << C | Y >> (BW - C)
6267     // fshr: X << (BW - C) | Y >> C
6268     // where C = Z % BW is not zero
6269     auto BitWidthC = MIRBuilder.buildConstant(ShTy, BW);
6270     ShAmt = MIRBuilder.buildURem(ShTy, Z, BitWidthC).getReg(0);
6271     InvShAmt = MIRBuilder.buildSub(ShTy, BitWidthC, ShAmt).getReg(0);
6272     ShX = MIRBuilder.buildShl(Ty, X, IsFSHL ? ShAmt : InvShAmt).getReg(0);
6273     ShY = MIRBuilder.buildLShr(Ty, Y, IsFSHL ? InvShAmt : ShAmt).getReg(0);
6274   } else {
6275     // fshl: X << (Z % BW) | Y >> 1 >> (BW - 1 - (Z % BW))
6276     // fshr: X << 1 << (BW - 1 - (Z % BW)) | Y >> (Z % BW)
6277     auto Mask = MIRBuilder.buildConstant(ShTy, BW - 1);
6278     if (isPowerOf2_32(BW)) {
6279       // Z % BW -> Z & (BW - 1)
6280       ShAmt = MIRBuilder.buildAnd(ShTy, Z, Mask).getReg(0);
6281       // (BW - 1) - (Z % BW) -> ~Z & (BW - 1)
6282       auto NotZ = MIRBuilder.buildNot(ShTy, Z);
6283       InvShAmt = MIRBuilder.buildAnd(ShTy, NotZ, Mask).getReg(0);
6284     } else {
6285       auto BitWidthC = MIRBuilder.buildConstant(ShTy, BW);
6286       ShAmt = MIRBuilder.buildURem(ShTy, Z, BitWidthC).getReg(0);
6287       InvShAmt = MIRBuilder.buildSub(ShTy, Mask, ShAmt).getReg(0);
6288     }
6289 
6290     auto One = MIRBuilder.buildConstant(ShTy, 1);
6291     if (IsFSHL) {
6292       ShX = MIRBuilder.buildShl(Ty, X, ShAmt).getReg(0);
6293       auto ShY1 = MIRBuilder.buildLShr(Ty, Y, One);
6294       ShY = MIRBuilder.buildLShr(Ty, ShY1, InvShAmt).getReg(0);
6295     } else {
6296       auto ShX1 = MIRBuilder.buildShl(Ty, X, One);
6297       ShX = MIRBuilder.buildShl(Ty, ShX1, InvShAmt).getReg(0);
6298       ShY = MIRBuilder.buildLShr(Ty, Y, ShAmt).getReg(0);
6299     }
6300   }
6301 
6302   MIRBuilder.buildOr(Dst, ShX, ShY);
6303   MI.eraseFromParent();
6304   return Legalized;
6305 }
6306 
6307 LegalizerHelper::LegalizeResult
6308 LegalizerHelper::lowerFunnelShift(MachineInstr &MI) {
6309   // These operations approximately do the following (while avoiding undefined
6310   // shifts by BW):
6311   // G_FSHL: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
6312   // G_FSHR: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
6313   Register Dst = MI.getOperand(0).getReg();
6314   LLT Ty = MRI.getType(Dst);
6315   LLT ShTy = MRI.getType(MI.getOperand(3).getReg());
6316 
6317   bool IsFSHL = MI.getOpcode() == TargetOpcode::G_FSHL;
6318   unsigned RevOpcode = IsFSHL ? TargetOpcode::G_FSHR : TargetOpcode::G_FSHL;
6319 
6320   // TODO: Use smarter heuristic that accounts for vector legalization.
6321   if (LI.getAction({RevOpcode, {Ty, ShTy}}).Action == Lower)
6322     return lowerFunnelShiftAsShifts(MI);
6323 
6324   // This only works for powers of 2, fallback to shifts if it fails.
6325   LegalizerHelper::LegalizeResult Result = lowerFunnelShiftWithInverse(MI);
6326   if (Result == UnableToLegalize)
6327     return lowerFunnelShiftAsShifts(MI);
6328   return Result;
6329 }
6330 
6331 LegalizerHelper::LegalizeResult LegalizerHelper::lowerEXT(MachineInstr &MI) {
6332   auto [Dst, Src] = MI.getFirst2Regs();
6333   LLT DstTy = MRI.getType(Dst);
6334   LLT SrcTy = MRI.getType(Src);
6335 
6336   uint32_t DstTySize = DstTy.getSizeInBits();
6337   uint32_t DstTyScalarSize = DstTy.getScalarSizeInBits();
6338   uint32_t SrcTyScalarSize = SrcTy.getScalarSizeInBits();
6339 
6340   if (!isPowerOf2_32(DstTySize) || !isPowerOf2_32(DstTyScalarSize) ||
6341       !isPowerOf2_32(SrcTyScalarSize))
6342     return UnableToLegalize;
6343 
6344   // The step between extend is too large, split it by creating an intermediate
6345   // extend instruction
6346   if (SrcTyScalarSize * 2 < DstTyScalarSize) {
6347     LLT MidTy = SrcTy.changeElementSize(SrcTyScalarSize * 2);
6348     // If the destination type is illegal, split it into multiple statements
6349     // zext x -> zext(merge(zext(unmerge), zext(unmerge)))
6350     auto NewExt = MIRBuilder.buildInstr(MI.getOpcode(), {MidTy}, {Src});
6351     // Unmerge the vector
6352     LLT EltTy = MidTy.changeElementCount(
6353         MidTy.getElementCount().divideCoefficientBy(2));
6354     auto UnmergeSrc = MIRBuilder.buildUnmerge(EltTy, NewExt);
6355 
6356     // ZExt the vectors
6357     LLT ZExtResTy = DstTy.changeElementCount(
6358         DstTy.getElementCount().divideCoefficientBy(2));
6359     auto ZExtRes1 = MIRBuilder.buildInstr(MI.getOpcode(), {ZExtResTy},
6360                                           {UnmergeSrc.getReg(0)});
6361     auto ZExtRes2 = MIRBuilder.buildInstr(MI.getOpcode(), {ZExtResTy},
6362                                           {UnmergeSrc.getReg(1)});
6363 
6364     // Merge the ending vectors
6365     MIRBuilder.buildMergeLikeInstr(Dst, {ZExtRes1, ZExtRes2});
6366 
6367     MI.eraseFromParent();
6368     return Legalized;
6369   }
6370   return UnableToLegalize;
6371 }
6372 
6373 LegalizerHelper::LegalizeResult LegalizerHelper::lowerTRUNC(MachineInstr &MI) {
6374   // MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
6375   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
6376   // Similar to how operand splitting is done in SelectiondDAG, we can handle
6377   // %res(v8s8) = G_TRUNC %in(v8s32) by generating:
6378   //   %inlo(<4x s32>), %inhi(<4 x s32>) = G_UNMERGE %in(<8 x s32>)
6379   //   %lo16(<4 x s16>) = G_TRUNC %inlo
6380   //   %hi16(<4 x s16>) = G_TRUNC %inhi
6381   //   %in16(<8 x s16>) = G_CONCAT_VECTORS %lo16, %hi16
6382   //   %res(<8 x s8>) = G_TRUNC %in16
6383 
6384   assert(MI.getOpcode() == TargetOpcode::G_TRUNC);
6385 
6386   Register DstReg = MI.getOperand(0).getReg();
6387   Register SrcReg = MI.getOperand(1).getReg();
6388   LLT DstTy = MRI.getType(DstReg);
6389   LLT SrcTy = MRI.getType(SrcReg);
6390 
6391   if (DstTy.isVector() && isPowerOf2_32(DstTy.getNumElements()) &&
6392       isPowerOf2_32(DstTy.getScalarSizeInBits()) &&
6393       isPowerOf2_32(SrcTy.getNumElements()) &&
6394       isPowerOf2_32(SrcTy.getScalarSizeInBits())) {
6395     // Split input type.
6396     LLT SplitSrcTy = SrcTy.changeElementCount(
6397         SrcTy.getElementCount().divideCoefficientBy(2));
6398 
6399     // First, split the source into two smaller vectors.
6400     SmallVector<Register, 2> SplitSrcs;
6401     extractParts(SrcReg, SplitSrcTy, 2, SplitSrcs, MIRBuilder, MRI);
6402 
6403     // Truncate the splits into intermediate narrower elements.
6404     LLT InterTy;
6405     if (DstTy.getScalarSizeInBits() * 2 < SrcTy.getScalarSizeInBits())
6406       InterTy = SplitSrcTy.changeElementSize(DstTy.getScalarSizeInBits() * 2);
6407     else
6408       InterTy = SplitSrcTy.changeElementSize(DstTy.getScalarSizeInBits());
6409     for (unsigned I = 0; I < SplitSrcs.size(); ++I) {
6410       SplitSrcs[I] = MIRBuilder.buildTrunc(InterTy, SplitSrcs[I]).getReg(0);
6411     }
6412 
6413     // Combine the new truncates into one vector
6414     auto Merge = MIRBuilder.buildMergeLikeInstr(
6415         DstTy.changeElementSize(InterTy.getScalarSizeInBits()), SplitSrcs);
6416 
6417     // Truncate the new vector to the final result type
6418     if (DstTy.getScalarSizeInBits() * 2 < SrcTy.getScalarSizeInBits())
6419       MIRBuilder.buildTrunc(MI.getOperand(0).getReg(), Merge.getReg(0));
6420     else
6421       MIRBuilder.buildCopy(MI.getOperand(0).getReg(), Merge.getReg(0));
6422 
6423     MI.eraseFromParent();
6424 
6425     return Legalized;
6426   }
6427   return UnableToLegalize;
6428 }
6429 
6430 LegalizerHelper::LegalizeResult
6431 LegalizerHelper::lowerRotateWithReverseRotate(MachineInstr &MI) {
6432   auto [Dst, DstTy, Src, SrcTy, Amt, AmtTy] = MI.getFirst3RegLLTs();
6433   auto Zero = MIRBuilder.buildConstant(AmtTy, 0);
6434   bool IsLeft = MI.getOpcode() == TargetOpcode::G_ROTL;
6435   unsigned RevRot = IsLeft ? TargetOpcode::G_ROTR : TargetOpcode::G_ROTL;
6436   auto Neg = MIRBuilder.buildSub(AmtTy, Zero, Amt);
6437   MIRBuilder.buildInstr(RevRot, {Dst}, {Src, Neg});
6438   MI.eraseFromParent();
6439   return Legalized;
6440 }
6441 
6442 LegalizerHelper::LegalizeResult LegalizerHelper::lowerRotate(MachineInstr &MI) {
6443   auto [Dst, DstTy, Src, SrcTy, Amt, AmtTy] = MI.getFirst3RegLLTs();
6444 
6445   unsigned EltSizeInBits = DstTy.getScalarSizeInBits();
6446   bool IsLeft = MI.getOpcode() == TargetOpcode::G_ROTL;
6447 
6448   MIRBuilder.setInstrAndDebugLoc(MI);
6449 
6450   // If a rotate in the other direction is supported, use it.
6451   unsigned RevRot = IsLeft ? TargetOpcode::G_ROTR : TargetOpcode::G_ROTL;
6452   if (LI.isLegalOrCustom({RevRot, {DstTy, SrcTy}}) &&
6453       isPowerOf2_32(EltSizeInBits))
6454     return lowerRotateWithReverseRotate(MI);
6455 
6456   // If a funnel shift is supported, use it.
6457   unsigned FShOpc = IsLeft ? TargetOpcode::G_FSHL : TargetOpcode::G_FSHR;
6458   unsigned RevFsh = !IsLeft ? TargetOpcode::G_FSHL : TargetOpcode::G_FSHR;
6459   bool IsFShLegal = false;
6460   if ((IsFShLegal = LI.isLegalOrCustom({FShOpc, {DstTy, AmtTy}})) ||
6461       LI.isLegalOrCustom({RevFsh, {DstTy, AmtTy}})) {
6462     auto buildFunnelShift = [&](unsigned Opc, Register R1, Register R2,
6463                                 Register R3) {
6464       MIRBuilder.buildInstr(Opc, {R1}, {R2, R2, R3});
6465       MI.eraseFromParent();
6466       return Legalized;
6467     };
6468     // If a funnel shift in the other direction is supported, use it.
6469     if (IsFShLegal) {
6470       return buildFunnelShift(FShOpc, Dst, Src, Amt);
6471     } else if (isPowerOf2_32(EltSizeInBits)) {
6472       Amt = MIRBuilder.buildNeg(DstTy, Amt).getReg(0);
6473       return buildFunnelShift(RevFsh, Dst, Src, Amt);
6474     }
6475   }
6476 
6477   auto Zero = MIRBuilder.buildConstant(AmtTy, 0);
6478   unsigned ShOpc = IsLeft ? TargetOpcode::G_SHL : TargetOpcode::G_LSHR;
6479   unsigned RevShiftOpc = IsLeft ? TargetOpcode::G_LSHR : TargetOpcode::G_SHL;
6480   auto BitWidthMinusOneC = MIRBuilder.buildConstant(AmtTy, EltSizeInBits - 1);
6481   Register ShVal;
6482   Register RevShiftVal;
6483   if (isPowerOf2_32(EltSizeInBits)) {
6484     // (rotl x, c) -> x << (c & (w - 1)) | x >> (-c & (w - 1))
6485     // (rotr x, c) -> x >> (c & (w - 1)) | x << (-c & (w - 1))
6486     auto NegAmt = MIRBuilder.buildSub(AmtTy, Zero, Amt);
6487     auto ShAmt = MIRBuilder.buildAnd(AmtTy, Amt, BitWidthMinusOneC);
6488     ShVal = MIRBuilder.buildInstr(ShOpc, {DstTy}, {Src, ShAmt}).getReg(0);
6489     auto RevAmt = MIRBuilder.buildAnd(AmtTy, NegAmt, BitWidthMinusOneC);
6490     RevShiftVal =
6491         MIRBuilder.buildInstr(RevShiftOpc, {DstTy}, {Src, RevAmt}).getReg(0);
6492   } else {
6493     // (rotl x, c) -> x << (c % w) | x >> 1 >> (w - 1 - (c % w))
6494     // (rotr x, c) -> x >> (c % w) | x << 1 << (w - 1 - (c % w))
6495     auto BitWidthC = MIRBuilder.buildConstant(AmtTy, EltSizeInBits);
6496     auto ShAmt = MIRBuilder.buildURem(AmtTy, Amt, BitWidthC);
6497     ShVal = MIRBuilder.buildInstr(ShOpc, {DstTy}, {Src, ShAmt}).getReg(0);
6498     auto RevAmt = MIRBuilder.buildSub(AmtTy, BitWidthMinusOneC, ShAmt);
6499     auto One = MIRBuilder.buildConstant(AmtTy, 1);
6500     auto Inner = MIRBuilder.buildInstr(RevShiftOpc, {DstTy}, {Src, One});
6501     RevShiftVal =
6502         MIRBuilder.buildInstr(RevShiftOpc, {DstTy}, {Inner, RevAmt}).getReg(0);
6503   }
6504   MIRBuilder.buildOr(Dst, ShVal, RevShiftVal);
6505   MI.eraseFromParent();
6506   return Legalized;
6507 }
6508 
6509 // Expand s32 = G_UITOFP s64 using bit operations to an IEEE float
6510 // representation.
6511 LegalizerHelper::LegalizeResult
6512 LegalizerHelper::lowerU64ToF32BitOps(MachineInstr &MI) {
6513   auto [Dst, Src] = MI.getFirst2Regs();
6514   const LLT S64 = LLT::scalar(64);
6515   const LLT S32 = LLT::scalar(32);
6516   const LLT S1 = LLT::scalar(1);
6517 
6518   assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S32);
6519 
6520   // unsigned cul2f(ulong u) {
6521   //   uint lz = clz(u);
6522   //   uint e = (u != 0) ? 127U + 63U - lz : 0;
6523   //   u = (u << lz) & 0x7fffffffffffffffUL;
6524   //   ulong t = u & 0xffffffffffUL;
6525   //   uint v = (e << 23) | (uint)(u >> 40);
6526   //   uint r = t > 0x8000000000UL ? 1U : (t == 0x8000000000UL ? v & 1U : 0U);
6527   //   return as_float(v + r);
6528   // }
6529 
6530   auto Zero32 = MIRBuilder.buildConstant(S32, 0);
6531   auto Zero64 = MIRBuilder.buildConstant(S64, 0);
6532 
6533   auto LZ = MIRBuilder.buildCTLZ_ZERO_UNDEF(S32, Src);
6534 
6535   auto K = MIRBuilder.buildConstant(S32, 127U + 63U);
6536   auto Sub = MIRBuilder.buildSub(S32, K, LZ);
6537 
6538   auto NotZero = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, Src, Zero64);
6539   auto E = MIRBuilder.buildSelect(S32, NotZero, Sub, Zero32);
6540 
6541   auto Mask0 = MIRBuilder.buildConstant(S64, (-1ULL) >> 1);
6542   auto ShlLZ = MIRBuilder.buildShl(S64, Src, LZ);
6543 
6544   auto U = MIRBuilder.buildAnd(S64, ShlLZ, Mask0);
6545 
6546   auto Mask1 = MIRBuilder.buildConstant(S64, 0xffffffffffULL);
6547   auto T = MIRBuilder.buildAnd(S64, U, Mask1);
6548 
6549   auto UShl = MIRBuilder.buildLShr(S64, U, MIRBuilder.buildConstant(S64, 40));
6550   auto ShlE = MIRBuilder.buildShl(S32, E, MIRBuilder.buildConstant(S32, 23));
6551   auto V = MIRBuilder.buildOr(S32, ShlE, MIRBuilder.buildTrunc(S32, UShl));
6552 
6553   auto C = MIRBuilder.buildConstant(S64, 0x8000000000ULL);
6554   auto RCmp = MIRBuilder.buildICmp(CmpInst::ICMP_UGT, S1, T, C);
6555   auto TCmp = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, S1, T, C);
6556   auto One = MIRBuilder.buildConstant(S32, 1);
6557 
6558   auto VTrunc1 = MIRBuilder.buildAnd(S32, V, One);
6559   auto Select0 = MIRBuilder.buildSelect(S32, TCmp, VTrunc1, Zero32);
6560   auto R = MIRBuilder.buildSelect(S32, RCmp, One, Select0);
6561   MIRBuilder.buildAdd(Dst, V, R);
6562 
6563   MI.eraseFromParent();
6564   return Legalized;
6565 }
6566 
6567 LegalizerHelper::LegalizeResult LegalizerHelper::lowerUITOFP(MachineInstr &MI) {
6568   auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
6569 
6570   if (SrcTy == LLT::scalar(1)) {
6571     auto True = MIRBuilder.buildFConstant(DstTy, 1.0);
6572     auto False = MIRBuilder.buildFConstant(DstTy, 0.0);
6573     MIRBuilder.buildSelect(Dst, Src, True, False);
6574     MI.eraseFromParent();
6575     return Legalized;
6576   }
6577 
6578   if (SrcTy != LLT::scalar(64))
6579     return UnableToLegalize;
6580 
6581   if (DstTy == LLT::scalar(32)) {
6582     // TODO: SelectionDAG has several alternative expansions to port which may
6583     // be more reasonble depending on the available instructions. If a target
6584     // has sitofp, does not have CTLZ, or can efficiently use f64 as an
6585     // intermediate type, this is probably worse.
6586     return lowerU64ToF32BitOps(MI);
6587   }
6588 
6589   return UnableToLegalize;
6590 }
6591 
6592 LegalizerHelper::LegalizeResult LegalizerHelper::lowerSITOFP(MachineInstr &MI) {
6593   auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
6594 
6595   const LLT S64 = LLT::scalar(64);
6596   const LLT S32 = LLT::scalar(32);
6597   const LLT S1 = LLT::scalar(1);
6598 
6599   if (SrcTy == S1) {
6600     auto True = MIRBuilder.buildFConstant(DstTy, -1.0);
6601     auto False = MIRBuilder.buildFConstant(DstTy, 0.0);
6602     MIRBuilder.buildSelect(Dst, Src, True, False);
6603     MI.eraseFromParent();
6604     return Legalized;
6605   }
6606 
6607   if (SrcTy != S64)
6608     return UnableToLegalize;
6609 
6610   if (DstTy == S32) {
6611     // signed cl2f(long l) {
6612     //   long s = l >> 63;
6613     //   float r = cul2f((l + s) ^ s);
6614     //   return s ? -r : r;
6615     // }
6616     Register L = Src;
6617     auto SignBit = MIRBuilder.buildConstant(S64, 63);
6618     auto S = MIRBuilder.buildAShr(S64, L, SignBit);
6619 
6620     auto LPlusS = MIRBuilder.buildAdd(S64, L, S);
6621     auto Xor = MIRBuilder.buildXor(S64, LPlusS, S);
6622     auto R = MIRBuilder.buildUITOFP(S32, Xor);
6623 
6624     auto RNeg = MIRBuilder.buildFNeg(S32, R);
6625     auto SignNotZero = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, S,
6626                                             MIRBuilder.buildConstant(S64, 0));
6627     MIRBuilder.buildSelect(Dst, SignNotZero, RNeg, R);
6628     MI.eraseFromParent();
6629     return Legalized;
6630   }
6631 
6632   return UnableToLegalize;
6633 }
6634 
6635 LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPTOUI(MachineInstr &MI) {
6636   auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
6637   const LLT S64 = LLT::scalar(64);
6638   const LLT S32 = LLT::scalar(32);
6639 
6640   if (SrcTy != S64 && SrcTy != S32)
6641     return UnableToLegalize;
6642   if (DstTy != S32 && DstTy != S64)
6643     return UnableToLegalize;
6644 
6645   // FPTOSI gives same result as FPTOUI for positive signed integers.
6646   // FPTOUI needs to deal with fp values that convert to unsigned integers
6647   // greater or equal to 2^31 for float or 2^63 for double. For brevity 2^Exp.
6648 
6649   APInt TwoPExpInt = APInt::getSignMask(DstTy.getSizeInBits());
6650   APFloat TwoPExpFP(SrcTy.getSizeInBits() == 32 ? APFloat::IEEEsingle()
6651                                                 : APFloat::IEEEdouble(),
6652                     APInt::getZero(SrcTy.getSizeInBits()));
6653   TwoPExpFP.convertFromAPInt(TwoPExpInt, false, APFloat::rmNearestTiesToEven);
6654 
6655   MachineInstrBuilder FPTOSI = MIRBuilder.buildFPTOSI(DstTy, Src);
6656 
6657   MachineInstrBuilder Threshold = MIRBuilder.buildFConstant(SrcTy, TwoPExpFP);
6658   // For fp Value greater or equal to Threshold(2^Exp), we use FPTOSI on
6659   // (Value - 2^Exp) and add 2^Exp by setting highest bit in result to 1.
6660   MachineInstrBuilder FSub = MIRBuilder.buildFSub(SrcTy, Src, Threshold);
6661   MachineInstrBuilder ResLowBits = MIRBuilder.buildFPTOSI(DstTy, FSub);
6662   MachineInstrBuilder ResHighBit = MIRBuilder.buildConstant(DstTy, TwoPExpInt);
6663   MachineInstrBuilder Res = MIRBuilder.buildXor(DstTy, ResLowBits, ResHighBit);
6664 
6665   const LLT S1 = LLT::scalar(1);
6666 
6667   MachineInstrBuilder FCMP =
6668       MIRBuilder.buildFCmp(CmpInst::FCMP_ULT, S1, Src, Threshold);
6669   MIRBuilder.buildSelect(Dst, FCMP, FPTOSI, Res);
6670 
6671   MI.eraseFromParent();
6672   return Legalized;
6673 }
6674 
6675 LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPTOSI(MachineInstr &MI) {
6676   auto [Dst, DstTy, Src, SrcTy] = MI.getFirst2RegLLTs();
6677   const LLT S64 = LLT::scalar(64);
6678   const LLT S32 = LLT::scalar(32);
6679 
6680   // FIXME: Only f32 to i64 conversions are supported.
6681   if (SrcTy.getScalarType() != S32 || DstTy.getScalarType() != S64)
6682     return UnableToLegalize;
6683 
6684   // Expand f32 -> i64 conversion
6685   // This algorithm comes from compiler-rt's implementation of fixsfdi:
6686   // https://github.com/llvm/llvm-project/blob/main/compiler-rt/lib/builtins/fixsfdi.c
6687 
6688   unsigned SrcEltBits = SrcTy.getScalarSizeInBits();
6689 
6690   auto ExponentMask = MIRBuilder.buildConstant(SrcTy, 0x7F800000);
6691   auto ExponentLoBit = MIRBuilder.buildConstant(SrcTy, 23);
6692 
6693   auto AndExpMask = MIRBuilder.buildAnd(SrcTy, Src, ExponentMask);
6694   auto ExponentBits = MIRBuilder.buildLShr(SrcTy, AndExpMask, ExponentLoBit);
6695 
6696   auto SignMask = MIRBuilder.buildConstant(SrcTy,
6697                                            APInt::getSignMask(SrcEltBits));
6698   auto AndSignMask = MIRBuilder.buildAnd(SrcTy, Src, SignMask);
6699   auto SignLowBit = MIRBuilder.buildConstant(SrcTy, SrcEltBits - 1);
6700   auto Sign = MIRBuilder.buildAShr(SrcTy, AndSignMask, SignLowBit);
6701   Sign = MIRBuilder.buildSExt(DstTy, Sign);
6702 
6703   auto MantissaMask = MIRBuilder.buildConstant(SrcTy, 0x007FFFFF);
6704   auto AndMantissaMask = MIRBuilder.buildAnd(SrcTy, Src, MantissaMask);
6705   auto K = MIRBuilder.buildConstant(SrcTy, 0x00800000);
6706 
6707   auto R = MIRBuilder.buildOr(SrcTy, AndMantissaMask, K);
6708   R = MIRBuilder.buildZExt(DstTy, R);
6709 
6710   auto Bias = MIRBuilder.buildConstant(SrcTy, 127);
6711   auto Exponent = MIRBuilder.buildSub(SrcTy, ExponentBits, Bias);
6712   auto SubExponent = MIRBuilder.buildSub(SrcTy, Exponent, ExponentLoBit);
6713   auto ExponentSub = MIRBuilder.buildSub(SrcTy, ExponentLoBit, Exponent);
6714 
6715   auto Shl = MIRBuilder.buildShl(DstTy, R, SubExponent);
6716   auto Srl = MIRBuilder.buildLShr(DstTy, R, ExponentSub);
6717 
6718   const LLT S1 = LLT::scalar(1);
6719   auto CmpGt = MIRBuilder.buildICmp(CmpInst::ICMP_SGT,
6720                                     S1, Exponent, ExponentLoBit);
6721 
6722   R = MIRBuilder.buildSelect(DstTy, CmpGt, Shl, Srl);
6723 
6724   auto XorSign = MIRBuilder.buildXor(DstTy, R, Sign);
6725   auto Ret = MIRBuilder.buildSub(DstTy, XorSign, Sign);
6726 
6727   auto ZeroSrcTy = MIRBuilder.buildConstant(SrcTy, 0);
6728 
6729   auto ExponentLt0 = MIRBuilder.buildICmp(CmpInst::ICMP_SLT,
6730                                           S1, Exponent, ZeroSrcTy);
6731 
6732   auto ZeroDstTy = MIRBuilder.buildConstant(DstTy, 0);
6733   MIRBuilder.buildSelect(Dst, ExponentLt0, ZeroDstTy, Ret);
6734 
6735   MI.eraseFromParent();
6736   return Legalized;
6737 }
6738 
6739 // f64 -> f16 conversion using round-to-nearest-even rounding mode.
6740 LegalizerHelper::LegalizeResult
6741 LegalizerHelper::lowerFPTRUNC_F64_TO_F16(MachineInstr &MI) {
6742   const LLT S1 = LLT::scalar(1);
6743   const LLT S32 = LLT::scalar(32);
6744 
6745   auto [Dst, Src] = MI.getFirst2Regs();
6746   assert(MRI.getType(Dst).getScalarType() == LLT::scalar(16) &&
6747          MRI.getType(Src).getScalarType() == LLT::scalar(64));
6748 
6749   if (MRI.getType(Src).isVector()) // TODO: Handle vectors directly.
6750     return UnableToLegalize;
6751 
6752   if (MIRBuilder.getMF().getTarget().Options.UnsafeFPMath) {
6753     unsigned Flags = MI.getFlags();
6754     auto Src32 = MIRBuilder.buildFPTrunc(S32, Src, Flags);
6755     MIRBuilder.buildFPTrunc(Dst, Src32, Flags);
6756     MI.eraseFromParent();
6757     return Legalized;
6758   }
6759 
6760   const unsigned ExpMask = 0x7ff;
6761   const unsigned ExpBiasf64 = 1023;
6762   const unsigned ExpBiasf16 = 15;
6763 
6764   auto Unmerge = MIRBuilder.buildUnmerge(S32, Src);
6765   Register U = Unmerge.getReg(0);
6766   Register UH = Unmerge.getReg(1);
6767 
6768   auto E = MIRBuilder.buildLShr(S32, UH, MIRBuilder.buildConstant(S32, 20));
6769   E = MIRBuilder.buildAnd(S32, E, MIRBuilder.buildConstant(S32, ExpMask));
6770 
6771   // Subtract the fp64 exponent bias (1023) to get the real exponent and
6772   // add the f16 bias (15) to get the biased exponent for the f16 format.
6773   E = MIRBuilder.buildAdd(
6774     S32, E, MIRBuilder.buildConstant(S32, -ExpBiasf64 + ExpBiasf16));
6775 
6776   auto M = MIRBuilder.buildLShr(S32, UH, MIRBuilder.buildConstant(S32, 8));
6777   M = MIRBuilder.buildAnd(S32, M, MIRBuilder.buildConstant(S32, 0xffe));
6778 
6779   auto MaskedSig = MIRBuilder.buildAnd(S32, UH,
6780                                        MIRBuilder.buildConstant(S32, 0x1ff));
6781   MaskedSig = MIRBuilder.buildOr(S32, MaskedSig, U);
6782 
6783   auto Zero = MIRBuilder.buildConstant(S32, 0);
6784   auto SigCmpNE0 = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, MaskedSig, Zero);
6785   auto Lo40Set = MIRBuilder.buildZExt(S32, SigCmpNE0);
6786   M = MIRBuilder.buildOr(S32, M, Lo40Set);
6787 
6788   // (M != 0 ? 0x0200 : 0) | 0x7c00;
6789   auto Bits0x200 = MIRBuilder.buildConstant(S32, 0x0200);
6790   auto CmpM_NE0 = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, M, Zero);
6791   auto SelectCC = MIRBuilder.buildSelect(S32, CmpM_NE0, Bits0x200, Zero);
6792 
6793   auto Bits0x7c00 = MIRBuilder.buildConstant(S32, 0x7c00);
6794   auto I = MIRBuilder.buildOr(S32, SelectCC, Bits0x7c00);
6795 
6796   // N = M | (E << 12);
6797   auto EShl12 = MIRBuilder.buildShl(S32, E, MIRBuilder.buildConstant(S32, 12));
6798   auto N = MIRBuilder.buildOr(S32, M, EShl12);
6799 
6800   // B = clamp(1-E, 0, 13);
6801   auto One = MIRBuilder.buildConstant(S32, 1);
6802   auto OneSubExp = MIRBuilder.buildSub(S32, One, E);
6803   auto B = MIRBuilder.buildSMax(S32, OneSubExp, Zero);
6804   B = MIRBuilder.buildSMin(S32, B, MIRBuilder.buildConstant(S32, 13));
6805 
6806   auto SigSetHigh = MIRBuilder.buildOr(S32, M,
6807                                        MIRBuilder.buildConstant(S32, 0x1000));
6808 
6809   auto D = MIRBuilder.buildLShr(S32, SigSetHigh, B);
6810   auto D0 = MIRBuilder.buildShl(S32, D, B);
6811 
6812   auto D0_NE_SigSetHigh = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1,
6813                                              D0, SigSetHigh);
6814   auto D1 = MIRBuilder.buildZExt(S32, D0_NE_SigSetHigh);
6815   D = MIRBuilder.buildOr(S32, D, D1);
6816 
6817   auto CmpELtOne = MIRBuilder.buildICmp(CmpInst::ICMP_SLT, S1, E, One);
6818   auto V = MIRBuilder.buildSelect(S32, CmpELtOne, D, N);
6819 
6820   auto VLow3 = MIRBuilder.buildAnd(S32, V, MIRBuilder.buildConstant(S32, 7));
6821   V = MIRBuilder.buildLShr(S32, V, MIRBuilder.buildConstant(S32, 2));
6822 
6823   auto VLow3Eq3 = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, S1, VLow3,
6824                                        MIRBuilder.buildConstant(S32, 3));
6825   auto V0 = MIRBuilder.buildZExt(S32, VLow3Eq3);
6826 
6827   auto VLow3Gt5 = MIRBuilder.buildICmp(CmpInst::ICMP_SGT, S1, VLow3,
6828                                        MIRBuilder.buildConstant(S32, 5));
6829   auto V1 = MIRBuilder.buildZExt(S32, VLow3Gt5);
6830 
6831   V1 = MIRBuilder.buildOr(S32, V0, V1);
6832   V = MIRBuilder.buildAdd(S32, V, V1);
6833 
6834   auto CmpEGt30 = MIRBuilder.buildICmp(CmpInst::ICMP_SGT,  S1,
6835                                        E, MIRBuilder.buildConstant(S32, 30));
6836   V = MIRBuilder.buildSelect(S32, CmpEGt30,
6837                              MIRBuilder.buildConstant(S32, 0x7c00), V);
6838 
6839   auto CmpEGt1039 = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, S1,
6840                                          E, MIRBuilder.buildConstant(S32, 1039));
6841   V = MIRBuilder.buildSelect(S32, CmpEGt1039, I, V);
6842 
6843   // Extract the sign bit.
6844   auto Sign = MIRBuilder.buildLShr(S32, UH, MIRBuilder.buildConstant(S32, 16));
6845   Sign = MIRBuilder.buildAnd(S32, Sign, MIRBuilder.buildConstant(S32, 0x8000));
6846 
6847   // Insert the sign bit
6848   V = MIRBuilder.buildOr(S32, Sign, V);
6849 
6850   MIRBuilder.buildTrunc(Dst, V);
6851   MI.eraseFromParent();
6852   return Legalized;
6853 }
6854 
6855 LegalizerHelper::LegalizeResult
6856 LegalizerHelper::lowerFPTRUNC(MachineInstr &MI) {
6857   auto [DstTy, SrcTy] = MI.getFirst2LLTs();
6858   const LLT S64 = LLT::scalar(64);
6859   const LLT S16 = LLT::scalar(16);
6860 
6861   if (DstTy.getScalarType() == S16 && SrcTy.getScalarType() == S64)
6862     return lowerFPTRUNC_F64_TO_F16(MI);
6863 
6864   return UnableToLegalize;
6865 }
6866 
6867 // TODO: If RHS is a constant SelectionDAGBuilder expands this into a
6868 // multiplication tree.
6869 LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPOWI(MachineInstr &MI) {
6870   auto [Dst, Src0, Src1] = MI.getFirst3Regs();
6871   LLT Ty = MRI.getType(Dst);
6872 
6873   auto CvtSrc1 = MIRBuilder.buildSITOFP(Ty, Src1);
6874   MIRBuilder.buildFPow(Dst, Src0, CvtSrc1, MI.getFlags());
6875   MI.eraseFromParent();
6876   return Legalized;
6877 }
6878 
6879 static CmpInst::Predicate minMaxToCompare(unsigned Opc) {
6880   switch (Opc) {
6881   case TargetOpcode::G_SMIN:
6882     return CmpInst::ICMP_SLT;
6883   case TargetOpcode::G_SMAX:
6884     return CmpInst::ICMP_SGT;
6885   case TargetOpcode::G_UMIN:
6886     return CmpInst::ICMP_ULT;
6887   case TargetOpcode::G_UMAX:
6888     return CmpInst::ICMP_UGT;
6889   default:
6890     llvm_unreachable("not in integer min/max");
6891   }
6892 }
6893 
6894 LegalizerHelper::LegalizeResult LegalizerHelper::lowerMinMax(MachineInstr &MI) {
6895   auto [Dst, Src0, Src1] = MI.getFirst3Regs();
6896 
6897   const CmpInst::Predicate Pred = minMaxToCompare(MI.getOpcode());
6898   LLT CmpType = MRI.getType(Dst).changeElementSize(1);
6899 
6900   auto Cmp = MIRBuilder.buildICmp(Pred, CmpType, Src0, Src1);
6901   MIRBuilder.buildSelect(Dst, Cmp, Src0, Src1);
6902 
6903   MI.eraseFromParent();
6904   return Legalized;
6905 }
6906 
6907 LegalizerHelper::LegalizeResult
6908 LegalizerHelper::lowerFCopySign(MachineInstr &MI) {
6909   auto [Dst, DstTy, Src0, Src0Ty, Src1, Src1Ty] = MI.getFirst3RegLLTs();
6910   const int Src0Size = Src0Ty.getScalarSizeInBits();
6911   const int Src1Size = Src1Ty.getScalarSizeInBits();
6912 
6913   auto SignBitMask = MIRBuilder.buildConstant(
6914     Src0Ty, APInt::getSignMask(Src0Size));
6915 
6916   auto NotSignBitMask = MIRBuilder.buildConstant(
6917     Src0Ty, APInt::getLowBitsSet(Src0Size, Src0Size - 1));
6918 
6919   Register And0 = MIRBuilder.buildAnd(Src0Ty, Src0, NotSignBitMask).getReg(0);
6920   Register And1;
6921   if (Src0Ty == Src1Ty) {
6922     And1 = MIRBuilder.buildAnd(Src1Ty, Src1, SignBitMask).getReg(0);
6923   } else if (Src0Size > Src1Size) {
6924     auto ShiftAmt = MIRBuilder.buildConstant(Src0Ty, Src0Size - Src1Size);
6925     auto Zext = MIRBuilder.buildZExt(Src0Ty, Src1);
6926     auto Shift = MIRBuilder.buildShl(Src0Ty, Zext, ShiftAmt);
6927     And1 = MIRBuilder.buildAnd(Src0Ty, Shift, SignBitMask).getReg(0);
6928   } else {
6929     auto ShiftAmt = MIRBuilder.buildConstant(Src1Ty, Src1Size - Src0Size);
6930     auto Shift = MIRBuilder.buildLShr(Src1Ty, Src1, ShiftAmt);
6931     auto Trunc = MIRBuilder.buildTrunc(Src0Ty, Shift);
6932     And1 = MIRBuilder.buildAnd(Src0Ty, Trunc, SignBitMask).getReg(0);
6933   }
6934 
6935   // Be careful about setting nsz/nnan/ninf on every instruction, since the
6936   // constants are a nan and -0.0, but the final result should preserve
6937   // everything.
6938   unsigned Flags = MI.getFlags();
6939   MIRBuilder.buildOr(Dst, And0, And1, Flags);
6940 
6941   MI.eraseFromParent();
6942   return Legalized;
6943 }
6944 
6945 LegalizerHelper::LegalizeResult
6946 LegalizerHelper::lowerFMinNumMaxNum(MachineInstr &MI) {
6947   unsigned NewOp = MI.getOpcode() == TargetOpcode::G_FMINNUM ?
6948     TargetOpcode::G_FMINNUM_IEEE : TargetOpcode::G_FMAXNUM_IEEE;
6949 
6950   auto [Dst, Src0, Src1] = MI.getFirst3Regs();
6951   LLT Ty = MRI.getType(Dst);
6952 
6953   if (!MI.getFlag(MachineInstr::FmNoNans)) {
6954     // Insert canonicalizes if it's possible we need to quiet to get correct
6955     // sNaN behavior.
6956 
6957     // Note this must be done here, and not as an optimization combine in the
6958     // absence of a dedicate quiet-snan instruction as we're using an
6959     // omni-purpose G_FCANONICALIZE.
6960     if (!isKnownNeverSNaN(Src0, MRI))
6961       Src0 = MIRBuilder.buildFCanonicalize(Ty, Src0, MI.getFlags()).getReg(0);
6962 
6963     if (!isKnownNeverSNaN(Src1, MRI))
6964       Src1 = MIRBuilder.buildFCanonicalize(Ty, Src1, MI.getFlags()).getReg(0);
6965   }
6966 
6967   // If there are no nans, it's safe to simply replace this with the non-IEEE
6968   // version.
6969   MIRBuilder.buildInstr(NewOp, {Dst}, {Src0, Src1}, MI.getFlags());
6970   MI.eraseFromParent();
6971   return Legalized;
6972 }
6973 
6974 LegalizerHelper::LegalizeResult LegalizerHelper::lowerFMad(MachineInstr &MI) {
6975   // Expand G_FMAD a, b, c -> G_FADD (G_FMUL a, b), c
6976   Register DstReg = MI.getOperand(0).getReg();
6977   LLT Ty = MRI.getType(DstReg);
6978   unsigned Flags = MI.getFlags();
6979 
6980   auto Mul = MIRBuilder.buildFMul(Ty, MI.getOperand(1), MI.getOperand(2),
6981                                   Flags);
6982   MIRBuilder.buildFAdd(DstReg, Mul, MI.getOperand(3), Flags);
6983   MI.eraseFromParent();
6984   return Legalized;
6985 }
6986 
6987 LegalizerHelper::LegalizeResult
6988 LegalizerHelper::lowerIntrinsicRound(MachineInstr &MI) {
6989   auto [DstReg, X] = MI.getFirst2Regs();
6990   const unsigned Flags = MI.getFlags();
6991   const LLT Ty = MRI.getType(DstReg);
6992   const LLT CondTy = Ty.changeElementSize(1);
6993 
6994   // round(x) =>
6995   //  t = trunc(x);
6996   //  d = fabs(x - t);
6997   //  o = copysign(d >= 0.5 ? 1.0 : 0.0, x);
6998   //  return t + o;
6999 
7000   auto T = MIRBuilder.buildIntrinsicTrunc(Ty, X, Flags);
7001 
7002   auto Diff = MIRBuilder.buildFSub(Ty, X, T, Flags);
7003   auto AbsDiff = MIRBuilder.buildFAbs(Ty, Diff, Flags);
7004 
7005   auto Half = MIRBuilder.buildFConstant(Ty, 0.5);
7006   auto Cmp =
7007       MIRBuilder.buildFCmp(CmpInst::FCMP_OGE, CondTy, AbsDiff, Half, Flags);
7008 
7009   // Could emit G_UITOFP instead
7010   auto One = MIRBuilder.buildFConstant(Ty, 1.0);
7011   auto Zero = MIRBuilder.buildFConstant(Ty, 0.0);
7012   auto BoolFP = MIRBuilder.buildSelect(Ty, Cmp, One, Zero);
7013   auto SignedOffset = MIRBuilder.buildFCopysign(Ty, BoolFP, X);
7014 
7015   MIRBuilder.buildFAdd(DstReg, T, SignedOffset, Flags);
7016 
7017   MI.eraseFromParent();
7018   return Legalized;
7019 }
7020 
7021 LegalizerHelper::LegalizeResult LegalizerHelper::lowerFFloor(MachineInstr &MI) {
7022   auto [DstReg, SrcReg] = MI.getFirst2Regs();
7023   unsigned Flags = MI.getFlags();
7024   LLT Ty = MRI.getType(DstReg);
7025   const LLT CondTy = Ty.changeElementSize(1);
7026 
7027   // result = trunc(src);
7028   // if (src < 0.0 && src != result)
7029   //   result += -1.0.
7030 
7031   auto Trunc = MIRBuilder.buildIntrinsicTrunc(Ty, SrcReg, Flags);
7032   auto Zero = MIRBuilder.buildFConstant(Ty, 0.0);
7033 
7034   auto Lt0 = MIRBuilder.buildFCmp(CmpInst::FCMP_OLT, CondTy,
7035                                   SrcReg, Zero, Flags);
7036   auto NeTrunc = MIRBuilder.buildFCmp(CmpInst::FCMP_ONE, CondTy,
7037                                       SrcReg, Trunc, Flags);
7038   auto And = MIRBuilder.buildAnd(CondTy, Lt0, NeTrunc);
7039   auto AddVal = MIRBuilder.buildSITOFP(Ty, And);
7040 
7041   MIRBuilder.buildFAdd(DstReg, Trunc, AddVal, Flags);
7042   MI.eraseFromParent();
7043   return Legalized;
7044 }
7045 
7046 LegalizerHelper::LegalizeResult
7047 LegalizerHelper::lowerMergeValues(MachineInstr &MI) {
7048   const unsigned NumOps = MI.getNumOperands();
7049   auto [DstReg, DstTy, Src0Reg, Src0Ty] = MI.getFirst2RegLLTs();
7050   unsigned PartSize = Src0Ty.getSizeInBits();
7051 
7052   LLT WideTy = LLT::scalar(DstTy.getSizeInBits());
7053   Register ResultReg = MIRBuilder.buildZExt(WideTy, Src0Reg).getReg(0);
7054 
7055   for (unsigned I = 2; I != NumOps; ++I) {
7056     const unsigned Offset = (I - 1) * PartSize;
7057 
7058     Register SrcReg = MI.getOperand(I).getReg();
7059     auto ZextInput = MIRBuilder.buildZExt(WideTy, SrcReg);
7060 
7061     Register NextResult = I + 1 == NumOps && WideTy == DstTy ? DstReg :
7062       MRI.createGenericVirtualRegister(WideTy);
7063 
7064     auto ShiftAmt = MIRBuilder.buildConstant(WideTy, Offset);
7065     auto Shl = MIRBuilder.buildShl(WideTy, ZextInput, ShiftAmt);
7066     MIRBuilder.buildOr(NextResult, ResultReg, Shl);
7067     ResultReg = NextResult;
7068   }
7069 
7070   if (DstTy.isPointer()) {
7071     if (MIRBuilder.getDataLayout().isNonIntegralAddressSpace(
7072           DstTy.getAddressSpace())) {
7073       LLVM_DEBUG(dbgs() << "Not casting nonintegral address space\n");
7074       return UnableToLegalize;
7075     }
7076 
7077     MIRBuilder.buildIntToPtr(DstReg, ResultReg);
7078   }
7079 
7080   MI.eraseFromParent();
7081   return Legalized;
7082 }
7083 
7084 LegalizerHelper::LegalizeResult
7085 LegalizerHelper::lowerUnmergeValues(MachineInstr &MI) {
7086   const unsigned NumDst = MI.getNumOperands() - 1;
7087   Register SrcReg = MI.getOperand(NumDst).getReg();
7088   Register Dst0Reg = MI.getOperand(0).getReg();
7089   LLT DstTy = MRI.getType(Dst0Reg);
7090   if (DstTy.isPointer())
7091     return UnableToLegalize; // TODO
7092 
7093   SrcReg = coerceToScalar(SrcReg);
7094   if (!SrcReg)
7095     return UnableToLegalize;
7096 
7097   // Expand scalarizing unmerge as bitcast to integer and shift.
7098   LLT IntTy = MRI.getType(SrcReg);
7099 
7100   MIRBuilder.buildTrunc(Dst0Reg, SrcReg);
7101 
7102   const unsigned DstSize = DstTy.getSizeInBits();
7103   unsigned Offset = DstSize;
7104   for (unsigned I = 1; I != NumDst; ++I, Offset += DstSize) {
7105     auto ShiftAmt = MIRBuilder.buildConstant(IntTy, Offset);
7106     auto Shift = MIRBuilder.buildLShr(IntTy, SrcReg, ShiftAmt);
7107     MIRBuilder.buildTrunc(MI.getOperand(I), Shift);
7108   }
7109 
7110   MI.eraseFromParent();
7111   return Legalized;
7112 }
7113 
7114 /// Lower a vector extract or insert by writing the vector to a stack temporary
7115 /// and reloading the element or vector.
7116 ///
7117 /// %dst = G_EXTRACT_VECTOR_ELT %vec, %idx
7118 ///  =>
7119 ///  %stack_temp = G_FRAME_INDEX
7120 ///  G_STORE %vec, %stack_temp
7121 ///  %idx = clamp(%idx, %vec.getNumElements())
7122 ///  %element_ptr = G_PTR_ADD %stack_temp, %idx
7123 ///  %dst = G_LOAD %element_ptr
7124 LegalizerHelper::LegalizeResult
7125 LegalizerHelper::lowerExtractInsertVectorElt(MachineInstr &MI) {
7126   Register DstReg = MI.getOperand(0).getReg();
7127   Register SrcVec = MI.getOperand(1).getReg();
7128   Register InsertVal;
7129   if (MI.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT)
7130     InsertVal = MI.getOperand(2).getReg();
7131 
7132   Register Idx = MI.getOperand(MI.getNumOperands() - 1).getReg();
7133 
7134   LLT VecTy = MRI.getType(SrcVec);
7135   LLT EltTy = VecTy.getElementType();
7136   unsigned NumElts = VecTy.getNumElements();
7137 
7138   int64_t IdxVal;
7139   if (mi_match(Idx, MRI, m_ICst(IdxVal)) && IdxVal <= NumElts) {
7140     SmallVector<Register, 8> SrcRegs;
7141     extractParts(SrcVec, EltTy, NumElts, SrcRegs, MIRBuilder, MRI);
7142 
7143     if (InsertVal) {
7144       SrcRegs[IdxVal] = MI.getOperand(2).getReg();
7145       MIRBuilder.buildMergeLikeInstr(DstReg, SrcRegs);
7146     } else {
7147       MIRBuilder.buildCopy(DstReg, SrcRegs[IdxVal]);
7148     }
7149 
7150     MI.eraseFromParent();
7151     return Legalized;
7152   }
7153 
7154   if (!EltTy.isByteSized()) { // Not implemented.
7155     LLVM_DEBUG(dbgs() << "Can't handle non-byte element vectors yet\n");
7156     return UnableToLegalize;
7157   }
7158 
7159   unsigned EltBytes = EltTy.getSizeInBytes();
7160   Align VecAlign = getStackTemporaryAlignment(VecTy);
7161   Align EltAlign;
7162 
7163   MachinePointerInfo PtrInfo;
7164   auto StackTemp = createStackTemporary(
7165       TypeSize::getFixed(VecTy.getSizeInBytes()), VecAlign, PtrInfo);
7166   MIRBuilder.buildStore(SrcVec, StackTemp, PtrInfo, VecAlign);
7167 
7168   // Get the pointer to the element, and be sure not to hit undefined behavior
7169   // if the index is out of bounds.
7170   Register EltPtr = getVectorElementPointer(StackTemp.getReg(0), VecTy, Idx);
7171 
7172   if (mi_match(Idx, MRI, m_ICst(IdxVal))) {
7173     int64_t Offset = IdxVal * EltBytes;
7174     PtrInfo = PtrInfo.getWithOffset(Offset);
7175     EltAlign = commonAlignment(VecAlign, Offset);
7176   } else {
7177     // We lose information with a variable offset.
7178     EltAlign = getStackTemporaryAlignment(EltTy);
7179     PtrInfo = MachinePointerInfo(MRI.getType(EltPtr).getAddressSpace());
7180   }
7181 
7182   if (InsertVal) {
7183     // Write the inserted element
7184     MIRBuilder.buildStore(InsertVal, EltPtr, PtrInfo, EltAlign);
7185 
7186     // Reload the whole vector.
7187     MIRBuilder.buildLoad(DstReg, StackTemp, PtrInfo, VecAlign);
7188   } else {
7189     MIRBuilder.buildLoad(DstReg, EltPtr, PtrInfo, EltAlign);
7190   }
7191 
7192   MI.eraseFromParent();
7193   return Legalized;
7194 }
7195 
7196 LegalizerHelper::LegalizeResult
7197 LegalizerHelper::lowerShuffleVector(MachineInstr &MI) {
7198   auto [DstReg, DstTy, Src0Reg, Src0Ty, Src1Reg, Src1Ty] =
7199       MI.getFirst3RegLLTs();
7200   LLT IdxTy = LLT::scalar(32);
7201 
7202   ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
7203   Register Undef;
7204   SmallVector<Register, 32> BuildVec;
7205   LLT EltTy = DstTy.getScalarType();
7206 
7207   for (int Idx : Mask) {
7208     if (Idx < 0) {
7209       if (!Undef.isValid())
7210         Undef = MIRBuilder.buildUndef(EltTy).getReg(0);
7211       BuildVec.push_back(Undef);
7212       continue;
7213     }
7214 
7215     if (Src0Ty.isScalar()) {
7216       BuildVec.push_back(Idx == 0 ? Src0Reg : Src1Reg);
7217     } else {
7218       int NumElts = Src0Ty.getNumElements();
7219       Register SrcVec = Idx < NumElts ? Src0Reg : Src1Reg;
7220       int ExtractIdx = Idx < NumElts ? Idx : Idx - NumElts;
7221       auto IdxK = MIRBuilder.buildConstant(IdxTy, ExtractIdx);
7222       auto Extract = MIRBuilder.buildExtractVectorElement(EltTy, SrcVec, IdxK);
7223       BuildVec.push_back(Extract.getReg(0));
7224     }
7225   }
7226 
7227   if (DstTy.isScalar())
7228     MIRBuilder.buildCopy(DstReg, BuildVec[0]);
7229   else
7230     MIRBuilder.buildBuildVector(DstReg, BuildVec);
7231   MI.eraseFromParent();
7232   return Legalized;
7233 }
7234 
7235 Register LegalizerHelper::getDynStackAllocTargetPtr(Register SPReg,
7236                                                     Register AllocSize,
7237                                                     Align Alignment,
7238                                                     LLT PtrTy) {
7239   LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());
7240 
7241   auto SPTmp = MIRBuilder.buildCopy(PtrTy, SPReg);
7242   SPTmp = MIRBuilder.buildCast(IntPtrTy, SPTmp);
7243 
7244   // Subtract the final alloc from the SP. We use G_PTRTOINT here so we don't
7245   // have to generate an extra instruction to negate the alloc and then use
7246   // G_PTR_ADD to add the negative offset.
7247   auto Alloc = MIRBuilder.buildSub(IntPtrTy, SPTmp, AllocSize);
7248   if (Alignment > Align(1)) {
7249     APInt AlignMask(IntPtrTy.getSizeInBits(), Alignment.value(), true);
7250     AlignMask.negate();
7251     auto AlignCst = MIRBuilder.buildConstant(IntPtrTy, AlignMask);
7252     Alloc = MIRBuilder.buildAnd(IntPtrTy, Alloc, AlignCst);
7253   }
7254 
7255   return MIRBuilder.buildCast(PtrTy, Alloc).getReg(0);
7256 }
7257 
7258 LegalizerHelper::LegalizeResult
7259 LegalizerHelper::lowerDynStackAlloc(MachineInstr &MI) {
7260   const auto &MF = *MI.getMF();
7261   const auto &TFI = *MF.getSubtarget().getFrameLowering();
7262   if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp)
7263     return UnableToLegalize;
7264 
7265   Register Dst = MI.getOperand(0).getReg();
7266   Register AllocSize = MI.getOperand(1).getReg();
7267   Align Alignment = assumeAligned(MI.getOperand(2).getImm());
7268 
7269   LLT PtrTy = MRI.getType(Dst);
7270   Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
7271   Register SPTmp =
7272       getDynStackAllocTargetPtr(SPReg, AllocSize, Alignment, PtrTy);
7273 
7274   MIRBuilder.buildCopy(SPReg, SPTmp);
7275   MIRBuilder.buildCopy(Dst, SPTmp);
7276 
7277   MI.eraseFromParent();
7278   return Legalized;
7279 }
7280 
7281 LegalizerHelper::LegalizeResult
7282 LegalizerHelper::lowerStackSave(MachineInstr &MI) {
7283   Register StackPtr = TLI.getStackPointerRegisterToSaveRestore();
7284   if (!StackPtr)
7285     return UnableToLegalize;
7286 
7287   MIRBuilder.buildCopy(MI.getOperand(0), StackPtr);
7288   MI.eraseFromParent();
7289   return Legalized;
7290 }
7291 
7292 LegalizerHelper::LegalizeResult
7293 LegalizerHelper::lowerStackRestore(MachineInstr &MI) {
7294   Register StackPtr = TLI.getStackPointerRegisterToSaveRestore();
7295   if (!StackPtr)
7296     return UnableToLegalize;
7297 
7298   MIRBuilder.buildCopy(StackPtr, MI.getOperand(0));
7299   MI.eraseFromParent();
7300   return Legalized;
7301 }
7302 
7303 LegalizerHelper::LegalizeResult
7304 LegalizerHelper::lowerExtract(MachineInstr &MI) {
7305   auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
7306   unsigned Offset = MI.getOperand(2).getImm();
7307 
7308   // Extract sub-vector or one element
7309   if (SrcTy.isVector()) {
7310     unsigned SrcEltSize = SrcTy.getElementType().getSizeInBits();
7311     unsigned DstSize = DstTy.getSizeInBits();
7312 
7313     if ((Offset % SrcEltSize == 0) && (DstSize % SrcEltSize == 0) &&
7314         (Offset + DstSize <= SrcTy.getSizeInBits())) {
7315       // Unmerge and allow access to each Src element for the artifact combiner.
7316       auto Unmerge = MIRBuilder.buildUnmerge(SrcTy.getElementType(), SrcReg);
7317 
7318       // Take element(s) we need to extract and copy it (merge them).
7319       SmallVector<Register, 8> SubVectorElts;
7320       for (unsigned Idx = Offset / SrcEltSize;
7321            Idx < (Offset + DstSize) / SrcEltSize; ++Idx) {
7322         SubVectorElts.push_back(Unmerge.getReg(Idx));
7323       }
7324       if (SubVectorElts.size() == 1)
7325         MIRBuilder.buildCopy(DstReg, SubVectorElts[0]);
7326       else
7327         MIRBuilder.buildMergeLikeInstr(DstReg, SubVectorElts);
7328 
7329       MI.eraseFromParent();
7330       return Legalized;
7331     }
7332   }
7333 
7334   if (DstTy.isScalar() &&
7335       (SrcTy.isScalar() ||
7336        (SrcTy.isVector() && DstTy == SrcTy.getElementType()))) {
7337     LLT SrcIntTy = SrcTy;
7338     if (!SrcTy.isScalar()) {
7339       SrcIntTy = LLT::scalar(SrcTy.getSizeInBits());
7340       SrcReg = MIRBuilder.buildBitcast(SrcIntTy, SrcReg).getReg(0);
7341     }
7342 
7343     if (Offset == 0)
7344       MIRBuilder.buildTrunc(DstReg, SrcReg);
7345     else {
7346       auto ShiftAmt = MIRBuilder.buildConstant(SrcIntTy, Offset);
7347       auto Shr = MIRBuilder.buildLShr(SrcIntTy, SrcReg, ShiftAmt);
7348       MIRBuilder.buildTrunc(DstReg, Shr);
7349     }
7350 
7351     MI.eraseFromParent();
7352     return Legalized;
7353   }
7354 
7355   return UnableToLegalize;
7356 }
7357 
7358 LegalizerHelper::LegalizeResult LegalizerHelper::lowerInsert(MachineInstr &MI) {
7359   auto [Dst, Src, InsertSrc] = MI.getFirst3Regs();
7360   uint64_t Offset = MI.getOperand(3).getImm();
7361 
7362   LLT DstTy = MRI.getType(Src);
7363   LLT InsertTy = MRI.getType(InsertSrc);
7364 
7365   // Insert sub-vector or one element
7366   if (DstTy.isVector() && !InsertTy.isPointer()) {
7367     LLT EltTy = DstTy.getElementType();
7368     unsigned EltSize = EltTy.getSizeInBits();
7369     unsigned InsertSize = InsertTy.getSizeInBits();
7370 
7371     if ((Offset % EltSize == 0) && (InsertSize % EltSize == 0) &&
7372         (Offset + InsertSize <= DstTy.getSizeInBits())) {
7373       auto UnmergeSrc = MIRBuilder.buildUnmerge(EltTy, Src);
7374       SmallVector<Register, 8> DstElts;
7375       unsigned Idx = 0;
7376       // Elements from Src before insert start Offset
7377       for (; Idx < Offset / EltSize; ++Idx) {
7378         DstElts.push_back(UnmergeSrc.getReg(Idx));
7379       }
7380 
7381       // Replace elements in Src with elements from InsertSrc
7382       if (InsertTy.getSizeInBits() > EltSize) {
7383         auto UnmergeInsertSrc = MIRBuilder.buildUnmerge(EltTy, InsertSrc);
7384         for (unsigned i = 0; Idx < (Offset + InsertSize) / EltSize;
7385              ++Idx, ++i) {
7386           DstElts.push_back(UnmergeInsertSrc.getReg(i));
7387         }
7388       } else {
7389         DstElts.push_back(InsertSrc);
7390         ++Idx;
7391       }
7392 
7393       // Remaining elements from Src after insert
7394       for (; Idx < DstTy.getNumElements(); ++Idx) {
7395         DstElts.push_back(UnmergeSrc.getReg(Idx));
7396       }
7397 
7398       MIRBuilder.buildMergeLikeInstr(Dst, DstElts);
7399       MI.eraseFromParent();
7400       return Legalized;
7401     }
7402   }
7403 
7404   if (InsertTy.isVector() ||
7405       (DstTy.isVector() && DstTy.getElementType() != InsertTy))
7406     return UnableToLegalize;
7407 
7408   const DataLayout &DL = MIRBuilder.getDataLayout();
7409   if ((DstTy.isPointer() &&
7410        DL.isNonIntegralAddressSpace(DstTy.getAddressSpace())) ||
7411       (InsertTy.isPointer() &&
7412        DL.isNonIntegralAddressSpace(InsertTy.getAddressSpace()))) {
7413     LLVM_DEBUG(dbgs() << "Not casting non-integral address space integer\n");
7414     return UnableToLegalize;
7415   }
7416 
7417   LLT IntDstTy = DstTy;
7418 
7419   if (!DstTy.isScalar()) {
7420     IntDstTy = LLT::scalar(DstTy.getSizeInBits());
7421     Src = MIRBuilder.buildCast(IntDstTy, Src).getReg(0);
7422   }
7423 
7424   if (!InsertTy.isScalar()) {
7425     const LLT IntInsertTy = LLT::scalar(InsertTy.getSizeInBits());
7426     InsertSrc = MIRBuilder.buildPtrToInt(IntInsertTy, InsertSrc).getReg(0);
7427   }
7428 
7429   Register ExtInsSrc = MIRBuilder.buildZExt(IntDstTy, InsertSrc).getReg(0);
7430   if (Offset != 0) {
7431     auto ShiftAmt = MIRBuilder.buildConstant(IntDstTy, Offset);
7432     ExtInsSrc = MIRBuilder.buildShl(IntDstTy, ExtInsSrc, ShiftAmt).getReg(0);
7433   }
7434 
7435   APInt MaskVal = APInt::getBitsSetWithWrap(
7436       DstTy.getSizeInBits(), Offset + InsertTy.getSizeInBits(), Offset);
7437 
7438   auto Mask = MIRBuilder.buildConstant(IntDstTy, MaskVal);
7439   auto MaskedSrc = MIRBuilder.buildAnd(IntDstTy, Src, Mask);
7440   auto Or = MIRBuilder.buildOr(IntDstTy, MaskedSrc, ExtInsSrc);
7441 
7442   MIRBuilder.buildCast(Dst, Or);
7443   MI.eraseFromParent();
7444   return Legalized;
7445 }
7446 
7447 LegalizerHelper::LegalizeResult
7448 LegalizerHelper::lowerSADDO_SSUBO(MachineInstr &MI) {
7449   auto [Dst0, Dst0Ty, Dst1, Dst1Ty, LHS, LHSTy, RHS, RHSTy] =
7450       MI.getFirst4RegLLTs();
7451   const bool IsAdd = MI.getOpcode() == TargetOpcode::G_SADDO;
7452 
7453   LLT Ty = Dst0Ty;
7454   LLT BoolTy = Dst1Ty;
7455 
7456   if (IsAdd)
7457     MIRBuilder.buildAdd(Dst0, LHS, RHS);
7458   else
7459     MIRBuilder.buildSub(Dst0, LHS, RHS);
7460 
7461   // TODO: If SADDSAT/SSUBSAT is legal, compare results to detect overflow.
7462 
7463   auto Zero = MIRBuilder.buildConstant(Ty, 0);
7464 
7465   // For an addition, the result should be less than one of the operands (LHS)
7466   // if and only if the other operand (RHS) is negative, otherwise there will
7467   // be overflow.
7468   // For a subtraction, the result should be less than one of the operands
7469   // (LHS) if and only if the other operand (RHS) is (non-zero) positive,
7470   // otherwise there will be overflow.
7471   auto ResultLowerThanLHS =
7472       MIRBuilder.buildICmp(CmpInst::ICMP_SLT, BoolTy, Dst0, LHS);
7473   auto ConditionRHS = MIRBuilder.buildICmp(
7474       IsAdd ? CmpInst::ICMP_SLT : CmpInst::ICMP_SGT, BoolTy, RHS, Zero);
7475 
7476   MIRBuilder.buildXor(Dst1, ConditionRHS, ResultLowerThanLHS);
7477   MI.eraseFromParent();
7478   return Legalized;
7479 }
7480 
7481 LegalizerHelper::LegalizeResult
7482 LegalizerHelper::lowerAddSubSatToMinMax(MachineInstr &MI) {
7483   auto [Res, LHS, RHS] = MI.getFirst3Regs();
7484   LLT Ty = MRI.getType(Res);
7485   bool IsSigned;
7486   bool IsAdd;
7487   unsigned BaseOp;
7488   switch (MI.getOpcode()) {
7489   default:
7490     llvm_unreachable("unexpected addsat/subsat opcode");
7491   case TargetOpcode::G_UADDSAT:
7492     IsSigned = false;
7493     IsAdd = true;
7494     BaseOp = TargetOpcode::G_ADD;
7495     break;
7496   case TargetOpcode::G_SADDSAT:
7497     IsSigned = true;
7498     IsAdd = true;
7499     BaseOp = TargetOpcode::G_ADD;
7500     break;
7501   case TargetOpcode::G_USUBSAT:
7502     IsSigned = false;
7503     IsAdd = false;
7504     BaseOp = TargetOpcode::G_SUB;
7505     break;
7506   case TargetOpcode::G_SSUBSAT:
7507     IsSigned = true;
7508     IsAdd = false;
7509     BaseOp = TargetOpcode::G_SUB;
7510     break;
7511   }
7512 
7513   if (IsSigned) {
7514     // sadd.sat(a, b) ->
7515     //   hi = 0x7fffffff - smax(a, 0)
7516     //   lo = 0x80000000 - smin(a, 0)
7517     //   a + smin(smax(lo, b), hi)
7518     // ssub.sat(a, b) ->
7519     //   lo = smax(a, -1) - 0x7fffffff
7520     //   hi = smin(a, -1) - 0x80000000
7521     //   a - smin(smax(lo, b), hi)
7522     // TODO: AMDGPU can use a "median of 3" instruction here:
7523     //   a +/- med3(lo, b, hi)
7524     uint64_t NumBits = Ty.getScalarSizeInBits();
7525     auto MaxVal =
7526         MIRBuilder.buildConstant(Ty, APInt::getSignedMaxValue(NumBits));
7527     auto MinVal =
7528         MIRBuilder.buildConstant(Ty, APInt::getSignedMinValue(NumBits));
7529     MachineInstrBuilder Hi, Lo;
7530     if (IsAdd) {
7531       auto Zero = MIRBuilder.buildConstant(Ty, 0);
7532       Hi = MIRBuilder.buildSub(Ty, MaxVal, MIRBuilder.buildSMax(Ty, LHS, Zero));
7533       Lo = MIRBuilder.buildSub(Ty, MinVal, MIRBuilder.buildSMin(Ty, LHS, Zero));
7534     } else {
7535       auto NegOne = MIRBuilder.buildConstant(Ty, -1);
7536       Lo = MIRBuilder.buildSub(Ty, MIRBuilder.buildSMax(Ty, LHS, NegOne),
7537                                MaxVal);
7538       Hi = MIRBuilder.buildSub(Ty, MIRBuilder.buildSMin(Ty, LHS, NegOne),
7539                                MinVal);
7540     }
7541     auto RHSClamped =
7542         MIRBuilder.buildSMin(Ty, MIRBuilder.buildSMax(Ty, Lo, RHS), Hi);
7543     MIRBuilder.buildInstr(BaseOp, {Res}, {LHS, RHSClamped});
7544   } else {
7545     // uadd.sat(a, b) -> a + umin(~a, b)
7546     // usub.sat(a, b) -> a - umin(a, b)
7547     Register Not = IsAdd ? MIRBuilder.buildNot(Ty, LHS).getReg(0) : LHS;
7548     auto Min = MIRBuilder.buildUMin(Ty, Not, RHS);
7549     MIRBuilder.buildInstr(BaseOp, {Res}, {LHS, Min});
7550   }
7551 
7552   MI.eraseFromParent();
7553   return Legalized;
7554 }
7555 
7556 LegalizerHelper::LegalizeResult
7557 LegalizerHelper::lowerAddSubSatToAddoSubo(MachineInstr &MI) {
7558   auto [Res, LHS, RHS] = MI.getFirst3Regs();
7559   LLT Ty = MRI.getType(Res);
7560   LLT BoolTy = Ty.changeElementSize(1);
7561   bool IsSigned;
7562   bool IsAdd;
7563   unsigned OverflowOp;
7564   switch (MI.getOpcode()) {
7565   default:
7566     llvm_unreachable("unexpected addsat/subsat opcode");
7567   case TargetOpcode::G_UADDSAT:
7568     IsSigned = false;
7569     IsAdd = true;
7570     OverflowOp = TargetOpcode::G_UADDO;
7571     break;
7572   case TargetOpcode::G_SADDSAT:
7573     IsSigned = true;
7574     IsAdd = true;
7575     OverflowOp = TargetOpcode::G_SADDO;
7576     break;
7577   case TargetOpcode::G_USUBSAT:
7578     IsSigned = false;
7579     IsAdd = false;
7580     OverflowOp = TargetOpcode::G_USUBO;
7581     break;
7582   case TargetOpcode::G_SSUBSAT:
7583     IsSigned = true;
7584     IsAdd = false;
7585     OverflowOp = TargetOpcode::G_SSUBO;
7586     break;
7587   }
7588 
7589   auto OverflowRes =
7590       MIRBuilder.buildInstr(OverflowOp, {Ty, BoolTy}, {LHS, RHS});
7591   Register Tmp = OverflowRes.getReg(0);
7592   Register Ov = OverflowRes.getReg(1);
7593   MachineInstrBuilder Clamp;
7594   if (IsSigned) {
7595     // sadd.sat(a, b) ->
7596     //   {tmp, ov} = saddo(a, b)
7597     //   ov ? (tmp >>s 31) + 0x80000000 : r
7598     // ssub.sat(a, b) ->
7599     //   {tmp, ov} = ssubo(a, b)
7600     //   ov ? (tmp >>s 31) + 0x80000000 : r
7601     uint64_t NumBits = Ty.getScalarSizeInBits();
7602     auto ShiftAmount = MIRBuilder.buildConstant(Ty, NumBits - 1);
7603     auto Sign = MIRBuilder.buildAShr(Ty, Tmp, ShiftAmount);
7604     auto MinVal =
7605         MIRBuilder.buildConstant(Ty, APInt::getSignedMinValue(NumBits));
7606     Clamp = MIRBuilder.buildAdd(Ty, Sign, MinVal);
7607   } else {
7608     // uadd.sat(a, b) ->
7609     //   {tmp, ov} = uaddo(a, b)
7610     //   ov ? 0xffffffff : tmp
7611     // usub.sat(a, b) ->
7612     //   {tmp, ov} = usubo(a, b)
7613     //   ov ? 0 : tmp
7614     Clamp = MIRBuilder.buildConstant(Ty, IsAdd ? -1 : 0);
7615   }
7616   MIRBuilder.buildSelect(Res, Ov, Clamp, Tmp);
7617 
7618   MI.eraseFromParent();
7619   return Legalized;
7620 }
7621 
7622 LegalizerHelper::LegalizeResult
7623 LegalizerHelper::lowerShlSat(MachineInstr &MI) {
7624   assert((MI.getOpcode() == TargetOpcode::G_SSHLSAT ||
7625           MI.getOpcode() == TargetOpcode::G_USHLSAT) &&
7626          "Expected shlsat opcode!");
7627   bool IsSigned = MI.getOpcode() == TargetOpcode::G_SSHLSAT;
7628   auto [Res, LHS, RHS] = MI.getFirst3Regs();
7629   LLT Ty = MRI.getType(Res);
7630   LLT BoolTy = Ty.changeElementSize(1);
7631 
7632   unsigned BW = Ty.getScalarSizeInBits();
7633   auto Result = MIRBuilder.buildShl(Ty, LHS, RHS);
7634   auto Orig = IsSigned ? MIRBuilder.buildAShr(Ty, Result, RHS)
7635                        : MIRBuilder.buildLShr(Ty, Result, RHS);
7636 
7637   MachineInstrBuilder SatVal;
7638   if (IsSigned) {
7639     auto SatMin = MIRBuilder.buildConstant(Ty, APInt::getSignedMinValue(BW));
7640     auto SatMax = MIRBuilder.buildConstant(Ty, APInt::getSignedMaxValue(BW));
7641     auto Cmp = MIRBuilder.buildICmp(CmpInst::ICMP_SLT, BoolTy, LHS,
7642                                     MIRBuilder.buildConstant(Ty, 0));
7643     SatVal = MIRBuilder.buildSelect(Ty, Cmp, SatMin, SatMax);
7644   } else {
7645     SatVal = MIRBuilder.buildConstant(Ty, APInt::getMaxValue(BW));
7646   }
7647   auto Ov = MIRBuilder.buildICmp(CmpInst::ICMP_NE, BoolTy, LHS, Orig);
7648   MIRBuilder.buildSelect(Res, Ov, SatVal, Result);
7649 
7650   MI.eraseFromParent();
7651   return Legalized;
7652 }
7653 
7654 LegalizerHelper::LegalizeResult LegalizerHelper::lowerBswap(MachineInstr &MI) {
7655   auto [Dst, Src] = MI.getFirst2Regs();
7656   const LLT Ty = MRI.getType(Src);
7657   unsigned SizeInBytes = (Ty.getScalarSizeInBits() + 7) / 8;
7658   unsigned BaseShiftAmt = (SizeInBytes - 1) * 8;
7659 
7660   // Swap most and least significant byte, set remaining bytes in Res to zero.
7661   auto ShiftAmt = MIRBuilder.buildConstant(Ty, BaseShiftAmt);
7662   auto LSByteShiftedLeft = MIRBuilder.buildShl(Ty, Src, ShiftAmt);
7663   auto MSByteShiftedRight = MIRBuilder.buildLShr(Ty, Src, ShiftAmt);
7664   auto Res = MIRBuilder.buildOr(Ty, MSByteShiftedRight, LSByteShiftedLeft);
7665 
7666   // Set i-th high/low byte in Res to i-th low/high byte from Src.
7667   for (unsigned i = 1; i < SizeInBytes / 2; ++i) {
7668     // AND with Mask leaves byte i unchanged and sets remaining bytes to 0.
7669     APInt APMask(SizeInBytes * 8, 0xFF << (i * 8));
7670     auto Mask = MIRBuilder.buildConstant(Ty, APMask);
7671     auto ShiftAmt = MIRBuilder.buildConstant(Ty, BaseShiftAmt - 16 * i);
7672     // Low byte shifted left to place of high byte: (Src & Mask) << ShiftAmt.
7673     auto LoByte = MIRBuilder.buildAnd(Ty, Src, Mask);
7674     auto LoShiftedLeft = MIRBuilder.buildShl(Ty, LoByte, ShiftAmt);
7675     Res = MIRBuilder.buildOr(Ty, Res, LoShiftedLeft);
7676     // High byte shifted right to place of low byte: (Src >> ShiftAmt) & Mask.
7677     auto SrcShiftedRight = MIRBuilder.buildLShr(Ty, Src, ShiftAmt);
7678     auto HiShiftedRight = MIRBuilder.buildAnd(Ty, SrcShiftedRight, Mask);
7679     Res = MIRBuilder.buildOr(Ty, Res, HiShiftedRight);
7680   }
7681   Res.getInstr()->getOperand(0).setReg(Dst);
7682 
7683   MI.eraseFromParent();
7684   return Legalized;
7685 }
7686 
7687 //{ (Src & Mask) >> N } | { (Src << N) & Mask }
7688 static MachineInstrBuilder SwapN(unsigned N, DstOp Dst, MachineIRBuilder &B,
7689                                  MachineInstrBuilder Src, APInt Mask) {
7690   const LLT Ty = Dst.getLLTTy(*B.getMRI());
7691   MachineInstrBuilder C_N = B.buildConstant(Ty, N);
7692   MachineInstrBuilder MaskLoNTo0 = B.buildConstant(Ty, Mask);
7693   auto LHS = B.buildLShr(Ty, B.buildAnd(Ty, Src, MaskLoNTo0), C_N);
7694   auto RHS = B.buildAnd(Ty, B.buildShl(Ty, Src, C_N), MaskLoNTo0);
7695   return B.buildOr(Dst, LHS, RHS);
7696 }
7697 
7698 LegalizerHelper::LegalizeResult
7699 LegalizerHelper::lowerBitreverse(MachineInstr &MI) {
7700   auto [Dst, Src] = MI.getFirst2Regs();
7701   const LLT Ty = MRI.getType(Src);
7702   unsigned Size = Ty.getSizeInBits();
7703 
7704   MachineInstrBuilder BSWAP =
7705       MIRBuilder.buildInstr(TargetOpcode::G_BSWAP, {Ty}, {Src});
7706 
7707   // swap high and low 4 bits in 8 bit blocks 7654|3210 -> 3210|7654
7708   //    [(val & 0xF0F0F0F0) >> 4] | [(val & 0x0F0F0F0F) << 4]
7709   // -> [(val & 0xF0F0F0F0) >> 4] | [(val << 4) & 0xF0F0F0F0]
7710   MachineInstrBuilder Swap4 =
7711       SwapN(4, Ty, MIRBuilder, BSWAP, APInt::getSplat(Size, APInt(8, 0xF0)));
7712 
7713   // swap high and low 2 bits in 4 bit blocks 32|10 76|54 -> 10|32 54|76
7714   //    [(val & 0xCCCCCCCC) >> 2] & [(val & 0x33333333) << 2]
7715   // -> [(val & 0xCCCCCCCC) >> 2] & [(val << 2) & 0xCCCCCCCC]
7716   MachineInstrBuilder Swap2 =
7717       SwapN(2, Ty, MIRBuilder, Swap4, APInt::getSplat(Size, APInt(8, 0xCC)));
7718 
7719   // swap high and low 1 bit in 2 bit blocks 1|0 3|2 5|4 7|6 -> 0|1 2|3 4|5 6|7
7720   //    [(val & 0xAAAAAAAA) >> 1] & [(val & 0x55555555) << 1]
7721   // -> [(val & 0xAAAAAAAA) >> 1] & [(val << 1) & 0xAAAAAAAA]
7722   SwapN(1, Dst, MIRBuilder, Swap2, APInt::getSplat(Size, APInt(8, 0xAA)));
7723 
7724   MI.eraseFromParent();
7725   return Legalized;
7726 }
7727 
7728 LegalizerHelper::LegalizeResult
7729 LegalizerHelper::lowerReadWriteRegister(MachineInstr &MI) {
7730   MachineFunction &MF = MIRBuilder.getMF();
7731 
7732   bool IsRead = MI.getOpcode() == TargetOpcode::G_READ_REGISTER;
7733   int NameOpIdx = IsRead ? 1 : 0;
7734   int ValRegIndex = IsRead ? 0 : 1;
7735 
7736   Register ValReg = MI.getOperand(ValRegIndex).getReg();
7737   const LLT Ty = MRI.getType(ValReg);
7738   const MDString *RegStr = cast<MDString>(
7739     cast<MDNode>(MI.getOperand(NameOpIdx).getMetadata())->getOperand(0));
7740 
7741   Register PhysReg = TLI.getRegisterByName(RegStr->getString().data(), Ty, MF);
7742   if (!PhysReg.isValid())
7743     return UnableToLegalize;
7744 
7745   if (IsRead)
7746     MIRBuilder.buildCopy(ValReg, PhysReg);
7747   else
7748     MIRBuilder.buildCopy(PhysReg, ValReg);
7749 
7750   MI.eraseFromParent();
7751   return Legalized;
7752 }
7753 
7754 LegalizerHelper::LegalizeResult
7755 LegalizerHelper::lowerSMULH_UMULH(MachineInstr &MI) {
7756   bool IsSigned = MI.getOpcode() == TargetOpcode::G_SMULH;
7757   unsigned ExtOp = IsSigned ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;
7758   Register Result = MI.getOperand(0).getReg();
7759   LLT OrigTy = MRI.getType(Result);
7760   auto SizeInBits = OrigTy.getScalarSizeInBits();
7761   LLT WideTy = OrigTy.changeElementSize(SizeInBits * 2);
7762 
7763   auto LHS = MIRBuilder.buildInstr(ExtOp, {WideTy}, {MI.getOperand(1)});
7764   auto RHS = MIRBuilder.buildInstr(ExtOp, {WideTy}, {MI.getOperand(2)});
7765   auto Mul = MIRBuilder.buildMul(WideTy, LHS, RHS);
7766   unsigned ShiftOp = IsSigned ? TargetOpcode::G_ASHR : TargetOpcode::G_LSHR;
7767 
7768   auto ShiftAmt = MIRBuilder.buildConstant(WideTy, SizeInBits);
7769   auto Shifted = MIRBuilder.buildInstr(ShiftOp, {WideTy}, {Mul, ShiftAmt});
7770   MIRBuilder.buildTrunc(Result, Shifted);
7771 
7772   MI.eraseFromParent();
7773   return Legalized;
7774 }
7775 
7776 LegalizerHelper::LegalizeResult
7777 LegalizerHelper::lowerISFPCLASS(MachineInstr &MI) {
7778   auto [DstReg, DstTy, SrcReg, SrcTy] = MI.getFirst2RegLLTs();
7779   FPClassTest Mask = static_cast<FPClassTest>(MI.getOperand(2).getImm());
7780 
7781   if (Mask == fcNone) {
7782     MIRBuilder.buildConstant(DstReg, 0);
7783     MI.eraseFromParent();
7784     return Legalized;
7785   }
7786   if (Mask == fcAllFlags) {
7787     MIRBuilder.buildConstant(DstReg, 1);
7788     MI.eraseFromParent();
7789     return Legalized;
7790   }
7791 
7792   // TODO: Try inverting the test with getInvertedFPClassTest like the DAG
7793   // version
7794 
7795   unsigned BitSize = SrcTy.getScalarSizeInBits();
7796   const fltSemantics &Semantics = getFltSemanticForLLT(SrcTy.getScalarType());
7797 
7798   LLT IntTy = LLT::scalar(BitSize);
7799   if (SrcTy.isVector())
7800     IntTy = LLT::vector(SrcTy.getElementCount(), IntTy);
7801   auto AsInt = MIRBuilder.buildCopy(IntTy, SrcReg);
7802 
7803   // Various masks.
7804   APInt SignBit = APInt::getSignMask(BitSize);
7805   APInt ValueMask = APInt::getSignedMaxValue(BitSize);     // All bits but sign.
7806   APInt Inf = APFloat::getInf(Semantics).bitcastToAPInt(); // Exp and int bit.
7807   APInt ExpMask = Inf;
7808   APInt AllOneMantissa = APFloat::getLargest(Semantics).bitcastToAPInt() & ~Inf;
7809   APInt QNaNBitMask =
7810       APInt::getOneBitSet(BitSize, AllOneMantissa.getActiveBits() - 1);
7811   APInt InvertionMask = APInt::getAllOnes(DstTy.getScalarSizeInBits());
7812 
7813   auto SignBitC = MIRBuilder.buildConstant(IntTy, SignBit);
7814   auto ValueMaskC = MIRBuilder.buildConstant(IntTy, ValueMask);
7815   auto InfC = MIRBuilder.buildConstant(IntTy, Inf);
7816   auto ExpMaskC = MIRBuilder.buildConstant(IntTy, ExpMask);
7817   auto ZeroC = MIRBuilder.buildConstant(IntTy, 0);
7818 
7819   auto Abs = MIRBuilder.buildAnd(IntTy, AsInt, ValueMaskC);
7820   auto Sign =
7821       MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_NE, DstTy, AsInt, Abs);
7822 
7823   auto Res = MIRBuilder.buildConstant(DstTy, 0);
7824   // Clang doesn't support capture of structured bindings:
7825   LLT DstTyCopy = DstTy;
7826   const auto appendToRes = [&](MachineInstrBuilder ToAppend) {
7827     Res = MIRBuilder.buildOr(DstTyCopy, Res, ToAppend);
7828   };
7829 
7830   // Tests that involve more than one class should be processed first.
7831   if ((Mask & fcFinite) == fcFinite) {
7832     // finite(V) ==> abs(V) u< exp_mask
7833     appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, Abs,
7834                                      ExpMaskC));
7835     Mask &= ~fcFinite;
7836   } else if ((Mask & fcFinite) == fcPosFinite) {
7837     // finite(V) && V > 0 ==> V u< exp_mask
7838     appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, AsInt,
7839                                      ExpMaskC));
7840     Mask &= ~fcPosFinite;
7841   } else if ((Mask & fcFinite) == fcNegFinite) {
7842     // finite(V) && V < 0 ==> abs(V) u< exp_mask && signbit == 1
7843     auto Cmp = MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, Abs,
7844                                     ExpMaskC);
7845     auto And = MIRBuilder.buildAnd(DstTy, Cmp, Sign);
7846     appendToRes(And);
7847     Mask &= ~fcNegFinite;
7848   }
7849 
7850   if (FPClassTest PartialCheck = Mask & (fcZero | fcSubnormal)) {
7851     // fcZero | fcSubnormal => test all exponent bits are 0
7852     // TODO: Handle sign bit specific cases
7853     // TODO: Handle inverted case
7854     if (PartialCheck == (fcZero | fcSubnormal)) {
7855       auto ExpBits = MIRBuilder.buildAnd(IntTy, AsInt, ExpMaskC);
7856       appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy,
7857                                        ExpBits, ZeroC));
7858       Mask &= ~PartialCheck;
7859     }
7860   }
7861 
7862   // Check for individual classes.
7863   if (FPClassTest PartialCheck = Mask & fcZero) {
7864     if (PartialCheck == fcPosZero)
7865       appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy,
7866                                        AsInt, ZeroC));
7867     else if (PartialCheck == fcZero)
7868       appendToRes(
7869           MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy, Abs, ZeroC));
7870     else // fcNegZero
7871       appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy,
7872                                        AsInt, SignBitC));
7873   }
7874 
7875   if (FPClassTest PartialCheck = Mask & fcSubnormal) {
7876     // issubnormal(V) ==> unsigned(abs(V) - 1) u< (all mantissa bits set)
7877     // issubnormal(V) && V>0 ==> unsigned(V - 1) u< (all mantissa bits set)
7878     auto V = (PartialCheck == fcPosSubnormal) ? AsInt : Abs;
7879     auto OneC = MIRBuilder.buildConstant(IntTy, 1);
7880     auto VMinusOne = MIRBuilder.buildSub(IntTy, V, OneC);
7881     auto SubnormalRes =
7882         MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, VMinusOne,
7883                              MIRBuilder.buildConstant(IntTy, AllOneMantissa));
7884     if (PartialCheck == fcNegSubnormal)
7885       SubnormalRes = MIRBuilder.buildAnd(DstTy, SubnormalRes, Sign);
7886     appendToRes(SubnormalRes);
7887   }
7888 
7889   if (FPClassTest PartialCheck = Mask & fcInf) {
7890     if (PartialCheck == fcPosInf)
7891       appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy,
7892                                        AsInt, InfC));
7893     else if (PartialCheck == fcInf)
7894       appendToRes(
7895           MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy, Abs, InfC));
7896     else { // fcNegInf
7897       APInt NegInf = APFloat::getInf(Semantics, true).bitcastToAPInt();
7898       auto NegInfC = MIRBuilder.buildConstant(IntTy, NegInf);
7899       appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, DstTy,
7900                                        AsInt, NegInfC));
7901     }
7902   }
7903 
7904   if (FPClassTest PartialCheck = Mask & fcNan) {
7905     auto InfWithQnanBitC = MIRBuilder.buildConstant(IntTy, Inf | QNaNBitMask);
7906     if (PartialCheck == fcNan) {
7907       // isnan(V) ==> abs(V) u> int(inf)
7908       appendToRes(
7909           MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_UGT, DstTy, Abs, InfC));
7910     } else if (PartialCheck == fcQNan) {
7911       // isquiet(V) ==> abs(V) u>= (unsigned(Inf) | quiet_bit)
7912       appendToRes(MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_UGE, DstTy, Abs,
7913                                        InfWithQnanBitC));
7914     } else { // fcSNan
7915       // issignaling(V) ==> abs(V) u> unsigned(Inf) &&
7916       //                    abs(V) u< (unsigned(Inf) | quiet_bit)
7917       auto IsNan =
7918           MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_UGT, DstTy, Abs, InfC);
7919       auto IsNotQnan = MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy,
7920                                             Abs, InfWithQnanBitC);
7921       appendToRes(MIRBuilder.buildAnd(DstTy, IsNan, IsNotQnan));
7922     }
7923   }
7924 
7925   if (FPClassTest PartialCheck = Mask & fcNormal) {
7926     // isnormal(V) ==> (0 u< exp u< max_exp) ==> (unsigned(exp-1) u<
7927     // (max_exp-1))
7928     APInt ExpLSB = ExpMask & ~(ExpMask.shl(1));
7929     auto ExpMinusOne = MIRBuilder.buildSub(
7930         IntTy, Abs, MIRBuilder.buildConstant(IntTy, ExpLSB));
7931     APInt MaxExpMinusOne = ExpMask - ExpLSB;
7932     auto NormalRes =
7933         MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_ULT, DstTy, ExpMinusOne,
7934                              MIRBuilder.buildConstant(IntTy, MaxExpMinusOne));
7935     if (PartialCheck == fcNegNormal)
7936       NormalRes = MIRBuilder.buildAnd(DstTy, NormalRes, Sign);
7937     else if (PartialCheck == fcPosNormal) {
7938       auto PosSign = MIRBuilder.buildXor(
7939           DstTy, Sign, MIRBuilder.buildConstant(DstTy, InvertionMask));
7940       NormalRes = MIRBuilder.buildAnd(DstTy, NormalRes, PosSign);
7941     }
7942     appendToRes(NormalRes);
7943   }
7944 
7945   MIRBuilder.buildCopy(DstReg, Res);
7946   MI.eraseFromParent();
7947   return Legalized;
7948 }
7949 
7950 LegalizerHelper::LegalizeResult LegalizerHelper::lowerSelect(MachineInstr &MI) {
7951   // Implement vector G_SELECT in terms of XOR, AND, OR.
7952   auto [DstReg, DstTy, MaskReg, MaskTy, Op1Reg, Op1Ty, Op2Reg, Op2Ty] =
7953       MI.getFirst4RegLLTs();
7954   if (!DstTy.isVector())
7955     return UnableToLegalize;
7956 
7957   bool IsEltPtr = DstTy.getElementType().isPointer();
7958   if (IsEltPtr) {
7959     LLT ScalarPtrTy = LLT::scalar(DstTy.getScalarSizeInBits());
7960     LLT NewTy = DstTy.changeElementType(ScalarPtrTy);
7961     Op1Reg = MIRBuilder.buildPtrToInt(NewTy, Op1Reg).getReg(0);
7962     Op2Reg = MIRBuilder.buildPtrToInt(NewTy, Op2Reg).getReg(0);
7963     DstTy = NewTy;
7964   }
7965 
7966   if (MaskTy.isScalar()) {
7967     // Turn the scalar condition into a vector condition mask.
7968 
7969     Register MaskElt = MaskReg;
7970 
7971     // The condition was potentially zero extended before, but we want a sign
7972     // extended boolean.
7973     if (MaskTy != LLT::scalar(1))
7974       MaskElt = MIRBuilder.buildSExtInReg(MaskTy, MaskElt, 1).getReg(0);
7975 
7976     // Continue the sign extension (or truncate) to match the data type.
7977     MaskElt = MIRBuilder.buildSExtOrTrunc(DstTy.getElementType(),
7978                                           MaskElt).getReg(0);
7979 
7980     // Generate a vector splat idiom.
7981     auto ShufSplat = MIRBuilder.buildShuffleSplat(DstTy, MaskElt);
7982     MaskReg = ShufSplat.getReg(0);
7983     MaskTy = DstTy;
7984   }
7985 
7986   if (MaskTy.getSizeInBits() != DstTy.getSizeInBits()) {
7987     return UnableToLegalize;
7988   }
7989 
7990   auto NotMask = MIRBuilder.buildNot(MaskTy, MaskReg);
7991   auto NewOp1 = MIRBuilder.buildAnd(MaskTy, Op1Reg, MaskReg);
7992   auto NewOp2 = MIRBuilder.buildAnd(MaskTy, Op2Reg, NotMask);
7993   if (IsEltPtr) {
7994     auto Or = MIRBuilder.buildOr(DstTy, NewOp1, NewOp2);
7995     MIRBuilder.buildIntToPtr(DstReg, Or);
7996   } else {
7997     MIRBuilder.buildOr(DstReg, NewOp1, NewOp2);
7998   }
7999   MI.eraseFromParent();
8000   return Legalized;
8001 }
8002 
8003 LegalizerHelper::LegalizeResult LegalizerHelper::lowerDIVREM(MachineInstr &MI) {
8004   // Split DIVREM into individual instructions.
8005   unsigned Opcode = MI.getOpcode();
8006 
8007   MIRBuilder.buildInstr(
8008       Opcode == TargetOpcode::G_SDIVREM ? TargetOpcode::G_SDIV
8009                                         : TargetOpcode::G_UDIV,
8010       {MI.getOperand(0).getReg()}, {MI.getOperand(2), MI.getOperand(3)});
8011   MIRBuilder.buildInstr(
8012       Opcode == TargetOpcode::G_SDIVREM ? TargetOpcode::G_SREM
8013                                         : TargetOpcode::G_UREM,
8014       {MI.getOperand(1).getReg()}, {MI.getOperand(2), MI.getOperand(3)});
8015   MI.eraseFromParent();
8016   return Legalized;
8017 }
8018 
8019 LegalizerHelper::LegalizeResult
8020 LegalizerHelper::lowerAbsToAddXor(MachineInstr &MI) {
8021   // Expand %res = G_ABS %a into:
8022   // %v1 = G_ASHR %a, scalar_size-1
8023   // %v2 = G_ADD %a, %v1
8024   // %res = G_XOR %v2, %v1
8025   LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
8026   Register OpReg = MI.getOperand(1).getReg();
8027   auto ShiftAmt =
8028       MIRBuilder.buildConstant(DstTy, DstTy.getScalarSizeInBits() - 1);
8029   auto Shift = MIRBuilder.buildAShr(DstTy, OpReg, ShiftAmt);
8030   auto Add = MIRBuilder.buildAdd(DstTy, OpReg, Shift);
8031   MIRBuilder.buildXor(MI.getOperand(0).getReg(), Add, Shift);
8032   MI.eraseFromParent();
8033   return Legalized;
8034 }
8035 
8036 LegalizerHelper::LegalizeResult
8037 LegalizerHelper::lowerAbsToMaxNeg(MachineInstr &MI) {
8038   // Expand %res = G_ABS %a into:
8039   // %v1 = G_CONSTANT 0
8040   // %v2 = G_SUB %v1, %a
8041   // %res = G_SMAX %a, %v2
8042   Register SrcReg = MI.getOperand(1).getReg();
8043   LLT Ty = MRI.getType(SrcReg);
8044   auto Zero = MIRBuilder.buildConstant(Ty, 0).getReg(0);
8045   auto Sub = MIRBuilder.buildSub(Ty, Zero, SrcReg).getReg(0);
8046   MIRBuilder.buildSMax(MI.getOperand(0), SrcReg, Sub);
8047   MI.eraseFromParent();
8048   return Legalized;
8049 }
8050 
8051 LegalizerHelper::LegalizeResult
8052 LegalizerHelper::lowerVectorReduction(MachineInstr &MI) {
8053   Register SrcReg = MI.getOperand(1).getReg();
8054   LLT SrcTy = MRI.getType(SrcReg);
8055   LLT DstTy = MRI.getType(SrcReg);
8056 
8057   // The source could be a scalar if the IR type was <1 x sN>.
8058   if (SrcTy.isScalar()) {
8059     if (DstTy.getSizeInBits() > SrcTy.getSizeInBits())
8060       return UnableToLegalize; // FIXME: handle extension.
8061     // This can be just a plain copy.
8062     Observer.changingInstr(MI);
8063     MI.setDesc(MIRBuilder.getTII().get(TargetOpcode::COPY));
8064     Observer.changedInstr(MI);
8065     return Legalized;
8066   }
8067   return UnableToLegalize;
8068 }
8069 
8070 static Type *getTypeForLLT(LLT Ty, LLVMContext &C);
8071 
8072 LegalizerHelper::LegalizeResult LegalizerHelper::lowerVAArg(MachineInstr &MI) {
8073   MachineFunction &MF = *MI.getMF();
8074   const DataLayout &DL = MIRBuilder.getDataLayout();
8075   LLVMContext &Ctx = MF.getFunction().getContext();
8076   Register ListPtr = MI.getOperand(1).getReg();
8077   LLT PtrTy = MRI.getType(ListPtr);
8078 
8079   // LstPtr is a pointer to the head of the list. Get the address
8080   // of the head of the list.
8081   Align PtrAlignment = DL.getABITypeAlign(getTypeForLLT(PtrTy, Ctx));
8082   MachineMemOperand *PtrLoadMMO = MF.getMachineMemOperand(
8083       MachinePointerInfo(), MachineMemOperand::MOLoad, PtrTy, PtrAlignment);
8084   auto VAList = MIRBuilder.buildLoad(PtrTy, ListPtr, *PtrLoadMMO).getReg(0);
8085 
8086   const Align A(MI.getOperand(2).getImm());
8087   LLT PtrTyAsScalarTy = LLT::scalar(PtrTy.getSizeInBits());
8088   if (A > TLI.getMinStackArgumentAlignment()) {
8089     Register AlignAmt =
8090         MIRBuilder.buildConstant(PtrTyAsScalarTy, A.value() - 1).getReg(0);
8091     auto AddDst = MIRBuilder.buildPtrAdd(PtrTy, VAList, AlignAmt);
8092     auto AndDst = MIRBuilder.buildMaskLowPtrBits(PtrTy, AddDst, Log2(A));
8093     VAList = AndDst.getReg(0);
8094   }
8095 
8096   // Increment the pointer, VAList, to the next vaarg
8097   // The list should be bumped by the size of element in the current head of
8098   // list.
8099   Register Dst = MI.getOperand(0).getReg();
8100   LLT LLTTy = MRI.getType(Dst);
8101   Type *Ty = getTypeForLLT(LLTTy, Ctx);
8102   auto IncAmt =
8103       MIRBuilder.buildConstant(PtrTyAsScalarTy, DL.getTypeAllocSize(Ty));
8104   auto Succ = MIRBuilder.buildPtrAdd(PtrTy, VAList, IncAmt);
8105 
8106   // Store the increment VAList to the legalized pointer
8107   MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
8108       MachinePointerInfo(), MachineMemOperand::MOStore, PtrTy, PtrAlignment);
8109   MIRBuilder.buildStore(Succ, ListPtr, *StoreMMO);
8110   // Load the actual argument out of the pointer VAList
8111   Align EltAlignment = DL.getABITypeAlign(Ty);
8112   MachineMemOperand *EltLoadMMO = MF.getMachineMemOperand(
8113       MachinePointerInfo(), MachineMemOperand::MOLoad, LLTTy, EltAlignment);
8114   MIRBuilder.buildLoad(Dst, VAList, *EltLoadMMO);
8115 
8116   MI.eraseFromParent();
8117   return Legalized;
8118 }
8119 
8120 static bool shouldLowerMemFuncForSize(const MachineFunction &MF) {
8121   // On Darwin, -Os means optimize for size without hurting performance, so
8122   // only really optimize for size when -Oz (MinSize) is used.
8123   if (MF.getTarget().getTargetTriple().isOSDarwin())
8124     return MF.getFunction().hasMinSize();
8125   return MF.getFunction().hasOptSize();
8126 }
8127 
8128 // Returns a list of types to use for memory op lowering in MemOps. A partial
8129 // port of findOptimalMemOpLowering in TargetLowering.
8130 static bool findGISelOptimalMemOpLowering(std::vector<LLT> &MemOps,
8131                                           unsigned Limit, const MemOp &Op,
8132                                           unsigned DstAS, unsigned SrcAS,
8133                                           const AttributeList &FuncAttributes,
8134                                           const TargetLowering &TLI) {
8135   if (Op.isMemcpyWithFixedDstAlign() && Op.getSrcAlign() < Op.getDstAlign())
8136     return false;
8137 
8138   LLT Ty = TLI.getOptimalMemOpLLT(Op, FuncAttributes);
8139 
8140   if (Ty == LLT()) {
8141     // Use the largest scalar type whose alignment constraints are satisfied.
8142     // We only need to check DstAlign here as SrcAlign is always greater or
8143     // equal to DstAlign (or zero).
8144     Ty = LLT::scalar(64);
8145     if (Op.isFixedDstAlign())
8146       while (Op.getDstAlign() < Ty.getSizeInBytes() &&
8147              !TLI.allowsMisalignedMemoryAccesses(Ty, DstAS, Op.getDstAlign()))
8148         Ty = LLT::scalar(Ty.getSizeInBytes());
8149     assert(Ty.getSizeInBits() > 0 && "Could not find valid type");
8150     // FIXME: check for the largest legal type we can load/store to.
8151   }
8152 
8153   unsigned NumMemOps = 0;
8154   uint64_t Size = Op.size();
8155   while (Size) {
8156     unsigned TySize = Ty.getSizeInBytes();
8157     while (TySize > Size) {
8158       // For now, only use non-vector load / store's for the left-over pieces.
8159       LLT NewTy = Ty;
8160       // FIXME: check for mem op safety and legality of the types. Not all of
8161       // SDAGisms map cleanly to GISel concepts.
8162       if (NewTy.isVector())
8163         NewTy = NewTy.getSizeInBits() > 64 ? LLT::scalar(64) : LLT::scalar(32);
8164       NewTy = LLT::scalar(llvm::bit_floor(NewTy.getSizeInBits() - 1));
8165       unsigned NewTySize = NewTy.getSizeInBytes();
8166       assert(NewTySize > 0 && "Could not find appropriate type");
8167 
8168       // If the new LLT cannot cover all of the remaining bits, then consider
8169       // issuing a (or a pair of) unaligned and overlapping load / store.
8170       unsigned Fast;
8171       // Need to get a VT equivalent for allowMisalignedMemoryAccesses().
8172       MVT VT = getMVTForLLT(Ty);
8173       if (NumMemOps && Op.allowOverlap() && NewTySize < Size &&
8174           TLI.allowsMisalignedMemoryAccesses(
8175               VT, DstAS, Op.isFixedDstAlign() ? Op.getDstAlign() : Align(1),
8176               MachineMemOperand::MONone, &Fast) &&
8177           Fast)
8178         TySize = Size;
8179       else {
8180         Ty = NewTy;
8181         TySize = NewTySize;
8182       }
8183     }
8184 
8185     if (++NumMemOps > Limit)
8186       return false;
8187 
8188     MemOps.push_back(Ty);
8189     Size -= TySize;
8190   }
8191 
8192   return true;
8193 }
8194 
8195 static Type *getTypeForLLT(LLT Ty, LLVMContext &C) {
8196   if (Ty.isVector())
8197     return FixedVectorType::get(IntegerType::get(C, Ty.getScalarSizeInBits()),
8198                                 Ty.getNumElements());
8199   return IntegerType::get(C, Ty.getSizeInBits());
8200 }
8201 
8202 // Get a vectorized representation of the memset value operand, GISel edition.
8203 static Register getMemsetValue(Register Val, LLT Ty, MachineIRBuilder &MIB) {
8204   MachineRegisterInfo &MRI = *MIB.getMRI();
8205   unsigned NumBits = Ty.getScalarSizeInBits();
8206   auto ValVRegAndVal = getIConstantVRegValWithLookThrough(Val, MRI);
8207   if (!Ty.isVector() && ValVRegAndVal) {
8208     APInt Scalar = ValVRegAndVal->Value.trunc(8);
8209     APInt SplatVal = APInt::getSplat(NumBits, Scalar);
8210     return MIB.buildConstant(Ty, SplatVal).getReg(0);
8211   }
8212 
8213   // Extend the byte value to the larger type, and then multiply by a magic
8214   // value 0x010101... in order to replicate it across every byte.
8215   // Unless it's zero, in which case just emit a larger G_CONSTANT 0.
8216   if (ValVRegAndVal && ValVRegAndVal->Value == 0) {
8217     return MIB.buildConstant(Ty, 0).getReg(0);
8218   }
8219 
8220   LLT ExtType = Ty.getScalarType();
8221   auto ZExt = MIB.buildZExtOrTrunc(ExtType, Val);
8222   if (NumBits > 8) {
8223     APInt Magic = APInt::getSplat(NumBits, APInt(8, 0x01));
8224     auto MagicMI = MIB.buildConstant(ExtType, Magic);
8225     Val = MIB.buildMul(ExtType, ZExt, MagicMI).getReg(0);
8226   }
8227 
8228   // For vector types create a G_BUILD_VECTOR.
8229   if (Ty.isVector())
8230     Val = MIB.buildSplatVector(Ty, Val).getReg(0);
8231 
8232   return Val;
8233 }
8234 
8235 LegalizerHelper::LegalizeResult
8236 LegalizerHelper::lowerMemset(MachineInstr &MI, Register Dst, Register Val,
8237                              uint64_t KnownLen, Align Alignment,
8238                              bool IsVolatile) {
8239   auto &MF = *MI.getParent()->getParent();
8240   const auto &TLI = *MF.getSubtarget().getTargetLowering();
8241   auto &DL = MF.getDataLayout();
8242   LLVMContext &C = MF.getFunction().getContext();
8243 
8244   assert(KnownLen != 0 && "Have a zero length memset length!");
8245 
8246   bool DstAlignCanChange = false;
8247   MachineFrameInfo &MFI = MF.getFrameInfo();
8248   bool OptSize = shouldLowerMemFuncForSize(MF);
8249 
8250   MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
8251   if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
8252     DstAlignCanChange = true;
8253 
8254   unsigned Limit = TLI.getMaxStoresPerMemset(OptSize);
8255   std::vector<LLT> MemOps;
8256 
8257   const auto &DstMMO = **MI.memoperands_begin();
8258   MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
8259 
8260   auto ValVRegAndVal = getIConstantVRegValWithLookThrough(Val, MRI);
8261   bool IsZeroVal = ValVRegAndVal && ValVRegAndVal->Value == 0;
8262 
8263   if (!findGISelOptimalMemOpLowering(MemOps, Limit,
8264                                      MemOp::Set(KnownLen, DstAlignCanChange,
8265                                                 Alignment,
8266                                                 /*IsZeroMemset=*/IsZeroVal,
8267                                                 /*IsVolatile=*/IsVolatile),
8268                                      DstPtrInfo.getAddrSpace(), ~0u,
8269                                      MF.getFunction().getAttributes(), TLI))
8270     return UnableToLegalize;
8271 
8272   if (DstAlignCanChange) {
8273     // Get an estimate of the type from the LLT.
8274     Type *IRTy = getTypeForLLT(MemOps[0], C);
8275     Align NewAlign = DL.getABITypeAlign(IRTy);
8276     if (NewAlign > Alignment) {
8277       Alignment = NewAlign;
8278       unsigned FI = FIDef->getOperand(1).getIndex();
8279       // Give the stack frame object a larger alignment if needed.
8280       if (MFI.getObjectAlign(FI) < Alignment)
8281         MFI.setObjectAlignment(FI, Alignment);
8282     }
8283   }
8284 
8285   MachineIRBuilder MIB(MI);
8286   // Find the largest store and generate the bit pattern for it.
8287   LLT LargestTy = MemOps[0];
8288   for (unsigned i = 1; i < MemOps.size(); i++)
8289     if (MemOps[i].getSizeInBits() > LargestTy.getSizeInBits())
8290       LargestTy = MemOps[i];
8291 
8292   // The memset stored value is always defined as an s8, so in order to make it
8293   // work with larger store types we need to repeat the bit pattern across the
8294   // wider type.
8295   Register MemSetValue = getMemsetValue(Val, LargestTy, MIB);
8296 
8297   if (!MemSetValue)
8298     return UnableToLegalize;
8299 
8300   // Generate the stores. For each store type in the list, we generate the
8301   // matching store of that type to the destination address.
8302   LLT PtrTy = MRI.getType(Dst);
8303   unsigned DstOff = 0;
8304   unsigned Size = KnownLen;
8305   for (unsigned I = 0; I < MemOps.size(); I++) {
8306     LLT Ty = MemOps[I];
8307     unsigned TySize = Ty.getSizeInBytes();
8308     if (TySize > Size) {
8309       // Issuing an unaligned load / store pair that overlaps with the previous
8310       // pair. Adjust the offset accordingly.
8311       assert(I == MemOps.size() - 1 && I != 0);
8312       DstOff -= TySize - Size;
8313     }
8314 
8315     // If this store is smaller than the largest store see whether we can get
8316     // the smaller value for free with a truncate.
8317     Register Value = MemSetValue;
8318     if (Ty.getSizeInBits() < LargestTy.getSizeInBits()) {
8319       MVT VT = getMVTForLLT(Ty);
8320       MVT LargestVT = getMVTForLLT(LargestTy);
8321       if (!LargestTy.isVector() && !Ty.isVector() &&
8322           TLI.isTruncateFree(LargestVT, VT))
8323         Value = MIB.buildTrunc(Ty, MemSetValue).getReg(0);
8324       else
8325         Value = getMemsetValue(Val, Ty, MIB);
8326       if (!Value)
8327         return UnableToLegalize;
8328     }
8329 
8330     auto *StoreMMO = MF.getMachineMemOperand(&DstMMO, DstOff, Ty);
8331 
8332     Register Ptr = Dst;
8333     if (DstOff != 0) {
8334       auto Offset =
8335           MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), DstOff);
8336       Ptr = MIB.buildPtrAdd(PtrTy, Dst, Offset).getReg(0);
8337     }
8338 
8339     MIB.buildStore(Value, Ptr, *StoreMMO);
8340     DstOff += Ty.getSizeInBytes();
8341     Size -= TySize;
8342   }
8343 
8344   MI.eraseFromParent();
8345   return Legalized;
8346 }
8347 
8348 LegalizerHelper::LegalizeResult
8349 LegalizerHelper::lowerMemcpyInline(MachineInstr &MI) {
8350   assert(MI.getOpcode() == TargetOpcode::G_MEMCPY_INLINE);
8351 
8352   auto [Dst, Src, Len] = MI.getFirst3Regs();
8353 
8354   const auto *MMOIt = MI.memoperands_begin();
8355   const MachineMemOperand *MemOp = *MMOIt;
8356   bool IsVolatile = MemOp->isVolatile();
8357 
8358   // See if this is a constant length copy
8359   auto LenVRegAndVal = getIConstantVRegValWithLookThrough(Len, MRI);
8360   // FIXME: support dynamically sized G_MEMCPY_INLINE
8361   assert(LenVRegAndVal &&
8362          "inline memcpy with dynamic size is not yet supported");
8363   uint64_t KnownLen = LenVRegAndVal->Value.getZExtValue();
8364   if (KnownLen == 0) {
8365     MI.eraseFromParent();
8366     return Legalized;
8367   }
8368 
8369   const auto &DstMMO = **MI.memoperands_begin();
8370   const auto &SrcMMO = **std::next(MI.memoperands_begin());
8371   Align DstAlign = DstMMO.getBaseAlign();
8372   Align SrcAlign = SrcMMO.getBaseAlign();
8373 
8374   return lowerMemcpyInline(MI, Dst, Src, KnownLen, DstAlign, SrcAlign,
8375                            IsVolatile);
8376 }
8377 
8378 LegalizerHelper::LegalizeResult
8379 LegalizerHelper::lowerMemcpyInline(MachineInstr &MI, Register Dst, Register Src,
8380                                    uint64_t KnownLen, Align DstAlign,
8381                                    Align SrcAlign, bool IsVolatile) {
8382   assert(MI.getOpcode() == TargetOpcode::G_MEMCPY_INLINE);
8383   return lowerMemcpy(MI, Dst, Src, KnownLen,
8384                      std::numeric_limits<uint64_t>::max(), DstAlign, SrcAlign,
8385                      IsVolatile);
8386 }
8387 
8388 LegalizerHelper::LegalizeResult
8389 LegalizerHelper::lowerMemcpy(MachineInstr &MI, Register Dst, Register Src,
8390                              uint64_t KnownLen, uint64_t Limit, Align DstAlign,
8391                              Align SrcAlign, bool IsVolatile) {
8392   auto &MF = *MI.getParent()->getParent();
8393   const auto &TLI = *MF.getSubtarget().getTargetLowering();
8394   auto &DL = MF.getDataLayout();
8395   LLVMContext &C = MF.getFunction().getContext();
8396 
8397   assert(KnownLen != 0 && "Have a zero length memcpy length!");
8398 
8399   bool DstAlignCanChange = false;
8400   MachineFrameInfo &MFI = MF.getFrameInfo();
8401   Align Alignment = std::min(DstAlign, SrcAlign);
8402 
8403   MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
8404   if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
8405     DstAlignCanChange = true;
8406 
8407   // FIXME: infer better src pointer alignment like SelectionDAG does here.
8408   // FIXME: also use the equivalent of isMemSrcFromConstant and alwaysinlining
8409   // if the memcpy is in a tail call position.
8410 
8411   std::vector<LLT> MemOps;
8412 
8413   const auto &DstMMO = **MI.memoperands_begin();
8414   const auto &SrcMMO = **std::next(MI.memoperands_begin());
8415   MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
8416   MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo();
8417 
8418   if (!findGISelOptimalMemOpLowering(
8419           MemOps, Limit,
8420           MemOp::Copy(KnownLen, DstAlignCanChange, Alignment, SrcAlign,
8421                       IsVolatile),
8422           DstPtrInfo.getAddrSpace(), SrcPtrInfo.getAddrSpace(),
8423           MF.getFunction().getAttributes(), TLI))
8424     return UnableToLegalize;
8425 
8426   if (DstAlignCanChange) {
8427     // Get an estimate of the type from the LLT.
8428     Type *IRTy = getTypeForLLT(MemOps[0], C);
8429     Align NewAlign = DL.getABITypeAlign(IRTy);
8430 
8431     // Don't promote to an alignment that would require dynamic stack
8432     // realignment.
8433     const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
8434     if (!TRI->hasStackRealignment(MF))
8435       while (NewAlign > Alignment && DL.exceedsNaturalStackAlignment(NewAlign))
8436         NewAlign = NewAlign.previous();
8437 
8438     if (NewAlign > Alignment) {
8439       Alignment = NewAlign;
8440       unsigned FI = FIDef->getOperand(1).getIndex();
8441       // Give the stack frame object a larger alignment if needed.
8442       if (MFI.getObjectAlign(FI) < Alignment)
8443         MFI.setObjectAlignment(FI, Alignment);
8444     }
8445   }
8446 
8447   LLVM_DEBUG(dbgs() << "Inlining memcpy: " << MI << " into loads & stores\n");
8448 
8449   MachineIRBuilder MIB(MI);
8450   // Now we need to emit a pair of load and stores for each of the types we've
8451   // collected. I.e. for each type, generate a load from the source pointer of
8452   // that type width, and then generate a corresponding store to the dest buffer
8453   // of that value loaded. This can result in a sequence of loads and stores
8454   // mixed types, depending on what the target specifies as good types to use.
8455   unsigned CurrOffset = 0;
8456   unsigned Size = KnownLen;
8457   for (auto CopyTy : MemOps) {
8458     // Issuing an unaligned load / store pair  that overlaps with the previous
8459     // pair. Adjust the offset accordingly.
8460     if (CopyTy.getSizeInBytes() > Size)
8461       CurrOffset -= CopyTy.getSizeInBytes() - Size;
8462 
8463     // Construct MMOs for the accesses.
8464     auto *LoadMMO =
8465         MF.getMachineMemOperand(&SrcMMO, CurrOffset, CopyTy.getSizeInBytes());
8466     auto *StoreMMO =
8467         MF.getMachineMemOperand(&DstMMO, CurrOffset, CopyTy.getSizeInBytes());
8468 
8469     // Create the load.
8470     Register LoadPtr = Src;
8471     Register Offset;
8472     if (CurrOffset != 0) {
8473       LLT SrcTy = MRI.getType(Src);
8474       Offset = MIB.buildConstant(LLT::scalar(SrcTy.getSizeInBits()), CurrOffset)
8475                    .getReg(0);
8476       LoadPtr = MIB.buildPtrAdd(SrcTy, Src, Offset).getReg(0);
8477     }
8478     auto LdVal = MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO);
8479 
8480     // Create the store.
8481     Register StorePtr = Dst;
8482     if (CurrOffset != 0) {
8483       LLT DstTy = MRI.getType(Dst);
8484       StorePtr = MIB.buildPtrAdd(DstTy, Dst, Offset).getReg(0);
8485     }
8486     MIB.buildStore(LdVal, StorePtr, *StoreMMO);
8487     CurrOffset += CopyTy.getSizeInBytes();
8488     Size -= CopyTy.getSizeInBytes();
8489   }
8490 
8491   MI.eraseFromParent();
8492   return Legalized;
8493 }
8494 
8495 LegalizerHelper::LegalizeResult
8496 LegalizerHelper::lowerMemmove(MachineInstr &MI, Register Dst, Register Src,
8497                               uint64_t KnownLen, Align DstAlign, Align SrcAlign,
8498                               bool IsVolatile) {
8499   auto &MF = *MI.getParent()->getParent();
8500   const auto &TLI = *MF.getSubtarget().getTargetLowering();
8501   auto &DL = MF.getDataLayout();
8502   LLVMContext &C = MF.getFunction().getContext();
8503 
8504   assert(KnownLen != 0 && "Have a zero length memmove length!");
8505 
8506   bool DstAlignCanChange = false;
8507   MachineFrameInfo &MFI = MF.getFrameInfo();
8508   bool OptSize = shouldLowerMemFuncForSize(MF);
8509   Align Alignment = std::min(DstAlign, SrcAlign);
8510 
8511   MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
8512   if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
8513     DstAlignCanChange = true;
8514 
8515   unsigned Limit = TLI.getMaxStoresPerMemmove(OptSize);
8516   std::vector<LLT> MemOps;
8517 
8518   const auto &DstMMO = **MI.memoperands_begin();
8519   const auto &SrcMMO = **std::next(MI.memoperands_begin());
8520   MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
8521   MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo();
8522 
8523   // FIXME: SelectionDAG always passes false for 'AllowOverlap', apparently due
8524   // to a bug in it's findOptimalMemOpLowering implementation. For now do the
8525   // same thing here.
8526   if (!findGISelOptimalMemOpLowering(
8527           MemOps, Limit,
8528           MemOp::Copy(KnownLen, DstAlignCanChange, Alignment, SrcAlign,
8529                       /*IsVolatile*/ true),
8530           DstPtrInfo.getAddrSpace(), SrcPtrInfo.getAddrSpace(),
8531           MF.getFunction().getAttributes(), TLI))
8532     return UnableToLegalize;
8533 
8534   if (DstAlignCanChange) {
8535     // Get an estimate of the type from the LLT.
8536     Type *IRTy = getTypeForLLT(MemOps[0], C);
8537     Align NewAlign = DL.getABITypeAlign(IRTy);
8538 
8539     // Don't promote to an alignment that would require dynamic stack
8540     // realignment.
8541     const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
8542     if (!TRI->hasStackRealignment(MF))
8543       while (NewAlign > Alignment && DL.exceedsNaturalStackAlignment(NewAlign))
8544         NewAlign = NewAlign.previous();
8545 
8546     if (NewAlign > Alignment) {
8547       Alignment = NewAlign;
8548       unsigned FI = FIDef->getOperand(1).getIndex();
8549       // Give the stack frame object a larger alignment if needed.
8550       if (MFI.getObjectAlign(FI) < Alignment)
8551         MFI.setObjectAlignment(FI, Alignment);
8552     }
8553   }
8554 
8555   LLVM_DEBUG(dbgs() << "Inlining memmove: " << MI << " into loads & stores\n");
8556 
8557   MachineIRBuilder MIB(MI);
8558   // Memmove requires that we perform the loads first before issuing the stores.
8559   // Apart from that, this loop is pretty much doing the same thing as the
8560   // memcpy codegen function.
8561   unsigned CurrOffset = 0;
8562   SmallVector<Register, 16> LoadVals;
8563   for (auto CopyTy : MemOps) {
8564     // Construct MMO for the load.
8565     auto *LoadMMO =
8566         MF.getMachineMemOperand(&SrcMMO, CurrOffset, CopyTy.getSizeInBytes());
8567 
8568     // Create the load.
8569     Register LoadPtr = Src;
8570     if (CurrOffset != 0) {
8571       LLT SrcTy = MRI.getType(Src);
8572       auto Offset =
8573           MIB.buildConstant(LLT::scalar(SrcTy.getSizeInBits()), CurrOffset);
8574       LoadPtr = MIB.buildPtrAdd(SrcTy, Src, Offset).getReg(0);
8575     }
8576     LoadVals.push_back(MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO).getReg(0));
8577     CurrOffset += CopyTy.getSizeInBytes();
8578   }
8579 
8580   CurrOffset = 0;
8581   for (unsigned I = 0; I < MemOps.size(); ++I) {
8582     LLT CopyTy = MemOps[I];
8583     // Now store the values loaded.
8584     auto *StoreMMO =
8585         MF.getMachineMemOperand(&DstMMO, CurrOffset, CopyTy.getSizeInBytes());
8586 
8587     Register StorePtr = Dst;
8588     if (CurrOffset != 0) {
8589       LLT DstTy = MRI.getType(Dst);
8590       auto Offset =
8591           MIB.buildConstant(LLT::scalar(DstTy.getSizeInBits()), CurrOffset);
8592       StorePtr = MIB.buildPtrAdd(DstTy, Dst, Offset).getReg(0);
8593     }
8594     MIB.buildStore(LoadVals[I], StorePtr, *StoreMMO);
8595     CurrOffset += CopyTy.getSizeInBytes();
8596   }
8597   MI.eraseFromParent();
8598   return Legalized;
8599 }
8600 
8601 LegalizerHelper::LegalizeResult
8602 LegalizerHelper::lowerMemCpyFamily(MachineInstr &MI, unsigned MaxLen) {
8603   const unsigned Opc = MI.getOpcode();
8604   // This combine is fairly complex so it's not written with a separate
8605   // matcher function.
8606   assert((Opc == TargetOpcode::G_MEMCPY || Opc == TargetOpcode::G_MEMMOVE ||
8607           Opc == TargetOpcode::G_MEMSET) &&
8608          "Expected memcpy like instruction");
8609 
8610   auto MMOIt = MI.memoperands_begin();
8611   const MachineMemOperand *MemOp = *MMOIt;
8612 
8613   Align DstAlign = MemOp->getBaseAlign();
8614   Align SrcAlign;
8615   auto [Dst, Src, Len] = MI.getFirst3Regs();
8616 
8617   if (Opc != TargetOpcode::G_MEMSET) {
8618     assert(MMOIt != MI.memoperands_end() && "Expected a second MMO on MI");
8619     MemOp = *(++MMOIt);
8620     SrcAlign = MemOp->getBaseAlign();
8621   }
8622 
8623   // See if this is a constant length copy
8624   auto LenVRegAndVal = getIConstantVRegValWithLookThrough(Len, MRI);
8625   if (!LenVRegAndVal)
8626     return UnableToLegalize;
8627   uint64_t KnownLen = LenVRegAndVal->Value.getZExtValue();
8628 
8629   if (KnownLen == 0) {
8630     MI.eraseFromParent();
8631     return Legalized;
8632   }
8633 
8634   bool IsVolatile = MemOp->isVolatile();
8635   if (Opc == TargetOpcode::G_MEMCPY_INLINE)
8636     return lowerMemcpyInline(MI, Dst, Src, KnownLen, DstAlign, SrcAlign,
8637                              IsVolatile);
8638 
8639   // Don't try to optimize volatile.
8640   if (IsVolatile)
8641     return UnableToLegalize;
8642 
8643   if (MaxLen && KnownLen > MaxLen)
8644     return UnableToLegalize;
8645 
8646   if (Opc == TargetOpcode::G_MEMCPY) {
8647     auto &MF = *MI.getParent()->getParent();
8648     const auto &TLI = *MF.getSubtarget().getTargetLowering();
8649     bool OptSize = shouldLowerMemFuncForSize(MF);
8650     uint64_t Limit = TLI.getMaxStoresPerMemcpy(OptSize);
8651     return lowerMemcpy(MI, Dst, Src, KnownLen, Limit, DstAlign, SrcAlign,
8652                        IsVolatile);
8653   }
8654   if (Opc == TargetOpcode::G_MEMMOVE)
8655     return lowerMemmove(MI, Dst, Src, KnownLen, DstAlign, SrcAlign, IsVolatile);
8656   if (Opc == TargetOpcode::G_MEMSET)
8657     return lowerMemset(MI, Dst, Src, KnownLen, DstAlign, IsVolatile);
8658   return UnableToLegalize;
8659 }
8660