1 //===- AArch64InstructionSelector.cpp ----------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the InstructionSelector class for
10 /// AArch64.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AArch64GlobalISelUtils.h"
15 #include "AArch64InstrInfo.h"
16 #include "AArch64MachineFunctionInfo.h"
17 #include "AArch64RegisterBankInfo.h"
18 #include "AArch64RegisterInfo.h"
19 #include "AArch64Subtarget.h"
20 #include "AArch64TargetMachine.h"
21 #include "MCTargetDesc/AArch64AddressingModes.h"
22 #include "MCTargetDesc/AArch64MCTargetDesc.h"
23 #include "llvm/ADT/Optional.h"
24 #include "llvm/BinaryFormat/Dwarf.h"
25 #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
26 #include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
27 #include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
28 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
29 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
30 #include "llvm/CodeGen/GlobalISel/Utils.h"
31 #include "llvm/CodeGen/MachineBasicBlock.h"
32 #include "llvm/CodeGen/MachineConstantPool.h"
33 #include "llvm/CodeGen/MachineFrameInfo.h"
34 #include "llvm/CodeGen/MachineFunction.h"
35 #include "llvm/CodeGen/MachineInstr.h"
36 #include "llvm/CodeGen/MachineInstrBuilder.h"
37 #include "llvm/CodeGen/MachineMemOperand.h"
38 #include "llvm/CodeGen/MachineOperand.h"
39 #include "llvm/CodeGen/MachineRegisterInfo.h"
40 #include "llvm/CodeGen/TargetOpcodes.h"
41 #include "llvm/IR/Constants.h"
42 #include "llvm/IR/DerivedTypes.h"
43 #include "llvm/IR/Instructions.h"
44 #include "llvm/IR/IntrinsicsAArch64.h"
45 #include "llvm/IR/PatternMatch.h"
46 #include "llvm/IR/Type.h"
47 #include "llvm/Pass.h"
48 #include "llvm/Support/Debug.h"
49 #include "llvm/Support/raw_ostream.h"
50 
51 #define DEBUG_TYPE "aarch64-isel"
52 
53 using namespace llvm;
54 using namespace MIPatternMatch;
55 using namespace AArch64GISelUtils;
56 
// Forward declarations: these types are only used through pointers in this
// file (see the setupMF() parameters), so the full definitions are not needed.
namespace llvm {
class BlockFrequencyInfo;
class ProfileSummaryInfo;
} // end namespace llvm
61 
62 namespace {
63 
64 #define GET_GLOBALISEL_PREDICATE_BITSET
65 #include "AArch64GenGlobalISel.inc"
66 #undef GET_GLOBALISEL_PREDICATE_BITSET
67 
68 
69 class AArch64InstructionSelector : public InstructionSelector {
70 public:
71   AArch64InstructionSelector(const AArch64TargetMachine &TM,
72                              const AArch64Subtarget &STI,
73                              const AArch64RegisterBankInfo &RBI);
74 
75   bool select(MachineInstr &I) override;
76   static const char *getName() { return DEBUG_TYPE; }
77 
78   void setupMF(MachineFunction &MF, GISelKnownBits *KB,
79                CodeGenCoverage &CoverageInfo, ProfileSummaryInfo *PSI,
80                BlockFrequencyInfo *BFI) override {
81     InstructionSelector::setupMF(MF, KB, CoverageInfo, PSI, BFI);
82     MIB.setMF(MF);
83 
84     // hasFnAttribute() is expensive to call on every BRCOND selection, so
85     // cache it here for each run of the selector.
86     ProduceNonFlagSettingCondBr =
87         !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);
88     MFReturnAddr = Register();
89 
90     processPHIs(MF);
91   }
92 
93 private:
94   /// tblgen-erated 'select' implementation, used as the initial selector for
95   /// the patterns that don't require complex C++.
96   bool selectImpl(MachineInstr &I, CodeGenCoverage &CoverageInfo) const;
97 
98   // A lowering phase that runs before any selection attempts.
99   // Returns true if the instruction was modified.
100   bool preISelLower(MachineInstr &I);
101 
102   // An early selection function that runs before the selectImpl() call.
103   bool earlySelect(MachineInstr &I);
104 
105   // Do some preprocessing of G_PHIs before we begin selection.
106   void processPHIs(MachineFunction &MF);
107 
108   bool earlySelectSHL(MachineInstr &I, MachineRegisterInfo &MRI);
109 
110   /// Eliminate same-sized cross-bank copies into stores before selectImpl().
111   bool contractCrossBankCopyIntoStore(MachineInstr &I,
112                                       MachineRegisterInfo &MRI);
113 
114   bool convertPtrAddToAdd(MachineInstr &I, MachineRegisterInfo &MRI);
115 
116   bool selectVaStartAAPCS(MachineInstr &I, MachineFunction &MF,
117                           MachineRegisterInfo &MRI) const;
118   bool selectVaStartDarwin(MachineInstr &I, MachineFunction &MF,
119                            MachineRegisterInfo &MRI) const;
120 
121   ///@{
122   /// Helper functions for selectCompareBranch.
123   bool selectCompareBranchFedByFCmp(MachineInstr &I, MachineInstr &FCmp,
124                                     MachineIRBuilder &MIB) const;
125   bool selectCompareBranchFedByICmp(MachineInstr &I, MachineInstr &ICmp,
126                                     MachineIRBuilder &MIB) const;
127   bool tryOptCompareBranchFedByICmp(MachineInstr &I, MachineInstr &ICmp,
128                                     MachineIRBuilder &MIB) const;
129   bool tryOptAndIntoCompareBranch(MachineInstr &AndInst, bool Invert,
130                                   MachineBasicBlock *DstMBB,
131                                   MachineIRBuilder &MIB) const;
132   ///@}
133 
134   bool selectCompareBranch(MachineInstr &I, MachineFunction &MF,
135                            MachineRegisterInfo &MRI);
136 
137   bool selectVectorAshrLshr(MachineInstr &I, MachineRegisterInfo &MRI);
138   bool selectVectorSHL(MachineInstr &I, MachineRegisterInfo &MRI);
139 
140   // Helper to generate an equivalent of scalar_to_vector into a new register,
141   // returned via 'Dst'.
142   MachineInstr *emitScalarToVector(unsigned EltSize,
143                                    const TargetRegisterClass *DstRC,
144                                    Register Scalar,
145                                    MachineIRBuilder &MIRBuilder) const;
146 
147   /// Emit a lane insert into \p DstReg, or a new vector register if None is
148   /// provided.
149   ///
150   /// The lane inserted into is defined by \p LaneIdx. The vector source
151   /// register is given by \p SrcReg. The register containing the element is
152   /// given by \p EltReg.
153   MachineInstr *emitLaneInsert(Optional<Register> DstReg, Register SrcReg,
154                                Register EltReg, unsigned LaneIdx,
155                                const RegisterBank &RB,
156                                MachineIRBuilder &MIRBuilder) const;
157 
158   /// Emit a sequence of instructions representing a constant \p CV for a
159   /// vector register \p Dst. (E.g. a MOV, or a load from a constant pool.)
160   ///
161   /// \returns the last instruction in the sequence on success, and nullptr
162   /// otherwise.
163   MachineInstr *emitConstantVector(Register Dst, Constant *CV,
164                                    MachineIRBuilder &MIRBuilder,
165                                    MachineRegisterInfo &MRI);
166 
167   bool selectInsertElt(MachineInstr &I, MachineRegisterInfo &MRI);
168   bool tryOptConstantBuildVec(MachineInstr &MI, LLT DstTy,
169                               MachineRegisterInfo &MRI);
170   /// \returns true if a G_BUILD_VECTOR instruction \p MI can be selected as a
171   /// SUBREG_TO_REG.
172   bool tryOptBuildVecToSubregToReg(MachineInstr &MI, MachineRegisterInfo &MRI);
173   bool selectBuildVector(MachineInstr &I, MachineRegisterInfo &MRI);
174   bool selectMergeValues(MachineInstr &I, MachineRegisterInfo &MRI);
175   bool selectUnmergeValues(MachineInstr &I, MachineRegisterInfo &MRI);
176 
177   bool selectShuffleVector(MachineInstr &I, MachineRegisterInfo &MRI);
178   bool selectExtractElt(MachineInstr &I, MachineRegisterInfo &MRI);
179   bool selectConcatVectors(MachineInstr &I, MachineRegisterInfo &MRI);
180   bool selectSplitVectorUnmerge(MachineInstr &I, MachineRegisterInfo &MRI);
181 
182   /// Helper function to select vector load intrinsics like
183   /// @llvm.aarch64.neon.ld2.*, @llvm.aarch64.neon.ld4.*, etc.
184   /// \p Opc is the opcode that the selected instruction should use.
185   /// \p NumVecs is the number of vector destinations for the instruction.
186   /// \p I is the original G_INTRINSIC_W_SIDE_EFFECTS instruction.
187   bool selectVectorLoadIntrinsic(unsigned Opc, unsigned NumVecs,
188                                  MachineInstr &I);
189   bool selectIntrinsicWithSideEffects(MachineInstr &I,
190                                       MachineRegisterInfo &MRI);
191   bool selectIntrinsic(MachineInstr &I, MachineRegisterInfo &MRI);
192   bool selectVectorICmp(MachineInstr &I, MachineRegisterInfo &MRI);
193   bool selectIntrinsicTrunc(MachineInstr &I, MachineRegisterInfo &MRI) const;
194   bool selectIntrinsicRound(MachineInstr &I, MachineRegisterInfo &MRI) const;
195   bool selectJumpTable(MachineInstr &I, MachineRegisterInfo &MRI);
196   bool selectBrJT(MachineInstr &I, MachineRegisterInfo &MRI);
197   bool selectTLSGlobalValue(MachineInstr &I, MachineRegisterInfo &MRI);
198   bool selectReduction(MachineInstr &I, MachineRegisterInfo &MRI);
199   bool selectMOPS(MachineInstr &I, MachineRegisterInfo &MRI);
200   bool selectUSMovFromExtend(MachineInstr &I, MachineRegisterInfo &MRI);
201 
202   unsigned emitConstantPoolEntry(const Constant *CPVal,
203                                  MachineFunction &MF) const;
204   MachineInstr *emitLoadFromConstantPool(const Constant *CPVal,
205                                          MachineIRBuilder &MIRBuilder) const;
206 
207   // Emit a vector concat operation.
208   MachineInstr *emitVectorConcat(Optional<Register> Dst, Register Op1,
209                                  Register Op2,
210                                  MachineIRBuilder &MIRBuilder) const;
211 
212   // Emit an integer compare between LHS and RHS, which checks for Predicate.
213   MachineInstr *emitIntegerCompare(MachineOperand &LHS, MachineOperand &RHS,
214                                    MachineOperand &Predicate,
215                                    MachineIRBuilder &MIRBuilder) const;
216 
217   /// Emit a floating point comparison between \p LHS and \p RHS.
218   /// \p Pred if given is the intended predicate to use.
219   MachineInstr *emitFPCompare(Register LHS, Register RHS,
220                               MachineIRBuilder &MIRBuilder,
221                               Optional<CmpInst::Predicate> = None) const;
222 
223   MachineInstr *emitInstr(unsigned Opcode,
224                           std::initializer_list<llvm::DstOp> DstOps,
225                           std::initializer_list<llvm::SrcOp> SrcOps,
226                           MachineIRBuilder &MIRBuilder,
227                           const ComplexRendererFns &RenderFns = None) const;
228   /// Helper function to emit an add or sub instruction.
229   ///
230   /// \p AddrModeAndSizeToOpcode must contain each of the opcode variants above
231   /// in a specific order.
232   ///
233   /// Below is an example of the expected input to \p AddrModeAndSizeToOpcode.
234   ///
235   /// \code
236   ///   const std::array<std::array<unsigned, 2>, 4> Table {
237   ///    {{AArch64::ADDXri, AArch64::ADDWri},
238   ///     {AArch64::ADDXrs, AArch64::ADDWrs},
239   ///     {AArch64::ADDXrr, AArch64::ADDWrr},
240   ///     {AArch64::SUBXri, AArch64::SUBWri},
241   ///     {AArch64::ADDXrx, AArch64::ADDWrx}}};
242   /// \endcode
243   ///
244   /// Each row in the table corresponds to a different addressing mode. Each
245   /// column corresponds to a different register size.
246   ///
247   /// \attention Rows must be structured as follows:
248   ///   - Row 0: The ri opcode variants
249   ///   - Row 1: The rs opcode variants
250   ///   - Row 2: The rr opcode variants
251   ///   - Row 3: The ri opcode variants for negative immediates
252   ///   - Row 4: The rx opcode variants
253   ///
254   /// \attention Columns must be structured as follows:
255   ///   - Column 0: The 64-bit opcode variants
256   ///   - Column 1: The 32-bit opcode variants
257   ///
258   /// \p Dst is the destination register of the binop to emit.
259   /// \p LHS is the left-hand operand of the binop to emit.
260   /// \p RHS is the right-hand operand of the binop to emit.
261   MachineInstr *emitAddSub(
262       const std::array<std::array<unsigned, 2>, 5> &AddrModeAndSizeToOpcode,
263       Register Dst, MachineOperand &LHS, MachineOperand &RHS,
264       MachineIRBuilder &MIRBuilder) const;
265   MachineInstr *emitADD(Register DefReg, MachineOperand &LHS,
266                         MachineOperand &RHS,
267                         MachineIRBuilder &MIRBuilder) const;
268   MachineInstr *emitADDS(Register Dst, MachineOperand &LHS, MachineOperand &RHS,
269                          MachineIRBuilder &MIRBuilder) const;
270   MachineInstr *emitSUBS(Register Dst, MachineOperand &LHS, MachineOperand &RHS,
271                          MachineIRBuilder &MIRBuilder) const;
272   MachineInstr *emitCMN(MachineOperand &LHS, MachineOperand &RHS,
273                         MachineIRBuilder &MIRBuilder) const;
274   MachineInstr *emitTST(MachineOperand &LHS, MachineOperand &RHS,
275                         MachineIRBuilder &MIRBuilder) const;
276   MachineInstr *emitSelect(Register Dst, Register LHS, Register RHS,
277                            AArch64CC::CondCode CC,
278                            MachineIRBuilder &MIRBuilder) const;
279   MachineInstr *emitExtractVectorElt(Optional<Register> DstReg,
280                                      const RegisterBank &DstRB, LLT ScalarTy,
281                                      Register VecReg, unsigned LaneIdx,
282                                      MachineIRBuilder &MIRBuilder) const;
283   MachineInstr *emitCSINC(Register Dst, Register Src1, Register Src2,
284                           AArch64CC::CondCode Pred,
285                           MachineIRBuilder &MIRBuilder) const;
286   /// Emit a CSet for a FP compare.
287   ///
288   /// \p Dst is expected to be a 32-bit scalar register.
289   MachineInstr *emitCSetForFCmp(Register Dst, CmpInst::Predicate Pred,
290                                 MachineIRBuilder &MIRBuilder) const;
291 
292   /// Emit the overflow op for \p Opcode.
293   ///
294   /// \p Opcode is expected to be an overflow op's opcode, e.g. G_UADDO,
295   /// G_USUBO, etc.
296   std::pair<MachineInstr *, AArch64CC::CondCode>
297   emitOverflowOp(unsigned Opcode, Register Dst, MachineOperand &LHS,
298                  MachineOperand &RHS, MachineIRBuilder &MIRBuilder) const;
299 
300   /// Emit expression as a conjunction (a series of CCMP/CFCMP ops).
301   /// In some cases this is even possible with OR operations in the expression.
302   MachineInstr *emitConjunction(Register Val, AArch64CC::CondCode &OutCC,
303                                 MachineIRBuilder &MIB) const;
304   MachineInstr *emitConditionalComparison(Register LHS, Register RHS,
305                                           CmpInst::Predicate CC,
306                                           AArch64CC::CondCode Predicate,
307                                           AArch64CC::CondCode OutCC,
308                                           MachineIRBuilder &MIB) const;
309   MachineInstr *emitConjunctionRec(Register Val, AArch64CC::CondCode &OutCC,
310                                    bool Negate, Register CCOp,
311                                    AArch64CC::CondCode Predicate,
312                                    MachineIRBuilder &MIB) const;
313 
314   /// Emit a TB(N)Z instruction which tests \p Bit in \p TestReg.
315   /// \p IsNegative is true if the test should be "not zero".
316   /// This will also optimize the test bit instruction when possible.
317   MachineInstr *emitTestBit(Register TestReg, uint64_t Bit, bool IsNegative,
318                             MachineBasicBlock *DstMBB,
319                             MachineIRBuilder &MIB) const;
320 
321   /// Emit a CB(N)Z instruction which branches to \p DestMBB.
322   MachineInstr *emitCBZ(Register CompareReg, bool IsNegative,
323                         MachineBasicBlock *DestMBB,
324                         MachineIRBuilder &MIB) const;
325 
326   // Equivalent to the i32shift_a and friends from AArch64InstrInfo.td.
327   // We use these manually instead of using the importer since it doesn't
328   // support SDNodeXForm.
329   ComplexRendererFns selectShiftA_32(const MachineOperand &Root) const;
330   ComplexRendererFns selectShiftB_32(const MachineOperand &Root) const;
331   ComplexRendererFns selectShiftA_64(const MachineOperand &Root) const;
332   ComplexRendererFns selectShiftB_64(const MachineOperand &Root) const;
333 
334   ComplexRendererFns select12BitValueWithLeftShift(uint64_t Immed) const;
335   ComplexRendererFns selectArithImmed(MachineOperand &Root) const;
336   ComplexRendererFns selectNegArithImmed(MachineOperand &Root) const;
337 
338   ComplexRendererFns selectAddrModeUnscaled(MachineOperand &Root,
339                                             unsigned Size) const;
340 
341   ComplexRendererFns selectAddrModeUnscaled8(MachineOperand &Root) const {
342     return selectAddrModeUnscaled(Root, 1);
343   }
344   ComplexRendererFns selectAddrModeUnscaled16(MachineOperand &Root) const {
345     return selectAddrModeUnscaled(Root, 2);
346   }
347   ComplexRendererFns selectAddrModeUnscaled32(MachineOperand &Root) const {
348     return selectAddrModeUnscaled(Root, 4);
349   }
350   ComplexRendererFns selectAddrModeUnscaled64(MachineOperand &Root) const {
351     return selectAddrModeUnscaled(Root, 8);
352   }
353   ComplexRendererFns selectAddrModeUnscaled128(MachineOperand &Root) const {
354     return selectAddrModeUnscaled(Root, 16);
355   }
356 
357   /// Helper to try to fold in a GISEL_ADD_LOW into an immediate, to be used
358   /// from complex pattern matchers like selectAddrModeIndexed().
359   ComplexRendererFns tryFoldAddLowIntoImm(MachineInstr &RootDef, unsigned Size,
360                                           MachineRegisterInfo &MRI) const;
361 
362   ComplexRendererFns selectAddrModeIndexed(MachineOperand &Root,
363                                            unsigned Size) const;
364   template <int Width>
365   ComplexRendererFns selectAddrModeIndexed(MachineOperand &Root) const {
366     return selectAddrModeIndexed(Root, Width / 8);
367   }
368 
369   bool isWorthFoldingIntoExtendedReg(MachineInstr &MI,
370                                      const MachineRegisterInfo &MRI) const;
371   ComplexRendererFns
372   selectAddrModeShiftedExtendXReg(MachineOperand &Root,
373                                   unsigned SizeInBytes) const;
374 
375   /// Returns a \p ComplexRendererFns which contains a base, offset, and whether
376   /// or not a shift + extend should be folded into an addressing mode. Returns
377   /// None when this is not profitable or possible.
378   ComplexRendererFns
379   selectExtendedSHL(MachineOperand &Root, MachineOperand &Base,
380                     MachineOperand &Offset, unsigned SizeInBytes,
381                     bool WantsExt) const;
382   ComplexRendererFns selectAddrModeRegisterOffset(MachineOperand &Root) const;
383   ComplexRendererFns selectAddrModeXRO(MachineOperand &Root,
384                                        unsigned SizeInBytes) const;
385   template <int Width>
386   ComplexRendererFns selectAddrModeXRO(MachineOperand &Root) const {
387     return selectAddrModeXRO(Root, Width / 8);
388   }
389 
390   ComplexRendererFns selectAddrModeWRO(MachineOperand &Root,
391                                        unsigned SizeInBytes) const;
392   template <int Width>
393   ComplexRendererFns selectAddrModeWRO(MachineOperand &Root) const {
394     return selectAddrModeWRO(Root, Width / 8);
395   }
396 
397   ComplexRendererFns selectShiftedRegister(MachineOperand &Root,
398                                            bool AllowROR = false) const;
399 
400   ComplexRendererFns selectArithShiftedRegister(MachineOperand &Root) const {
401     return selectShiftedRegister(Root);
402   }
403 
404   ComplexRendererFns selectLogicalShiftedRegister(MachineOperand &Root) const {
405     return selectShiftedRegister(Root, true);
406   }
407 
408   /// Given an extend instruction, determine the correct shift-extend type for
409   /// that instruction.
410   ///
411   /// If the instruction is going to be used in a load or store, pass
412   /// \p IsLoadStore = true.
413   AArch64_AM::ShiftExtendType
414   getExtendTypeForInst(MachineInstr &MI, MachineRegisterInfo &MRI,
415                        bool IsLoadStore = false) const;
416 
417   /// Move \p Reg to \p RC if \p Reg is not already on \p RC.
418   ///
419   /// \returns Either \p Reg if no change was necessary, or the new register
420   /// created by moving \p Reg.
421   ///
422   /// Note: This uses emitCopy right now.
423   Register moveScalarRegClass(Register Reg, const TargetRegisterClass &RC,
424                               MachineIRBuilder &MIB) const;
425 
426   ComplexRendererFns selectArithExtendedRegister(MachineOperand &Root) const;
427 
428   void renderTruncImm(MachineInstrBuilder &MIB, const MachineInstr &MI,
429                       int OpIdx = -1) const;
430   void renderLogicalImm32(MachineInstrBuilder &MIB, const MachineInstr &I,
431                           int OpIdx = -1) const;
432   void renderLogicalImm64(MachineInstrBuilder &MIB, const MachineInstr &I,
433                           int OpIdx = -1) const;
434   void renderFPImm16(MachineInstrBuilder &MIB, const MachineInstr &MI,
435                      int OpIdx = -1) const;
436   void renderFPImm32(MachineInstrBuilder &MIB, const MachineInstr &MI,
437                      int OpIdx = -1) const;
438   void renderFPImm64(MachineInstrBuilder &MIB, const MachineInstr &MI,
439                      int OpIdx = -1) const;
440   void renderFPImm32SIMDModImmType4(MachineInstrBuilder &MIB,
441                                     const MachineInstr &MI,
442                                     int OpIdx = -1) const;
443 
444   // Materialize a GlobalValue or BlockAddress using a movz+movk sequence.
445   void materializeLargeCMVal(MachineInstr &I, const Value *V, unsigned OpFlags);
446 
447   // Optimization methods.
448   bool tryOptSelect(GSelect &Sel);
449   bool tryOptSelectConjunction(GSelect &Sel, MachineInstr &CondMI);
450   MachineInstr *tryFoldIntegerCompare(MachineOperand &LHS, MachineOperand &RHS,
451                                       MachineOperand &Predicate,
452                                       MachineIRBuilder &MIRBuilder) const;
453 
454   /// Return true if \p MI is a load or store of \p NumBytes bytes.
455   bool isLoadStoreOfNumBytes(const MachineInstr &MI, unsigned NumBytes) const;
456 
457   /// Returns true if \p MI is guaranteed to have the high-half of a 64-bit
458   /// register zeroed out. In other words, the result of MI has been explicitly
459   /// zero extended.
460   bool isDef32(const MachineInstr &MI) const;
461 
462   const AArch64TargetMachine &TM;
463   const AArch64Subtarget &STI;
464   const AArch64InstrInfo &TII;
465   const AArch64RegisterInfo &TRI;
466   const AArch64RegisterBankInfo &RBI;
467 
468   bool ProduceNonFlagSettingCondBr = false;
469 
470   // Some cached values used during selection.
471   // We use LR as a live-in register, and we keep track of it here as it can be
472   // clobbered by calls.
473   Register MFReturnAddr;
474 
475   MachineIRBuilder MIB;
476 
477 #define GET_GLOBALISEL_PREDICATES_DECL
478 #include "AArch64GenGlobalISel.inc"
479 #undef GET_GLOBALISEL_PREDICATES_DECL
480 
481 // We declare the temporaries used by selectImpl() in the class to minimize the
482 // cost of constructing placeholder values.
483 #define GET_GLOBALISEL_TEMPORARIES_DECL
484 #include "AArch64GenGlobalISel.inc"
485 #undef GET_GLOBALISEL_TEMPORARIES_DECL
486 };
487 
488 } // end anonymous namespace
489 
490 #define GET_GLOBALISEL_IMPL
491 #include "AArch64GenGlobalISel.inc"
492 #undef GET_GLOBALISEL_IMPL
493 
/// Construct the selector for one subtarget.
///
/// The references to \p TM, \p STI and \p RBI are stored as-is; the
/// instruction and register info are taken from \p STI. The remaining
/// initializers (predicate and temporary state used by selectImpl()) are
/// supplied by the TableGen-erated include below.
AArch64InstructionSelector::AArch64InstructionSelector(
    const AArch64TargetMachine &TM, const AArch64Subtarget &STI,
    const AArch64RegisterBankInfo &RBI)
    : TM(TM), STI(STI), TII(*STI.getInstrInfo()), TRI(*STI.getRegisterInfo()),
      RBI(RBI),
// The generated fragments continue the member-initializer list, which is why
// the RBI initializer above ends with a comma.
#define GET_GLOBALISEL_PREDICATES_INIT
#include "AArch64GenGlobalISel.inc"
#undef GET_GLOBALISEL_PREDICATES_INIT
#define GET_GLOBALISEL_TEMPORARIES_INIT
#include "AArch64GenGlobalISel.inc"
#undef GET_GLOBALISEL_TEMPORARIES_INIT
{
}
507 
508 // FIXME: This should be target-independent, inferred from the types declared
509 // for each class in the bank.
510 //
511 /// Given a register bank, and a type, return the smallest register class that
512 /// can represent that combination.
513 static const TargetRegisterClass *
514 getRegClassForTypeOnBank(LLT Ty, const RegisterBank &RB,
515                          bool GetAllRegSet = false) {
516   if (RB.getID() == AArch64::GPRRegBankID) {
517     if (Ty.getSizeInBits() <= 32)
518       return GetAllRegSet ? &AArch64::GPR32allRegClass
519                           : &AArch64::GPR32RegClass;
520     if (Ty.getSizeInBits() == 64)
521       return GetAllRegSet ? &AArch64::GPR64allRegClass
522                           : &AArch64::GPR64RegClass;
523     if (Ty.getSizeInBits() == 128)
524       return &AArch64::XSeqPairsClassRegClass;
525     return nullptr;
526   }
527 
528   if (RB.getID() == AArch64::FPRRegBankID) {
529     switch (Ty.getSizeInBits()) {
530     case 8:
531       return &AArch64::FPR8RegClass;
532     case 16:
533       return &AArch64::FPR16RegClass;
534     case 32:
535       return &AArch64::FPR32RegClass;
536     case 64:
537       return &AArch64::FPR64RegClass;
538     case 128:
539       return &AArch64::FPR128RegClass;
540     }
541     return nullptr;
542   }
543 
544   return nullptr;
545 }
546 
547 /// Given a register bank, and size in bits, return the smallest register class
548 /// that can represent that combination.
549 static const TargetRegisterClass *
550 getMinClassForRegBank(const RegisterBank &RB, unsigned SizeInBits,
551                       bool GetAllRegSet = false) {
552   unsigned RegBankID = RB.getID();
553 
554   if (RegBankID == AArch64::GPRRegBankID) {
555     if (SizeInBits <= 32)
556       return GetAllRegSet ? &AArch64::GPR32allRegClass
557                           : &AArch64::GPR32RegClass;
558     if (SizeInBits == 64)
559       return GetAllRegSet ? &AArch64::GPR64allRegClass
560                           : &AArch64::GPR64RegClass;
561     if (SizeInBits == 128)
562       return &AArch64::XSeqPairsClassRegClass;
563   }
564 
565   if (RegBankID == AArch64::FPRRegBankID) {
566     switch (SizeInBits) {
567     default:
568       return nullptr;
569     case 8:
570       return &AArch64::FPR8RegClass;
571     case 16:
572       return &AArch64::FPR16RegClass;
573     case 32:
574       return &AArch64::FPR32RegClass;
575     case 64:
576       return &AArch64::FPR64RegClass;
577     case 128:
578       return &AArch64::FPR128RegClass;
579     }
580   }
581 
582   return nullptr;
583 }
584 
585 /// Returns the correct subregister to use for a given register class.
586 static bool getSubRegForClass(const TargetRegisterClass *RC,
587                               const TargetRegisterInfo &TRI, unsigned &SubReg) {
588   switch (TRI.getRegSizeInBits(*RC)) {
589   case 8:
590     SubReg = AArch64::bsub;
591     break;
592   case 16:
593     SubReg = AArch64::hsub;
594     break;
595   case 32:
596     if (RC != &AArch64::FPR32RegClass)
597       SubReg = AArch64::sub_32;
598     else
599       SubReg = AArch64::ssub;
600     break;
601   case 64:
602     SubReg = AArch64::dsub;
603     break;
604   default:
605     LLVM_DEBUG(
606         dbgs() << "Couldn't find appropriate subregister for register class.");
607     return false;
608   }
609 
610   return true;
611 }
612 
613 /// Returns the minimum size the given register bank can hold.
614 static unsigned getMinSizeForRegBank(const RegisterBank &RB) {
615   switch (RB.getID()) {
616   case AArch64::GPRRegBankID:
617     return 32;
618   case AArch64::FPRRegBankID:
619     return 8;
620   default:
621     llvm_unreachable("Tried to get minimum size for unknown register bank.");
622   }
623 }
624 
625 /// Create a REG_SEQUENCE instruction using the registers in \p Regs.
626 /// Helper function for functions like createDTuple and createQTuple.
627 ///
628 /// \p RegClassIDs - The list of register class IDs available for some tuple of
629 /// a scalar class. E.g. QQRegClassID, QQQRegClassID, QQQQRegClassID. This is
630 /// expected to contain between 2 and 4 tuple classes.
631 ///
632 /// \p SubRegs - The list of subregister classes associated with each register
633 /// class ID in \p RegClassIDs. E.g., QQRegClassID should use the qsub0
634 /// subregister class. The index of each subregister class is expected to
635 /// correspond with the index of each register class.
636 ///
637 /// \returns Either the destination register of REG_SEQUENCE instruction that
638 /// was created, or the 0th element of \p Regs if \p Regs contains a single
639 /// element.
640 static Register createTuple(ArrayRef<Register> Regs,
641                             const unsigned RegClassIDs[],
642                             const unsigned SubRegs[], MachineIRBuilder &MIB) {
643   unsigned NumRegs = Regs.size();
644   if (NumRegs == 1)
645     return Regs[0];
646   assert(NumRegs >= 2 && NumRegs <= 4 &&
647          "Only support between two and 4 registers in a tuple!");
648   const TargetRegisterInfo *TRI = MIB.getMF().getSubtarget().getRegisterInfo();
649   auto *DesiredClass = TRI->getRegClass(RegClassIDs[NumRegs - 2]);
650   auto RegSequence =
651       MIB.buildInstr(TargetOpcode::REG_SEQUENCE, {DesiredClass}, {});
652   for (unsigned I = 0, E = Regs.size(); I < E; ++I) {
653     RegSequence.addUse(Regs[I]);
654     RegSequence.addImm(SubRegs[I]);
655   }
656   return RegSequence.getReg(0);
657 }
658 
659 /// Create a tuple of D-registers using the registers in \p Regs.
660 static Register createDTuple(ArrayRef<Register> Regs, MachineIRBuilder &MIB) {
661   static const unsigned RegClassIDs[] = {
662       AArch64::DDRegClassID, AArch64::DDDRegClassID, AArch64::DDDDRegClassID};
663   static const unsigned SubRegs[] = {AArch64::dsub0, AArch64::dsub1,
664                                      AArch64::dsub2, AArch64::dsub3};
665   return createTuple(Regs, RegClassIDs, SubRegs, MIB);
666 }
667 
668 /// Create a tuple of Q-registers using the registers in \p Regs.
669 static Register createQTuple(ArrayRef<Register> Regs, MachineIRBuilder &MIB) {
670   static const unsigned RegClassIDs[] = {
671       AArch64::QQRegClassID, AArch64::QQQRegClassID, AArch64::QQQQRegClassID};
672   static const unsigned SubRegs[] = {AArch64::qsub0, AArch64::qsub1,
673                                      AArch64::qsub2, AArch64::qsub3};
674   return createTuple(Regs, RegClassIDs, SubRegs, MIB);
675 }
676 
677 static Optional<uint64_t> getImmedFromMO(const MachineOperand &Root) {
678   auto &MI = *Root.getParent();
679   auto &MBB = *MI.getParent();
680   auto &MF = *MBB.getParent();
681   auto &MRI = MF.getRegInfo();
682   uint64_t Immed;
683   if (Root.isImm())
684     Immed = Root.getImm();
685   else if (Root.isCImm())
686     Immed = Root.getCImm()->getZExtValue();
687   else if (Root.isReg()) {
688     auto ValAndVReg =
689         getIConstantVRegValWithLookThrough(Root.getReg(), MRI, true);
690     if (!ValAndVReg)
691       return None;
692     Immed = ValAndVReg->Value.getSExtValue();
693   } else
694     return None;
695   return Immed;
696 }
697 
698 /// Check whether \p I is a currently unsupported binary operation:
699 /// - it has an unsized type
700 /// - an operand is not a vreg
701 /// - all operands are not in the same bank
702 /// These are checks that should someday live in the verifier, but right now,
703 /// these are mostly limitations of the aarch64 selector.
704 static bool unsupportedBinOp(const MachineInstr &I,
705                              const AArch64RegisterBankInfo &RBI,
706                              const MachineRegisterInfo &MRI,
707                              const AArch64RegisterInfo &TRI) {
708   LLT Ty = MRI.getType(I.getOperand(0).getReg());
709   if (!Ty.isValid()) {
710     LLVM_DEBUG(dbgs() << "Generic binop register should be typed\n");
711     return true;
712   }
713 
714   const RegisterBank *PrevOpBank = nullptr;
715   for (auto &MO : I.operands()) {
716     // FIXME: Support non-register operands.
717     if (!MO.isReg()) {
718       LLVM_DEBUG(dbgs() << "Generic inst non-reg operands are unsupported\n");
719       return true;
720     }
721 
722     // FIXME: Can generic operations have physical registers operands? If
723     // so, this will need to be taught about that, and we'll need to get the
724     // bank out of the minimal class for the register.
725     // Either way, this needs to be documented (and possibly verified).
726     if (!Register::isVirtualRegister(MO.getReg())) {
727       LLVM_DEBUG(dbgs() << "Generic inst has physical register operand\n");
728       return true;
729     }
730 
731     const RegisterBank *OpBank = RBI.getRegBank(MO.getReg(), MRI, TRI);
732     if (!OpBank) {
733       LLVM_DEBUG(dbgs() << "Generic register has no bank or class\n");
734       return true;
735     }
736 
737     if (PrevOpBank && OpBank != PrevOpBank) {
738       LLVM_DEBUG(dbgs() << "Generic inst operands have different banks\n");
739       return true;
740     }
741     PrevOpBank = OpBank;
742   }
743   return false;
744 }
745 
746 /// Select the AArch64 opcode for the basic binary operation \p GenericOpc
747 /// (such as G_OR or G_SDIV), appropriate for the register bank \p RegBankID
748 /// and of size \p OpSize.
749 /// \returns \p GenericOpc if the combination is unsupported.
750 static unsigned selectBinaryOp(unsigned GenericOpc, unsigned RegBankID,
751                                unsigned OpSize) {
752   switch (RegBankID) {
753   case AArch64::GPRRegBankID:
754     if (OpSize == 32) {
755       switch (GenericOpc) {
756       case TargetOpcode::G_SHL:
757         return AArch64::LSLVWr;
758       case TargetOpcode::G_LSHR:
759         return AArch64::LSRVWr;
760       case TargetOpcode::G_ASHR:
761         return AArch64::ASRVWr;
762       default:
763         return GenericOpc;
764       }
765     } else if (OpSize == 64) {
766       switch (GenericOpc) {
767       case TargetOpcode::G_PTR_ADD:
768         return AArch64::ADDXrr;
769       case TargetOpcode::G_SHL:
770         return AArch64::LSLVXr;
771       case TargetOpcode::G_LSHR:
772         return AArch64::LSRVXr;
773       case TargetOpcode::G_ASHR:
774         return AArch64::ASRVXr;
775       default:
776         return GenericOpc;
777       }
778     }
779     break;
780   case AArch64::FPRRegBankID:
781     switch (OpSize) {
782     case 32:
783       switch (GenericOpc) {
784       case TargetOpcode::G_FADD:
785         return AArch64::FADDSrr;
786       case TargetOpcode::G_FSUB:
787         return AArch64::FSUBSrr;
788       case TargetOpcode::G_FMUL:
789         return AArch64::FMULSrr;
790       case TargetOpcode::G_FDIV:
791         return AArch64::FDIVSrr;
792       default:
793         return GenericOpc;
794       }
795     case 64:
796       switch (GenericOpc) {
797       case TargetOpcode::G_FADD:
798         return AArch64::FADDDrr;
799       case TargetOpcode::G_FSUB:
800         return AArch64::FSUBDrr;
801       case TargetOpcode::G_FMUL:
802         return AArch64::FMULDrr;
803       case TargetOpcode::G_FDIV:
804         return AArch64::FDIVDrr;
805       case TargetOpcode::G_OR:
806         return AArch64::ORRv8i8;
807       default:
808         return GenericOpc;
809       }
810     }
811     break;
812   }
813   return GenericOpc;
814 }
815 
816 /// Select the AArch64 opcode for the G_LOAD or G_STORE operation \p GenericOpc,
817 /// appropriate for the (value) register bank \p RegBankID and of memory access
818 /// size \p OpSize.  This returns the variant with the base+unsigned-immediate
819 /// addressing mode (e.g., LDRXui).
820 /// \returns \p GenericOpc if the combination is unsupported.
821 static unsigned selectLoadStoreUIOp(unsigned GenericOpc, unsigned RegBankID,
822                                     unsigned OpSize) {
823   const bool isStore = GenericOpc == TargetOpcode::G_STORE;
824   switch (RegBankID) {
825   case AArch64::GPRRegBankID:
826     switch (OpSize) {
827     case 8:
828       return isStore ? AArch64::STRBBui : AArch64::LDRBBui;
829     case 16:
830       return isStore ? AArch64::STRHHui : AArch64::LDRHHui;
831     case 32:
832       return isStore ? AArch64::STRWui : AArch64::LDRWui;
833     case 64:
834       return isStore ? AArch64::STRXui : AArch64::LDRXui;
835     }
836     break;
837   case AArch64::FPRRegBankID:
838     switch (OpSize) {
839     case 8:
840       return isStore ? AArch64::STRBui : AArch64::LDRBui;
841     case 16:
842       return isStore ? AArch64::STRHui : AArch64::LDRHui;
843     case 32:
844       return isStore ? AArch64::STRSui : AArch64::LDRSui;
845     case 64:
846       return isStore ? AArch64::STRDui : AArch64::LDRDui;
847     case 128:
848       return isStore ? AArch64::STRQui : AArch64::LDRQui;
849     }
850     break;
851   }
852   return GenericOpc;
853 }
854 
855 /// Helper function for selectCopy. Inserts a subregister copy from \p SrcReg
856 /// to \p *To.
857 ///
858 /// E.g "To = COPY SrcReg:SubReg"
859 static bool copySubReg(MachineInstr &I, MachineRegisterInfo &MRI,
860                        const RegisterBankInfo &RBI, Register SrcReg,
861                        const TargetRegisterClass *To, unsigned SubReg) {
862   assert(SrcReg.isValid() && "Expected a valid source register?");
863   assert(To && "Destination register class cannot be null");
864   assert(SubReg && "Expected a valid subregister");
865 
866   MachineIRBuilder MIB(I);
867   auto SubRegCopy =
868       MIB.buildInstr(TargetOpcode::COPY, {To}, {}).addReg(SrcReg, 0, SubReg);
869   MachineOperand &RegOp = I.getOperand(1);
870   RegOp.setReg(SubRegCopy.getReg(0));
871 
872   // It's possible that the destination register won't be constrained. Make
873   // sure that happens.
874   if (!Register::isPhysicalRegister(I.getOperand(0).getReg()))
875     RBI.constrainGenericRegister(I.getOperand(0).getReg(), *To, MRI);
876 
877   return true;
878 }
879 
880 /// Helper function to get the source and destination register classes for a
881 /// copy. Returns a std::pair containing the source register class for the
882 /// copy, and the destination register class for the copy. If a register class
883 /// cannot be determined, then it will be nullptr.
884 static std::pair<const TargetRegisterClass *, const TargetRegisterClass *>
885 getRegClassesForCopy(MachineInstr &I, const TargetInstrInfo &TII,
886                      MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
887                      const RegisterBankInfo &RBI) {
888   Register DstReg = I.getOperand(0).getReg();
889   Register SrcReg = I.getOperand(1).getReg();
890   const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI);
891   const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI);
892   unsigned DstSize = RBI.getSizeInBits(DstReg, MRI, TRI);
893   unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI);
894 
895   // Special casing for cross-bank copies of s1s. We can technically represent
896   // a 1-bit value with any size of register. The minimum size for a GPR is 32
897   // bits. So, we need to put the FPR on 32 bits as well.
898   //
899   // FIXME: I'm not sure if this case holds true outside of copies. If it does,
900   // then we can pull it into the helpers that get the appropriate class for a
901   // register bank. Or make a new helper that carries along some constraint
902   // information.
903   if (SrcRegBank != DstRegBank && (DstSize == 1 && SrcSize == 1))
904     SrcSize = DstSize = 32;
905 
906   return {getMinClassForRegBank(SrcRegBank, SrcSize, true),
907           getMinClassForRegBank(DstRegBank, DstSize, true)};
908 }
909 
910 static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
911                        MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
912                        const RegisterBankInfo &RBI) {
913   Register DstReg = I.getOperand(0).getReg();
914   Register SrcReg = I.getOperand(1).getReg();
915   const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI);
916   const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI);
917 
918   // Find the correct register classes for the source and destination registers.
919   const TargetRegisterClass *SrcRC;
920   const TargetRegisterClass *DstRC;
921   std::tie(SrcRC, DstRC) = getRegClassesForCopy(I, TII, MRI, TRI, RBI);
922 
923   if (!DstRC) {
924     LLVM_DEBUG(dbgs() << "Unexpected dest size "
925                       << RBI.getSizeInBits(DstReg, MRI, TRI) << '\n');
926     return false;
927   }
928 
929   // Is this a copy? If so, then we may need to insert a subregister copy.
930   if (I.isCopy()) {
931     // Yes. Check if there's anything to fix up.
932     if (!SrcRC) {
933       LLVM_DEBUG(dbgs() << "Couldn't determine source register class\n");
934       return false;
935     }
936 
937     unsigned SrcSize = TRI.getRegSizeInBits(*SrcRC);
938     unsigned DstSize = TRI.getRegSizeInBits(*DstRC);
939     unsigned SubReg;
940 
941     // If the source bank doesn't support a subregister copy small enough,
942     // then we first need to copy to the destination bank.
943     if (getMinSizeForRegBank(SrcRegBank) > DstSize) {
944       const TargetRegisterClass *DstTempRC =
945           getMinClassForRegBank(DstRegBank, SrcSize, /* GetAllRegSet */ true);
946       getSubRegForClass(DstRC, TRI, SubReg);
947 
948       MachineIRBuilder MIB(I);
949       auto Copy = MIB.buildCopy({DstTempRC}, {SrcReg});
950       copySubReg(I, MRI, RBI, Copy.getReg(0), DstRC, SubReg);
951     } else if (SrcSize > DstSize) {
952       // If the source register is bigger than the destination we need to
953       // perform a subregister copy.
954       const TargetRegisterClass *SubRegRC =
955           getMinClassForRegBank(SrcRegBank, DstSize, /* GetAllRegSet */ true);
956       getSubRegForClass(SubRegRC, TRI, SubReg);
957       copySubReg(I, MRI, RBI, SrcReg, DstRC, SubReg);
958     } else if (DstSize > SrcSize) {
959       // If the destination register is bigger than the source we need to do
960       // a promotion using SUBREG_TO_REG.
961       const TargetRegisterClass *PromotionRC =
962           getMinClassForRegBank(SrcRegBank, DstSize, /* GetAllRegSet */ true);
963       getSubRegForClass(SrcRC, TRI, SubReg);
964 
965       Register PromoteReg = MRI.createVirtualRegister(PromotionRC);
966       BuildMI(*I.getParent(), I, I.getDebugLoc(),
967               TII.get(AArch64::SUBREG_TO_REG), PromoteReg)
968           .addImm(0)
969           .addUse(SrcReg)
970           .addImm(SubReg);
971       MachineOperand &RegOp = I.getOperand(1);
972       RegOp.setReg(PromoteReg);
973     }
974 
975     // If the destination is a physical register, then there's nothing to
976     // change, so we're done.
977     if (Register::isPhysicalRegister(DstReg))
978       return true;
979   }
980 
981   // No need to constrain SrcReg. It will get constrained when we hit another
982   // of its use or its defs. Copies do not have constraints.
983   if (!RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
984     LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode())
985                       << " operand\n");
986     return false;
987   }
988 
989   // If this a GPR ZEXT that we want to just reduce down into a copy.
990   // The sizes will be mismatched with the source < 32b but that's ok.
991   if (I.getOpcode() == TargetOpcode::G_ZEXT) {
992     I.setDesc(TII.get(AArch64::COPY));
993     assert(SrcRegBank.getID() == AArch64::GPRRegBankID);
994     return selectCopy(I, TII, MRI, TRI, RBI);
995   }
996 
997   I.setDesc(TII.get(AArch64::COPY));
998   return true;
999 }
1000 
1001 static unsigned selectFPConvOpc(unsigned GenericOpc, LLT DstTy, LLT SrcTy) {
1002   if (!DstTy.isScalar() || !SrcTy.isScalar())
1003     return GenericOpc;
1004 
1005   const unsigned DstSize = DstTy.getSizeInBits();
1006   const unsigned SrcSize = SrcTy.getSizeInBits();
1007 
1008   switch (DstSize) {
1009   case 32:
1010     switch (SrcSize) {
1011     case 32:
1012       switch (GenericOpc) {
1013       case TargetOpcode::G_SITOFP:
1014         return AArch64::SCVTFUWSri;
1015       case TargetOpcode::G_UITOFP:
1016         return AArch64::UCVTFUWSri;
1017       case TargetOpcode::G_FPTOSI:
1018         return AArch64::FCVTZSUWSr;
1019       case TargetOpcode::G_FPTOUI:
1020         return AArch64::FCVTZUUWSr;
1021       default:
1022         return GenericOpc;
1023       }
1024     case 64:
1025       switch (GenericOpc) {
1026       case TargetOpcode::G_SITOFP:
1027         return AArch64::SCVTFUXSri;
1028       case TargetOpcode::G_UITOFP:
1029         return AArch64::UCVTFUXSri;
1030       case TargetOpcode::G_FPTOSI:
1031         return AArch64::FCVTZSUWDr;
1032       case TargetOpcode::G_FPTOUI:
1033         return AArch64::FCVTZUUWDr;
1034       default:
1035         return GenericOpc;
1036       }
1037     default:
1038       return GenericOpc;
1039     }
1040   case 64:
1041     switch (SrcSize) {
1042     case 32:
1043       switch (GenericOpc) {
1044       case TargetOpcode::G_SITOFP:
1045         return AArch64::SCVTFUWDri;
1046       case TargetOpcode::G_UITOFP:
1047         return AArch64::UCVTFUWDri;
1048       case TargetOpcode::G_FPTOSI:
1049         return AArch64::FCVTZSUXSr;
1050       case TargetOpcode::G_FPTOUI:
1051         return AArch64::FCVTZUUXSr;
1052       default:
1053         return GenericOpc;
1054       }
1055     case 64:
1056       switch (GenericOpc) {
1057       case TargetOpcode::G_SITOFP:
1058         return AArch64::SCVTFUXDri;
1059       case TargetOpcode::G_UITOFP:
1060         return AArch64::UCVTFUXDri;
1061       case TargetOpcode::G_FPTOSI:
1062         return AArch64::FCVTZSUXDr;
1063       case TargetOpcode::G_FPTOUI:
1064         return AArch64::FCVTZUUXDr;
1065       default:
1066         return GenericOpc;
1067       }
1068     default:
1069       return GenericOpc;
1070     }
1071   default:
1072     return GenericOpc;
1073   };
1074   return GenericOpc;
1075 }
1076 
1077 MachineInstr *
1078 AArch64InstructionSelector::emitSelect(Register Dst, Register True,
1079                                        Register False, AArch64CC::CondCode CC,
1080                                        MachineIRBuilder &MIB) const {
1081   MachineRegisterInfo &MRI = *MIB.getMRI();
1082   assert(RBI.getRegBank(False, MRI, TRI)->getID() ==
1083              RBI.getRegBank(True, MRI, TRI)->getID() &&
1084          "Expected both select operands to have the same regbank?");
1085   LLT Ty = MRI.getType(True);
1086   if (Ty.isVector())
1087     return nullptr;
1088   const unsigned Size = Ty.getSizeInBits();
1089   assert((Size == 32 || Size == 64) &&
1090          "Expected 32 bit or 64 bit select only?");
1091   const bool Is32Bit = Size == 32;
1092   if (RBI.getRegBank(True, MRI, TRI)->getID() != AArch64::GPRRegBankID) {
1093     unsigned Opc = Is32Bit ? AArch64::FCSELSrrr : AArch64::FCSELDrrr;
1094     auto FCSel = MIB.buildInstr(Opc, {Dst}, {True, False}).addImm(CC);
1095     constrainSelectedInstRegOperands(*FCSel, TII, TRI, RBI);
1096     return &*FCSel;
1097   }
1098 
1099   // By default, we'll try and emit a CSEL.
1100   unsigned Opc = Is32Bit ? AArch64::CSELWr : AArch64::CSELXr;
1101   bool Optimized = false;
1102   auto TryFoldBinOpIntoSelect = [&Opc, Is32Bit, &CC, &MRI,
1103                                  &Optimized](Register &Reg, Register &OtherReg,
1104                                              bool Invert) {
1105     if (Optimized)
1106       return false;
1107 
1108     // Attempt to fold:
1109     //
1110     // %sub = G_SUB 0, %x
1111     // %select = G_SELECT cc, %reg, %sub
1112     //
1113     // Into:
1114     // %select = CSNEG %reg, %x, cc
1115     Register MatchReg;
1116     if (mi_match(Reg, MRI, m_Neg(m_Reg(MatchReg)))) {
1117       Opc = Is32Bit ? AArch64::CSNEGWr : AArch64::CSNEGXr;
1118       Reg = MatchReg;
1119       if (Invert) {
1120         CC = AArch64CC::getInvertedCondCode(CC);
1121         std::swap(Reg, OtherReg);
1122       }
1123       return true;
1124     }
1125 
1126     // Attempt to fold:
1127     //
1128     // %xor = G_XOR %x, -1
1129     // %select = G_SELECT cc, %reg, %xor
1130     //
1131     // Into:
1132     // %select = CSINV %reg, %x, cc
1133     if (mi_match(Reg, MRI, m_Not(m_Reg(MatchReg)))) {
1134       Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
1135       Reg = MatchReg;
1136       if (Invert) {
1137         CC = AArch64CC::getInvertedCondCode(CC);
1138         std::swap(Reg, OtherReg);
1139       }
1140       return true;
1141     }
1142 
1143     // Attempt to fold:
1144     //
1145     // %add = G_ADD %x, 1
1146     // %select = G_SELECT cc, %reg, %add
1147     //
1148     // Into:
1149     // %select = CSINC %reg, %x, cc
1150     if (mi_match(Reg, MRI,
1151                  m_any_of(m_GAdd(m_Reg(MatchReg), m_SpecificICst(1)),
1152                           m_GPtrAdd(m_Reg(MatchReg), m_SpecificICst(1))))) {
1153       Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
1154       Reg = MatchReg;
1155       if (Invert) {
1156         CC = AArch64CC::getInvertedCondCode(CC);
1157         std::swap(Reg, OtherReg);
1158       }
1159       return true;
1160     }
1161 
1162     return false;
1163   };
1164 
1165   // Helper lambda which tries to use CSINC/CSINV for the instruction when its
1166   // true/false values are constants.
1167   // FIXME: All of these patterns already exist in tablegen. We should be
1168   // able to import these.
1169   auto TryOptSelectCst = [&Opc, &True, &False, &CC, Is32Bit, &MRI,
1170                           &Optimized]() {
1171     if (Optimized)
1172       return false;
1173     auto TrueCst = getIConstantVRegValWithLookThrough(True, MRI);
1174     auto FalseCst = getIConstantVRegValWithLookThrough(False, MRI);
1175     if (!TrueCst && !FalseCst)
1176       return false;
1177 
1178     Register ZReg = Is32Bit ? AArch64::WZR : AArch64::XZR;
1179     if (TrueCst && FalseCst) {
1180       int64_t T = TrueCst->Value.getSExtValue();
1181       int64_t F = FalseCst->Value.getSExtValue();
1182 
1183       if (T == 0 && F == 1) {
1184         // G_SELECT cc, 0, 1 -> CSINC zreg, zreg, cc
1185         Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
1186         True = ZReg;
1187         False = ZReg;
1188         return true;
1189       }
1190 
1191       if (T == 0 && F == -1) {
1192         // G_SELECT cc 0, -1 -> CSINV zreg, zreg cc
1193         Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
1194         True = ZReg;
1195         False = ZReg;
1196         return true;
1197       }
1198     }
1199 
1200     if (TrueCst) {
1201       int64_t T = TrueCst->Value.getSExtValue();
1202       if (T == 1) {
1203         // G_SELECT cc, 1, f -> CSINC f, zreg, inv_cc
1204         Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
1205         True = False;
1206         False = ZReg;
1207         CC = AArch64CC::getInvertedCondCode(CC);
1208         return true;
1209       }
1210 
1211       if (T == -1) {
1212         // G_SELECT cc, -1, f -> CSINV f, zreg, inv_cc
1213         Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
1214         True = False;
1215         False = ZReg;
1216         CC = AArch64CC::getInvertedCondCode(CC);
1217         return true;
1218       }
1219     }
1220 
1221     if (FalseCst) {
1222       int64_t F = FalseCst->Value.getSExtValue();
1223       if (F == 1) {
1224         // G_SELECT cc, t, 1 -> CSINC t, zreg, cc
1225         Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
1226         False = ZReg;
1227         return true;
1228       }
1229 
1230       if (F == -1) {
1231         // G_SELECT cc, t, -1 -> CSINC t, zreg, cc
1232         Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
1233         False = ZReg;
1234         return true;
1235       }
1236     }
1237     return false;
1238   };
1239 
1240   Optimized |= TryFoldBinOpIntoSelect(False, True, /*Invert = */ false);
1241   Optimized |= TryFoldBinOpIntoSelect(True, False, /*Invert = */ true);
1242   Optimized |= TryOptSelectCst();
1243   auto SelectInst = MIB.buildInstr(Opc, {Dst}, {True, False}).addImm(CC);
1244   constrainSelectedInstRegOperands(*SelectInst, TII, TRI, RBI);
1245   return &*SelectInst;
1246 }
1247 
1248 static AArch64CC::CondCode changeICMPPredToAArch64CC(CmpInst::Predicate P) {
1249   switch (P) {
1250   default:
1251     llvm_unreachable("Unknown condition code!");
1252   case CmpInst::ICMP_NE:
1253     return AArch64CC::NE;
1254   case CmpInst::ICMP_EQ:
1255     return AArch64CC::EQ;
1256   case CmpInst::ICMP_SGT:
1257     return AArch64CC::GT;
1258   case CmpInst::ICMP_SGE:
1259     return AArch64CC::GE;
1260   case CmpInst::ICMP_SLT:
1261     return AArch64CC::LT;
1262   case CmpInst::ICMP_SLE:
1263     return AArch64CC::LE;
1264   case CmpInst::ICMP_UGT:
1265     return AArch64CC::HI;
1266   case CmpInst::ICMP_UGE:
1267     return AArch64CC::HS;
1268   case CmpInst::ICMP_ULT:
1269     return AArch64CC::LO;
1270   case CmpInst::ICMP_ULE:
1271     return AArch64CC::LS;
1272   }
1273 }
1274 
1275 /// changeFPCCToORAArch64CC - Convert an IR fp condition code to an AArch64 CC.
1276 static void changeFPCCToORAArch64CC(CmpInst::Predicate CC,
1277                                     AArch64CC::CondCode &CondCode,
1278                                     AArch64CC::CondCode &CondCode2) {
1279   CondCode2 = AArch64CC::AL;
1280   switch (CC) {
1281   default:
1282     llvm_unreachable("Unknown FP condition!");
1283   case CmpInst::FCMP_OEQ:
1284     CondCode = AArch64CC::EQ;
1285     break;
1286   case CmpInst::FCMP_OGT:
1287     CondCode = AArch64CC::GT;
1288     break;
1289   case CmpInst::FCMP_OGE:
1290     CondCode = AArch64CC::GE;
1291     break;
1292   case CmpInst::FCMP_OLT:
1293     CondCode = AArch64CC::MI;
1294     break;
1295   case CmpInst::FCMP_OLE:
1296     CondCode = AArch64CC::LS;
1297     break;
1298   case CmpInst::FCMP_ONE:
1299     CondCode = AArch64CC::MI;
1300     CondCode2 = AArch64CC::GT;
1301     break;
1302   case CmpInst::FCMP_ORD:
1303     CondCode = AArch64CC::VC;
1304     break;
1305   case CmpInst::FCMP_UNO:
1306     CondCode = AArch64CC::VS;
1307     break;
1308   case CmpInst::FCMP_UEQ:
1309     CondCode = AArch64CC::EQ;
1310     CondCode2 = AArch64CC::VS;
1311     break;
1312   case CmpInst::FCMP_UGT:
1313     CondCode = AArch64CC::HI;
1314     break;
1315   case CmpInst::FCMP_UGE:
1316     CondCode = AArch64CC::PL;
1317     break;
1318   case CmpInst::FCMP_ULT:
1319     CondCode = AArch64CC::LT;
1320     break;
1321   case CmpInst::FCMP_ULE:
1322     CondCode = AArch64CC::LE;
1323     break;
1324   case CmpInst::FCMP_UNE:
1325     CondCode = AArch64CC::NE;
1326     break;
1327   }
1328 }
1329 
1330 /// Convert an IR fp condition code to an AArch64 CC.
1331 /// This differs from changeFPCCToAArch64CC in that it returns cond codes that
1332 /// should be AND'ed instead of OR'ed.
1333 static void changeFPCCToANDAArch64CC(CmpInst::Predicate CC,
1334                                      AArch64CC::CondCode &CondCode,
1335                                      AArch64CC::CondCode &CondCode2) {
1336   CondCode2 = AArch64CC::AL;
1337   switch (CC) {
1338   default:
1339     changeFPCCToORAArch64CC(CC, CondCode, CondCode2);
1340     assert(CondCode2 == AArch64CC::AL);
1341     break;
1342   case CmpInst::FCMP_ONE:
1343     // (a one b)
1344     // == ((a olt b) || (a ogt b))
1345     // == ((a ord b) && (a une b))
1346     CondCode = AArch64CC::VC;
1347     CondCode2 = AArch64CC::NE;
1348     break;
1349   case CmpInst::FCMP_UEQ:
1350     // (a ueq b)
1351     // == ((a uno b) || (a oeq b))
1352     // == ((a ule b) && (a uge b))
1353     CondCode = AArch64CC::PL;
1354     CondCode2 = AArch64CC::LE;
1355     break;
1356   }
1357 }
1358 
1359 /// Return a register which can be used as a bit to test in a TB(N)Z.
1360 static Register getTestBitReg(Register Reg, uint64_t &Bit, bool &Invert,
1361                               MachineRegisterInfo &MRI) {
1362   assert(Reg.isValid() && "Expected valid register!");
1363   bool HasZext = false;
1364   while (MachineInstr *MI = getDefIgnoringCopies(Reg, MRI)) {
1365     unsigned Opc = MI->getOpcode();
1366 
1367     if (!MI->getOperand(0).isReg() ||
1368         !MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
1369       break;
1370 
1371     // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits.
1372     //
1373     // (tbz (trunc x), b) -> (tbz x, b) is always safe, because the bit number
1374     // on the truncated x is the same as the bit number on x.
1375     if (Opc == TargetOpcode::G_ANYEXT || Opc == TargetOpcode::G_ZEXT ||
1376         Opc == TargetOpcode::G_TRUNC) {
1377       if (Opc == TargetOpcode::G_ZEXT)
1378         HasZext = true;
1379 
1380       Register NextReg = MI->getOperand(1).getReg();
1381       // Did we find something worth folding?
1382       if (!NextReg.isValid() || !MRI.hasOneNonDBGUse(NextReg))
1383         break;
1384 
1385       // NextReg is worth folding. Keep looking.
1386       Reg = NextReg;
1387       continue;
1388     }
1389 
1390     // Attempt to find a suitable operation with a constant on one side.
1391     Optional<uint64_t> C;
1392     Register TestReg;
1393     switch (Opc) {
1394     default:
1395       break;
1396     case TargetOpcode::G_AND:
1397     case TargetOpcode::G_XOR: {
1398       TestReg = MI->getOperand(1).getReg();
1399       Register ConstantReg = MI->getOperand(2).getReg();
1400       auto VRegAndVal = getIConstantVRegValWithLookThrough(ConstantReg, MRI);
1401       if (!VRegAndVal) {
1402         // AND commutes, check the other side for a constant.
1403         // FIXME: Can we canonicalize the constant so that it's always on the
1404         // same side at some point earlier?
1405         std::swap(ConstantReg, TestReg);
1406         VRegAndVal = getIConstantVRegValWithLookThrough(ConstantReg, MRI);
1407       }
1408       if (VRegAndVal) {
1409         if (HasZext)
1410           C = VRegAndVal->Value.getZExtValue();
1411         else
1412           C = VRegAndVal->Value.getSExtValue();
1413       }
1414       break;
1415     }
1416     case TargetOpcode::G_ASHR:
1417     case TargetOpcode::G_LSHR:
1418     case TargetOpcode::G_SHL: {
1419       TestReg = MI->getOperand(1).getReg();
1420       auto VRegAndVal =
1421           getIConstantVRegValWithLookThrough(MI->getOperand(2).getReg(), MRI);
1422       if (VRegAndVal)
1423         C = VRegAndVal->Value.getSExtValue();
1424       break;
1425     }
1426     }
1427 
1428     // Didn't find a constant or viable register. Bail out of the loop.
1429     if (!C || !TestReg.isValid())
1430       break;
1431 
1432     // We found a suitable instruction with a constant. Check to see if we can
1433     // walk through the instruction.
1434     Register NextReg;
1435     unsigned TestRegSize = MRI.getType(TestReg).getSizeInBits();
1436     switch (Opc) {
1437     default:
1438       break;
1439     case TargetOpcode::G_AND:
1440       // (tbz (and x, m), b) -> (tbz x, b) when the b-th bit of m is set.
1441       if ((*C >> Bit) & 1)
1442         NextReg = TestReg;
1443       break;
1444     case TargetOpcode::G_SHL:
1445       // (tbz (shl x, c), b) -> (tbz x, b-c) when b-c is positive and fits in
1446       // the type of the register.
1447       if (*C <= Bit && (Bit - *C) < TestRegSize) {
1448         NextReg = TestReg;
1449         Bit = Bit - *C;
1450       }
1451       break;
1452     case TargetOpcode::G_ASHR:
1453       // (tbz (ashr x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits
1454       // in x
1455       NextReg = TestReg;
1456       Bit = Bit + *C;
1457       if (Bit >= TestRegSize)
1458         Bit = TestRegSize - 1;
1459       break;
1460     case TargetOpcode::G_LSHR:
1461       // (tbz (lshr x, c), b) -> (tbz x, b+c) when b + c is < # bits in x
1462       if ((Bit + *C) < TestRegSize) {
1463         NextReg = TestReg;
1464         Bit = Bit + *C;
1465       }
1466       break;
1467     case TargetOpcode::G_XOR:
1468       // We can walk through a G_XOR by inverting whether we use tbz/tbnz when
1469       // appropriate.
1470       //
1471       // e.g. If x' = xor x, c, and the b-th bit is set in c then
1472       //
1473       // tbz x', b -> tbnz x, b
1474       //
1475       // Because x' only has the b-th bit set if x does not.
1476       if ((*C >> Bit) & 1)
1477         Invert = !Invert;
1478       NextReg = TestReg;
1479       break;
1480     }
1481 
1482     // Check if we found anything worth folding.
1483     if (!NextReg.isValid())
1484       return Reg;
1485     Reg = NextReg;
1486   }
1487 
1488   return Reg;
1489 }
1490 
1491 MachineInstr *AArch64InstructionSelector::emitTestBit(
1492     Register TestReg, uint64_t Bit, bool IsNegative, MachineBasicBlock *DstMBB,
1493     MachineIRBuilder &MIB) const {
1494   assert(TestReg.isValid());
1495   assert(ProduceNonFlagSettingCondBr &&
1496          "Cannot emit TB(N)Z with speculation tracking!");
1497   MachineRegisterInfo &MRI = *MIB.getMRI();
1498 
1499   // Attempt to optimize the test bit by walking over instructions.
1500   TestReg = getTestBitReg(TestReg, Bit, IsNegative, MRI);
1501   LLT Ty = MRI.getType(TestReg);
1502   unsigned Size = Ty.getSizeInBits();
1503   assert(!Ty.isVector() && "Expected a scalar!");
1504   assert(Bit < 64 && "Bit is too large!");
1505 
1506   // When the test register is a 64-bit register, we have to narrow to make
1507   // TBNZW work.
1508   bool UseWReg = Bit < 32;
1509   unsigned NecessarySize = UseWReg ? 32 : 64;
1510   if (Size != NecessarySize)
1511     TestReg = moveScalarRegClass(
1512         TestReg, UseWReg ? AArch64::GPR32RegClass : AArch64::GPR64RegClass,
1513         MIB);
1514 
1515   static const unsigned OpcTable[2][2] = {{AArch64::TBZX, AArch64::TBNZX},
1516                                           {AArch64::TBZW, AArch64::TBNZW}};
1517   unsigned Opc = OpcTable[UseWReg][IsNegative];
1518   auto TestBitMI =
1519       MIB.buildInstr(Opc).addReg(TestReg).addImm(Bit).addMBB(DstMBB);
1520   constrainSelectedInstRegOperands(*TestBitMI, TII, TRI, RBI);
1521   return &*TestBitMI;
1522 }
1523 
1524 bool AArch64InstructionSelector::tryOptAndIntoCompareBranch(
1525     MachineInstr &AndInst, bool Invert, MachineBasicBlock *DstMBB,
1526     MachineIRBuilder &MIB) const {
1527   assert(AndInst.getOpcode() == TargetOpcode::G_AND && "Expected G_AND only?");
1528   // Given something like this:
1529   //
1530   //  %x = ...Something...
1531   //  %one = G_CONSTANT i64 1
1532   //  %zero = G_CONSTANT i64 0
1533   //  %and = G_AND %x, %one
1534   //  %cmp = G_ICMP intpred(ne), %and, %zero
1535   //  %cmp_trunc = G_TRUNC %cmp
1536   //  G_BRCOND %cmp_trunc, %bb.3
1537   //
1538   // We want to try and fold the AND into the G_BRCOND and produce either a
1539   // TBNZ (when we have intpred(ne)) or a TBZ (when we have intpred(eq)).
1540   //
1541   // In this case, we'd get
1542   //
1543   // TBNZ %x %bb.3
1544   //
1545 
1546   // Check if the AND has a constant on its RHS which we can use as a mask.
1547   // If it's a power of 2, then it's the same as checking a specific bit.
1548   // (e.g, ANDing with 8 == ANDing with 000...100 == testing if bit 3 is set)
1549   auto MaybeBit = getIConstantVRegValWithLookThrough(
1550       AndInst.getOperand(2).getReg(), *MIB.getMRI());
1551   if (!MaybeBit)
1552     return false;
1553 
1554   int32_t Bit = MaybeBit->Value.exactLogBase2();
1555   if (Bit < 0)
1556     return false;
1557 
1558   Register TestReg = AndInst.getOperand(1).getReg();
1559 
1560   // Emit a TB(N)Z.
1561   emitTestBit(TestReg, Bit, Invert, DstMBB, MIB);
1562   return true;
1563 }
1564 
1565 MachineInstr *AArch64InstructionSelector::emitCBZ(Register CompareReg,
1566                                                   bool IsNegative,
1567                                                   MachineBasicBlock *DestMBB,
1568                                                   MachineIRBuilder &MIB) const {
1569   assert(ProduceNonFlagSettingCondBr && "CBZ does not set flags!");
1570   MachineRegisterInfo &MRI = *MIB.getMRI();
1571   assert(RBI.getRegBank(CompareReg, MRI, TRI)->getID() ==
1572              AArch64::GPRRegBankID &&
1573          "Expected GPRs only?");
1574   auto Ty = MRI.getType(CompareReg);
1575   unsigned Width = Ty.getSizeInBits();
1576   assert(!Ty.isVector() && "Expected scalar only?");
1577   assert(Width <= 64 && "Expected width to be at most 64?");
1578   static const unsigned OpcTable[2][2] = {{AArch64::CBZW, AArch64::CBZX},
1579                                           {AArch64::CBNZW, AArch64::CBNZX}};
1580   unsigned Opc = OpcTable[IsNegative][Width == 64];
1581   auto BranchMI = MIB.buildInstr(Opc, {}, {CompareReg}).addMBB(DestMBB);
1582   constrainSelectedInstRegOperands(*BranchMI, TII, TRI, RBI);
1583   return &*BranchMI;
1584 }
1585 
1586 bool AArch64InstructionSelector::selectCompareBranchFedByFCmp(
1587     MachineInstr &I, MachineInstr &FCmp, MachineIRBuilder &MIB) const {
1588   assert(FCmp.getOpcode() == TargetOpcode::G_FCMP);
1589   assert(I.getOpcode() == TargetOpcode::G_BRCOND);
1590   // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
1591   // totally clean.  Some of them require two branches to implement.
1592   auto Pred = (CmpInst::Predicate)FCmp.getOperand(1).getPredicate();
1593   emitFPCompare(FCmp.getOperand(2).getReg(), FCmp.getOperand(3).getReg(), MIB,
1594                 Pred);
1595   AArch64CC::CondCode CC1, CC2;
1596   changeFCMPPredToAArch64CC(static_cast<CmpInst::Predicate>(Pred), CC1, CC2);
1597   MachineBasicBlock *DestMBB = I.getOperand(1).getMBB();
1598   MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC1).addMBB(DestMBB);
1599   if (CC2 != AArch64CC::AL)
1600     MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC2).addMBB(DestMBB);
1601   I.eraseFromParent();
1602   return true;
1603 }
1604 
1605 bool AArch64InstructionSelector::tryOptCompareBranchFedByICmp(
1606     MachineInstr &I, MachineInstr &ICmp, MachineIRBuilder &MIB) const {
1607   assert(ICmp.getOpcode() == TargetOpcode::G_ICMP);
1608   assert(I.getOpcode() == TargetOpcode::G_BRCOND);
1609   // Attempt to optimize the G_BRCOND + G_ICMP into a TB(N)Z/CB(N)Z.
1610   //
1611   // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z
1612   // instructions will not be produced, as they are conditional branch
1613   // instructions that do not set flags.
1614   if (!ProduceNonFlagSettingCondBr)
1615     return false;
1616 
1617   MachineRegisterInfo &MRI = *MIB.getMRI();
1618   MachineBasicBlock *DestMBB = I.getOperand(1).getMBB();
1619   auto Pred =
1620       static_cast<CmpInst::Predicate>(ICmp.getOperand(1).getPredicate());
1621   Register LHS = ICmp.getOperand(2).getReg();
1622   Register RHS = ICmp.getOperand(3).getReg();
1623 
1624   // We're allowed to emit a TB(N)Z/CB(N)Z. Try to do that.
1625   auto VRegAndVal = getIConstantVRegValWithLookThrough(RHS, MRI);
1626   MachineInstr *AndInst = getOpcodeDef(TargetOpcode::G_AND, LHS, MRI);
1627 
1628   // When we can emit a TB(N)Z, prefer that.
1629   //
1630   // Handle non-commutative condition codes first.
1631   // Note that we don't want to do this when we have a G_AND because it can
1632   // become a tst. The tst will make the test bit in the TB(N)Z redundant.
1633   if (VRegAndVal && !AndInst) {
1634     int64_t C = VRegAndVal->Value.getSExtValue();
1635 
1636     // When we have a greater-than comparison, we can just test if the msb is
1637     // zero.
1638     if (C == -1 && Pred == CmpInst::ICMP_SGT) {
1639       uint64_t Bit = MRI.getType(LHS).getSizeInBits() - 1;
1640       emitTestBit(LHS, Bit, /*IsNegative = */ false, DestMBB, MIB);
1641       I.eraseFromParent();
1642       return true;
1643     }
1644 
1645     // When we have a less than comparison, we can just test if the msb is not
1646     // zero.
1647     if (C == 0 && Pred == CmpInst::ICMP_SLT) {
1648       uint64_t Bit = MRI.getType(LHS).getSizeInBits() - 1;
1649       emitTestBit(LHS, Bit, /*IsNegative = */ true, DestMBB, MIB);
1650       I.eraseFromParent();
1651       return true;
1652     }
1653   }
1654 
1655   // Attempt to handle commutative condition codes. Right now, that's only
1656   // eq/ne.
1657   if (ICmpInst::isEquality(Pred)) {
1658     if (!VRegAndVal) {
1659       std::swap(RHS, LHS);
1660       VRegAndVal = getIConstantVRegValWithLookThrough(RHS, MRI);
1661       AndInst = getOpcodeDef(TargetOpcode::G_AND, LHS, MRI);
1662     }
1663 
1664     if (VRegAndVal && VRegAndVal->Value == 0) {
1665       // If there's a G_AND feeding into this branch, try to fold it away by
1666       // emitting a TB(N)Z instead.
1667       //
1668       // Note: If we have LT, then it *is* possible to fold, but it wouldn't be
1669       // beneficial. When we have an AND and LT, we need a TST/ANDS, so folding
1670       // would be redundant.
1671       if (AndInst &&
1672           tryOptAndIntoCompareBranch(
1673               *AndInst, /*Invert = */ Pred == CmpInst::ICMP_NE, DestMBB, MIB)) {
1674         I.eraseFromParent();
1675         return true;
1676       }
1677 
1678       // Otherwise, try to emit a CB(N)Z instead.
1679       auto LHSTy = MRI.getType(LHS);
1680       if (!LHSTy.isVector() && LHSTy.getSizeInBits() <= 64) {
1681         emitCBZ(LHS, /*IsNegative = */ Pred == CmpInst::ICMP_NE, DestMBB, MIB);
1682         I.eraseFromParent();
1683         return true;
1684       }
1685     }
1686   }
1687 
1688   return false;
1689 }
1690 
1691 bool AArch64InstructionSelector::selectCompareBranchFedByICmp(
1692     MachineInstr &I, MachineInstr &ICmp, MachineIRBuilder &MIB) const {
1693   assert(ICmp.getOpcode() == TargetOpcode::G_ICMP);
1694   assert(I.getOpcode() == TargetOpcode::G_BRCOND);
1695   if (tryOptCompareBranchFedByICmp(I, ICmp, MIB))
1696     return true;
1697 
1698   // Couldn't optimize. Emit a compare + a Bcc.
1699   MachineBasicBlock *DestMBB = I.getOperand(1).getMBB();
1700   auto PredOp = ICmp.getOperand(1);
1701   emitIntegerCompare(ICmp.getOperand(2), ICmp.getOperand(3), PredOp, MIB);
1702   const AArch64CC::CondCode CC = changeICMPPredToAArch64CC(
1703       static_cast<CmpInst::Predicate>(PredOp.getPredicate()));
1704   MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC).addMBB(DestMBB);
1705   I.eraseFromParent();
1706   return true;
1707 }
1708 
1709 bool AArch64InstructionSelector::selectCompareBranch(
1710     MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) {
1711   Register CondReg = I.getOperand(0).getReg();
1712   MachineInstr *CCMI = MRI.getVRegDef(CondReg);
1713   // Try to select the G_BRCOND using whatever is feeding the condition if
1714   // possible.
1715   unsigned CCMIOpc = CCMI->getOpcode();
1716   if (CCMIOpc == TargetOpcode::G_FCMP)
1717     return selectCompareBranchFedByFCmp(I, *CCMI, MIB);
1718   if (CCMIOpc == TargetOpcode::G_ICMP)
1719     return selectCompareBranchFedByICmp(I, *CCMI, MIB);
1720 
1721   // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z
1722   // instructions will not be produced, as they are conditional branch
1723   // instructions that do not set flags.
1724   if (ProduceNonFlagSettingCondBr) {
1725     emitTestBit(CondReg, /*Bit = */ 0, /*IsNegative = */ true,
1726                 I.getOperand(1).getMBB(), MIB);
1727     I.eraseFromParent();
1728     return true;
1729   }
1730 
1731   // Can't emit TB(N)Z/CB(N)Z. Emit a tst + bcc instead.
1732   auto TstMI =
1733       MIB.buildInstr(AArch64::ANDSWri, {LLT::scalar(32)}, {CondReg}).addImm(1);
1734   constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI);
1735   auto Bcc = MIB.buildInstr(AArch64::Bcc)
1736                  .addImm(AArch64CC::EQ)
1737                  .addMBB(I.getOperand(1).getMBB());
1738   I.eraseFromParent();
1739   return constrainSelectedInstRegOperands(*Bcc, TII, TRI, RBI);
1740 }
1741 
1742 /// Returns the element immediate value of a vector shift operand if found.
1743 /// This needs to detect a splat-like operation, e.g. a G_BUILD_VECTOR.
1744 static Optional<int64_t> getVectorShiftImm(Register Reg,
1745                                            MachineRegisterInfo &MRI) {
1746   assert(MRI.getType(Reg).isVector() && "Expected a *vector* shift operand");
1747   MachineInstr *OpMI = MRI.getVRegDef(Reg);
1748   return getAArch64VectorSplatScalar(*OpMI, MRI);
1749 }
1750 
1751 /// Matches and returns the shift immediate value for a SHL instruction given
1752 /// a shift operand.
1753 static Optional<int64_t> getVectorSHLImm(LLT SrcTy, Register Reg, MachineRegisterInfo &MRI) {
1754   Optional<int64_t> ShiftImm = getVectorShiftImm(Reg, MRI);
1755   if (!ShiftImm)
1756     return None;
1757   // Check the immediate is in range for a SHL.
1758   int64_t Imm = *ShiftImm;
1759   if (Imm < 0)
1760     return None;
1761   switch (SrcTy.getElementType().getSizeInBits()) {
1762   default:
1763     LLVM_DEBUG(dbgs() << "Unhandled element type for vector shift");
1764     return None;
1765   case 8:
1766     if (Imm > 7)
1767       return None;
1768     break;
1769   case 16:
1770     if (Imm > 15)
1771       return None;
1772     break;
1773   case 32:
1774     if (Imm > 31)
1775       return None;
1776     break;
1777   case 64:
1778     if (Imm > 63)
1779       return None;
1780     break;
1781   }
1782   return Imm;
1783 }
1784 
1785 bool AArch64InstructionSelector::selectVectorSHL(MachineInstr &I,
1786                                                  MachineRegisterInfo &MRI) {
1787   assert(I.getOpcode() == TargetOpcode::G_SHL);
1788   Register DstReg = I.getOperand(0).getReg();
1789   const LLT Ty = MRI.getType(DstReg);
1790   Register Src1Reg = I.getOperand(1).getReg();
1791   Register Src2Reg = I.getOperand(2).getReg();
1792 
1793   if (!Ty.isVector())
1794     return false;
1795 
1796   // Check if we have a vector of constants on RHS that we can select as the
1797   // immediate form.
1798   Optional<int64_t> ImmVal = getVectorSHLImm(Ty, Src2Reg, MRI);
1799 
1800   unsigned Opc = 0;
1801   if (Ty == LLT::fixed_vector(2, 64)) {
1802     Opc = ImmVal ? AArch64::SHLv2i64_shift : AArch64::USHLv2i64;
1803   } else if (Ty == LLT::fixed_vector(4, 32)) {
1804     Opc = ImmVal ? AArch64::SHLv4i32_shift : AArch64::USHLv4i32;
1805   } else if (Ty == LLT::fixed_vector(2, 32)) {
1806     Opc = ImmVal ? AArch64::SHLv2i32_shift : AArch64::USHLv2i32;
1807   } else if (Ty == LLT::fixed_vector(4, 16)) {
1808     Opc = ImmVal ? AArch64::SHLv4i16_shift : AArch64::USHLv4i16;
1809   } else if (Ty == LLT::fixed_vector(8, 16)) {
1810     Opc = ImmVal ? AArch64::SHLv8i16_shift : AArch64::USHLv8i16;
1811   } else if (Ty == LLT::fixed_vector(16, 8)) {
1812     Opc = ImmVal ? AArch64::SHLv16i8_shift : AArch64::USHLv16i8;
1813   } else if (Ty == LLT::fixed_vector(8, 8)) {
1814     Opc = ImmVal ? AArch64::SHLv8i8_shift : AArch64::USHLv8i8;
1815   } else {
1816     LLVM_DEBUG(dbgs() << "Unhandled G_SHL type");
1817     return false;
1818   }
1819 
1820   auto Shl = MIB.buildInstr(Opc, {DstReg}, {Src1Reg});
1821   if (ImmVal)
1822     Shl.addImm(*ImmVal);
1823   else
1824     Shl.addUse(Src2Reg);
1825   constrainSelectedInstRegOperands(*Shl, TII, TRI, RBI);
1826   I.eraseFromParent();
1827   return true;
1828 }
1829 
1830 bool AArch64InstructionSelector::selectVectorAshrLshr(
1831     MachineInstr &I, MachineRegisterInfo &MRI) {
1832   assert(I.getOpcode() == TargetOpcode::G_ASHR ||
1833          I.getOpcode() == TargetOpcode::G_LSHR);
1834   Register DstReg = I.getOperand(0).getReg();
1835   const LLT Ty = MRI.getType(DstReg);
1836   Register Src1Reg = I.getOperand(1).getReg();
1837   Register Src2Reg = I.getOperand(2).getReg();
1838 
1839   if (!Ty.isVector())
1840     return false;
1841 
1842   bool IsASHR = I.getOpcode() == TargetOpcode::G_ASHR;
1843 
1844   // We expect the immediate case to be lowered in the PostLegalCombiner to
1845   // AArch64ISD::VASHR or AArch64ISD::VLSHR equivalents.
1846 
1847   // There is not a shift right register instruction, but the shift left
1848   // register instruction takes a signed value, where negative numbers specify a
1849   // right shift.
1850 
1851   unsigned Opc = 0;
1852   unsigned NegOpc = 0;
1853   const TargetRegisterClass *RC =
1854       getRegClassForTypeOnBank(Ty, RBI.getRegBank(AArch64::FPRRegBankID));
1855   if (Ty == LLT::fixed_vector(2, 64)) {
1856     Opc = IsASHR ? AArch64::SSHLv2i64 : AArch64::USHLv2i64;
1857     NegOpc = AArch64::NEGv2i64;
1858   } else if (Ty == LLT::fixed_vector(4, 32)) {
1859     Opc = IsASHR ? AArch64::SSHLv4i32 : AArch64::USHLv4i32;
1860     NegOpc = AArch64::NEGv4i32;
1861   } else if (Ty == LLT::fixed_vector(2, 32)) {
1862     Opc = IsASHR ? AArch64::SSHLv2i32 : AArch64::USHLv2i32;
1863     NegOpc = AArch64::NEGv2i32;
1864   } else if (Ty == LLT::fixed_vector(4, 16)) {
1865     Opc = IsASHR ? AArch64::SSHLv4i16 : AArch64::USHLv4i16;
1866     NegOpc = AArch64::NEGv4i16;
1867   } else if (Ty == LLT::fixed_vector(8, 16)) {
1868     Opc = IsASHR ? AArch64::SSHLv8i16 : AArch64::USHLv8i16;
1869     NegOpc = AArch64::NEGv8i16;
1870   } else if (Ty == LLT::fixed_vector(16, 8)) {
1871     Opc = IsASHR ? AArch64::SSHLv16i8 : AArch64::USHLv16i8;
1872     NegOpc = AArch64::NEGv16i8;
1873   } else if (Ty == LLT::fixed_vector(8, 8)) {
1874     Opc = IsASHR ? AArch64::SSHLv8i8 : AArch64::USHLv8i8;
1875     NegOpc = AArch64::NEGv8i8;
1876   } else {
1877     LLVM_DEBUG(dbgs() << "Unhandled G_ASHR type");
1878     return false;
1879   }
1880 
1881   auto Neg = MIB.buildInstr(NegOpc, {RC}, {Src2Reg});
1882   constrainSelectedInstRegOperands(*Neg, TII, TRI, RBI);
1883   auto SShl = MIB.buildInstr(Opc, {DstReg}, {Src1Reg, Neg});
1884   constrainSelectedInstRegOperands(*SShl, TII, TRI, RBI);
1885   I.eraseFromParent();
1886   return true;
1887 }
1888 
/// Selection hook for va_start under AAPCS.
///
/// No selection is performed here: the function ignores all of its arguments
/// and unconditionally reports failure.
bool AArch64InstructionSelector::selectVaStartAAPCS(
    MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const {
  return false;
}
1893 
1894 bool AArch64InstructionSelector::selectVaStartDarwin(
1895     MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const {
1896   AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
1897   Register ListReg = I.getOperand(0).getReg();
1898 
1899   Register ArgsAddrReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
1900 
1901   auto MIB =
1902       BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::ADDXri))
1903           .addDef(ArgsAddrReg)
1904           .addFrameIndex(FuncInfo->getVarArgsStackIndex())
1905           .addImm(0)
1906           .addImm(0);
1907 
1908   constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1909 
1910   MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::STRXui))
1911             .addUse(ArgsAddrReg)
1912             .addUse(ListReg)
1913             .addImm(0)
1914             .addMemOperand(*I.memoperands_begin());
1915 
1916   constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1917   I.eraseFromParent();
1918   return true;
1919 }
1920 
1921 void AArch64InstructionSelector::materializeLargeCMVal(
1922     MachineInstr &I, const Value *V, unsigned OpFlags) {
1923   MachineBasicBlock &MBB = *I.getParent();
1924   MachineFunction &MF = *MBB.getParent();
1925   MachineRegisterInfo &MRI = MF.getRegInfo();
1926 
1927   auto MovZ = MIB.buildInstr(AArch64::MOVZXi, {&AArch64::GPR64RegClass}, {});
1928   MovZ->addOperand(MF, I.getOperand(1));
1929   MovZ->getOperand(1).setTargetFlags(OpFlags | AArch64II::MO_G0 |
1930                                      AArch64II::MO_NC);
1931   MovZ->addOperand(MF, MachineOperand::CreateImm(0));
1932   constrainSelectedInstRegOperands(*MovZ, TII, TRI, RBI);
1933 
1934   auto BuildMovK = [&](Register SrcReg, unsigned char Flags, unsigned Offset,
1935                        Register ForceDstReg) {
1936     Register DstReg = ForceDstReg
1937                           ? ForceDstReg
1938                           : MRI.createVirtualRegister(&AArch64::GPR64RegClass);
1939     auto MovI = MIB.buildInstr(AArch64::MOVKXi).addDef(DstReg).addUse(SrcReg);
1940     if (auto *GV = dyn_cast<GlobalValue>(V)) {
1941       MovI->addOperand(MF, MachineOperand::CreateGA(
1942                                GV, MovZ->getOperand(1).getOffset(), Flags));
1943     } else {
1944       MovI->addOperand(
1945           MF, MachineOperand::CreateBA(cast<BlockAddress>(V),
1946                                        MovZ->getOperand(1).getOffset(), Flags));
1947     }
1948     MovI->addOperand(MF, MachineOperand::CreateImm(Offset));
1949     constrainSelectedInstRegOperands(*MovI, TII, TRI, RBI);
1950     return DstReg;
1951   };
1952   Register DstReg = BuildMovK(MovZ.getReg(0),
1953                               AArch64II::MO_G1 | AArch64II::MO_NC, 16, 0);
1954   DstReg = BuildMovK(DstReg, AArch64II::MO_G2 | AArch64II::MO_NC, 32, 0);
1955   BuildMovK(DstReg, AArch64II::MO_G3, 48, I.getOperand(0).getReg());
1956 }
1957 
1958 bool AArch64InstructionSelector::preISelLower(MachineInstr &I) {
1959   MachineBasicBlock &MBB = *I.getParent();
1960   MachineFunction &MF = *MBB.getParent();
1961   MachineRegisterInfo &MRI = MF.getRegInfo();
1962 
1963   switch (I.getOpcode()) {
1964   case TargetOpcode::G_STORE: {
1965     bool Changed = contractCrossBankCopyIntoStore(I, MRI);
1966     MachineOperand &SrcOp = I.getOperand(0);
1967     if (MRI.getType(SrcOp.getReg()).isPointer()) {
1968       // Allow matching with imported patterns for stores of pointers. Unlike
1969       // G_LOAD/G_PTR_ADD, we may not have selected all users. So, emit a copy
1970       // and constrain.
1971       auto Copy = MIB.buildCopy(LLT::scalar(64), SrcOp);
1972       Register NewSrc = Copy.getReg(0);
1973       SrcOp.setReg(NewSrc);
1974       RBI.constrainGenericRegister(NewSrc, AArch64::GPR64RegClass, MRI);
1975       Changed = true;
1976     }
1977     return Changed;
1978   }
1979   case TargetOpcode::G_PTR_ADD:
1980     return convertPtrAddToAdd(I, MRI);
1981   case TargetOpcode::G_LOAD: {
1982     // For scalar loads of pointers, we try to convert the dest type from p0
1983     // to s64 so that our imported patterns can match. Like with the G_PTR_ADD
1984     // conversion, this should be ok because all users should have been
1985     // selected already, so the type doesn't matter for them.
1986     Register DstReg = I.getOperand(0).getReg();
1987     const LLT DstTy = MRI.getType(DstReg);
1988     if (!DstTy.isPointer())
1989       return false;
1990     MRI.setType(DstReg, LLT::scalar(64));
1991     return true;
1992   }
1993   case AArch64::G_DUP: {
1994     // Convert the type from p0 to s64 to help selection.
1995     LLT DstTy = MRI.getType(I.getOperand(0).getReg());
1996     if (!DstTy.getElementType().isPointer())
1997       return false;
1998     auto NewSrc = MIB.buildCopy(LLT::scalar(64), I.getOperand(1).getReg());
1999     MRI.setType(I.getOperand(0).getReg(),
2000                 DstTy.changeElementType(LLT::scalar(64)));
2001     MRI.setRegClass(NewSrc.getReg(0), &AArch64::GPR64RegClass);
2002     I.getOperand(1).setReg(NewSrc.getReg(0));
2003     return true;
2004   }
2005   case TargetOpcode::G_UITOFP:
2006   case TargetOpcode::G_SITOFP: {
2007     // If both source and destination regbanks are FPR, then convert the opcode
2008     // to G_SITOF so that the importer can select it to an fpr variant.
2009     // Otherwise, it ends up matching an fpr/gpr variant and adding a cross-bank
2010     // copy.
2011     Register SrcReg = I.getOperand(1).getReg();
2012     LLT SrcTy = MRI.getType(SrcReg);
2013     LLT DstTy = MRI.getType(I.getOperand(0).getReg());
2014     if (SrcTy.isVector() || SrcTy.getSizeInBits() != DstTy.getSizeInBits())
2015       return false;
2016 
2017     if (RBI.getRegBank(SrcReg, MRI, TRI)->getID() == AArch64::FPRRegBankID) {
2018       if (I.getOpcode() == TargetOpcode::G_SITOFP)
2019         I.setDesc(TII.get(AArch64::G_SITOF));
2020       else
2021         I.setDesc(TII.get(AArch64::G_UITOF));
2022       return true;
2023     }
2024     return false;
2025   }
2026   default:
2027     return false;
2028   }
2029 }
2030 
2031 /// This lowering tries to look for G_PTR_ADD instructions and then converts
2032 /// them to a standard G_ADD with a COPY on the source.
2033 ///
2034 /// The motivation behind this is to expose the add semantics to the imported
2035 /// tablegen patterns. We shouldn't need to check for uses being loads/stores,
2036 /// because the selector works bottom up, uses before defs. By the time we
2037 /// end up trying to select a G_PTR_ADD, we should have already attempted to
2038 /// fold this into addressing modes and were therefore unsuccessful.
2039 bool AArch64InstructionSelector::convertPtrAddToAdd(
2040     MachineInstr &I, MachineRegisterInfo &MRI) {
2041   assert(I.getOpcode() == TargetOpcode::G_PTR_ADD && "Expected G_PTR_ADD");
2042   Register DstReg = I.getOperand(0).getReg();
2043   Register AddOp1Reg = I.getOperand(1).getReg();
2044   const LLT PtrTy = MRI.getType(DstReg);
2045   if (PtrTy.getAddressSpace() != 0)
2046     return false;
2047 
2048   const LLT CastPtrTy =
2049       PtrTy.isVector() ? LLT::fixed_vector(2, 64) : LLT::scalar(64);
2050   auto PtrToInt = MIB.buildPtrToInt(CastPtrTy, AddOp1Reg);
2051   // Set regbanks on the registers.
2052   if (PtrTy.isVector())
2053     MRI.setRegBank(PtrToInt.getReg(0), RBI.getRegBank(AArch64::FPRRegBankID));
2054   else
2055     MRI.setRegBank(PtrToInt.getReg(0), RBI.getRegBank(AArch64::GPRRegBankID));
2056 
2057   // Now turn the %dst(p0) = G_PTR_ADD %base, off into:
2058   // %dst(intty) = G_ADD %intbase, off
2059   I.setDesc(TII.get(TargetOpcode::G_ADD));
2060   MRI.setType(DstReg, CastPtrTy);
2061   I.getOperand(1).setReg(PtrToInt.getReg(0));
2062   if (!select(*PtrToInt)) {
2063     LLVM_DEBUG(dbgs() << "Failed to select G_PTRTOINT in convertPtrAddToAdd");
2064     return false;
2065   }
2066 
2067   // Also take the opportunity here to try to do some optimization.
2068   // Try to convert this into a G_SUB if the offset is a 0-x negate idiom.
2069   Register NegatedReg;
2070   if (!mi_match(I.getOperand(2).getReg(), MRI, m_Neg(m_Reg(NegatedReg))))
2071     return true;
2072   I.getOperand(2).setReg(NegatedReg);
2073   I.setDesc(TII.get(TargetOpcode::G_SUB));
2074   return true;
2075 }
2076 
2077 bool AArch64InstructionSelector::earlySelectSHL(MachineInstr &I,
2078                                                 MachineRegisterInfo &MRI) {
2079   // We try to match the immediate variant of LSL, which is actually an alias
2080   // for a special case of UBFM. Otherwise, we fall back to the imported
2081   // selector which will match the register variant.
2082   assert(I.getOpcode() == TargetOpcode::G_SHL && "unexpected op");
2083   const auto &MO = I.getOperand(2);
2084   auto VRegAndVal = getIConstantVRegVal(MO.getReg(), MRI);
2085   if (!VRegAndVal)
2086     return false;
2087 
2088   const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
2089   if (DstTy.isVector())
2090     return false;
2091   bool Is64Bit = DstTy.getSizeInBits() == 64;
2092   auto Imm1Fn = Is64Bit ? selectShiftA_64(MO) : selectShiftA_32(MO);
2093   auto Imm2Fn = Is64Bit ? selectShiftB_64(MO) : selectShiftB_32(MO);
2094 
2095   if (!Imm1Fn || !Imm2Fn)
2096     return false;
2097 
2098   auto NewI =
2099       MIB.buildInstr(Is64Bit ? AArch64::UBFMXri : AArch64::UBFMWri,
2100                      {I.getOperand(0).getReg()}, {I.getOperand(1).getReg()});
2101 
2102   for (auto &RenderFn : *Imm1Fn)
2103     RenderFn(NewI);
2104   for (auto &RenderFn : *Imm2Fn)
2105     RenderFn(NewI);
2106 
2107   I.eraseFromParent();
2108   return constrainSelectedInstRegOperands(*NewI, TII, TRI, RBI);
2109 }
2110 
2111 bool AArch64InstructionSelector::contractCrossBankCopyIntoStore(
2112     MachineInstr &I, MachineRegisterInfo &MRI) {
2113   assert(I.getOpcode() == TargetOpcode::G_STORE && "Expected G_STORE");
2114   // If we're storing a scalar, it doesn't matter what register bank that
2115   // scalar is on. All that matters is the size.
2116   //
2117   // So, if we see something like this (with a 32-bit scalar as an example):
2118   //
2119   // %x:gpr(s32) = ... something ...
2120   // %y:fpr(s32) = COPY %x:gpr(s32)
2121   // G_STORE %y:fpr(s32)
2122   //
2123   // We can fix this up into something like this:
2124   //
2125   // G_STORE %x:gpr(s32)
2126   //
2127   // And then continue the selection process normally.
2128   Register DefDstReg = getSrcRegIgnoringCopies(I.getOperand(0).getReg(), MRI);
2129   if (!DefDstReg.isValid())
2130     return false;
2131   LLT DefDstTy = MRI.getType(DefDstReg);
2132   Register StoreSrcReg = I.getOperand(0).getReg();
2133   LLT StoreSrcTy = MRI.getType(StoreSrcReg);
2134 
2135   // If we get something strange like a physical register, then we shouldn't
2136   // go any further.
2137   if (!DefDstTy.isValid())
2138     return false;
2139 
2140   // Are the source and dst types the same size?
2141   if (DefDstTy.getSizeInBits() != StoreSrcTy.getSizeInBits())
2142     return false;
2143 
2144   if (RBI.getRegBank(StoreSrcReg, MRI, TRI) ==
2145       RBI.getRegBank(DefDstReg, MRI, TRI))
2146     return false;
2147 
2148   // We have a cross-bank copy, which is entering a store. Let's fold it.
2149   I.getOperand(0).setReg(DefDstReg);
2150   return true;
2151 }
2152 
2153 bool AArch64InstructionSelector::earlySelect(MachineInstr &I) {
2154   assert(I.getParent() && "Instruction should be in a basic block!");
2155   assert(I.getParent()->getParent() && "Instruction should be in a function!");
2156 
2157   MachineBasicBlock &MBB = *I.getParent();
2158   MachineFunction &MF = *MBB.getParent();
2159   MachineRegisterInfo &MRI = MF.getRegInfo();
2160 
2161   switch (I.getOpcode()) {
2162   case AArch64::G_DUP: {
2163     // Before selecting a DUP instruction, check if it is better selected as a
2164     // MOV or load from a constant pool.
2165     Register Src = I.getOperand(1).getReg();
2166     auto ValAndVReg = getIConstantVRegValWithLookThrough(Src, MRI);
2167     if (!ValAndVReg)
2168       return false;
2169     LLVMContext &Ctx = MF.getFunction().getContext();
2170     Register Dst = I.getOperand(0).getReg();
2171     auto *CV = ConstantDataVector::getSplat(
2172         MRI.getType(Dst).getNumElements(),
2173         ConstantInt::get(Type::getIntNTy(Ctx, MRI.getType(Src).getSizeInBits()),
2174                          ValAndVReg->Value));
2175     if (!emitConstantVector(Dst, CV, MIB, MRI))
2176       return false;
2177     I.eraseFromParent();
2178     return true;
2179   }
2180   case TargetOpcode::G_SEXT:
2181     // Check for i64 sext(i32 vector_extract) prior to tablegen to select SMOV
2182     // over a normal extend.
2183     if (selectUSMovFromExtend(I, MRI))
2184       return true;
2185     return false;
2186   case TargetOpcode::G_BR:
2187     return false;
2188   case TargetOpcode::G_SHL:
2189     return earlySelectSHL(I, MRI);
2190   case TargetOpcode::G_CONSTANT: {
2191     bool IsZero = false;
2192     if (I.getOperand(1).isCImm())
2193       IsZero = I.getOperand(1).getCImm()->getZExtValue() == 0;
2194     else if (I.getOperand(1).isImm())
2195       IsZero = I.getOperand(1).getImm() == 0;
2196 
2197     if (!IsZero)
2198       return false;
2199 
2200     Register DefReg = I.getOperand(0).getReg();
2201     LLT Ty = MRI.getType(DefReg);
2202     if (Ty.getSizeInBits() == 64) {
2203       I.getOperand(1).ChangeToRegister(AArch64::XZR, false);
2204       RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass, MRI);
2205     } else if (Ty.getSizeInBits() == 32) {
2206       I.getOperand(1).ChangeToRegister(AArch64::WZR, false);
2207       RBI.constrainGenericRegister(DefReg, AArch64::GPR32RegClass, MRI);
2208     } else
2209       return false;
2210 
2211     I.setDesc(TII.get(TargetOpcode::COPY));
2212     return true;
2213   }
2214 
2215   case TargetOpcode::G_ADD: {
2216     // Check if this is being fed by a G_ICMP on either side.
2217     //
2218     // (cmp pred, x, y) + z
2219     //
2220     // In the above case, when the cmp is true, we increment z by 1. So, we can
2221     // fold the add into the cset for the cmp by using cinc.
2222     //
2223     // FIXME: This would probably be a lot nicer in PostLegalizerLowering.
2224     Register AddDst = I.getOperand(0).getReg();
2225     Register AddLHS = I.getOperand(1).getReg();
2226     Register AddRHS = I.getOperand(2).getReg();
2227     // Only handle scalars.
2228     LLT Ty = MRI.getType(AddLHS);
2229     if (Ty.isVector())
2230       return false;
2231     // Since G_ICMP is modeled as ADDS/SUBS/ANDS, we can handle 32 bits or 64
2232     // bits.
2233     unsigned Size = Ty.getSizeInBits();
2234     if (Size != 32 && Size != 64)
2235       return false;
2236     auto MatchCmp = [&](Register Reg) -> MachineInstr * {
2237       if (!MRI.hasOneNonDBGUse(Reg))
2238         return nullptr;
2239       // If the LHS of the add is 32 bits, then we want to fold a 32-bit
2240       // compare.
2241       if (Size == 32)
2242         return getOpcodeDef(TargetOpcode::G_ICMP, Reg, MRI);
2243       // We model scalar compares using 32-bit destinations right now.
2244       // If it's a 64-bit compare, it'll have 64-bit sources.
2245       Register ZExt;
2246       if (!mi_match(Reg, MRI,
2247                     m_OneNonDBGUse(m_GZExt(m_OneNonDBGUse(m_Reg(ZExt))))))
2248         return nullptr;
2249       auto *Cmp = getOpcodeDef(TargetOpcode::G_ICMP, ZExt, MRI);
2250       if (!Cmp ||
2251           MRI.getType(Cmp->getOperand(2).getReg()).getSizeInBits() != 64)
2252         return nullptr;
2253       return Cmp;
2254     };
2255     // Try to match
2256     // z + (cmp pred, x, y)
2257     MachineInstr *Cmp = MatchCmp(AddRHS);
2258     if (!Cmp) {
2259       // (cmp pred, x, y) + z
2260       std::swap(AddLHS, AddRHS);
2261       Cmp = MatchCmp(AddRHS);
2262       if (!Cmp)
2263         return false;
2264     }
2265     auto &PredOp = Cmp->getOperand(1);
2266     auto Pred = static_cast<CmpInst::Predicate>(PredOp.getPredicate());
2267     const AArch64CC::CondCode InvCC =
2268         changeICMPPredToAArch64CC(CmpInst::getInversePredicate(Pred));
2269     MIB.setInstrAndDebugLoc(I);
2270     emitIntegerCompare(/*LHS=*/Cmp->getOperand(2),
2271                        /*RHS=*/Cmp->getOperand(3), PredOp, MIB);
2272     emitCSINC(/*Dst=*/AddDst, /*Src =*/AddLHS, /*Src2=*/AddLHS, InvCC, MIB);
2273     I.eraseFromParent();
2274     return true;
2275   }
2276   case TargetOpcode::G_OR: {
2277     // Look for operations that take the lower `Width=Size-ShiftImm` bits of
2278     // `ShiftSrc` and insert them into the upper `Width` bits of `MaskSrc` via
2279     // shifting and masking that we can replace with a BFI (encoded as a BFM).
2280     Register Dst = I.getOperand(0).getReg();
2281     LLT Ty = MRI.getType(Dst);
2282 
2283     if (!Ty.isScalar())
2284       return false;
2285 
2286     unsigned Size = Ty.getSizeInBits();
2287     if (Size != 32 && Size != 64)
2288       return false;
2289 
2290     Register ShiftSrc;
2291     int64_t ShiftImm;
2292     Register MaskSrc;
2293     int64_t MaskImm;
2294     if (!mi_match(
2295             Dst, MRI,
2296             m_GOr(m_OneNonDBGUse(m_GShl(m_Reg(ShiftSrc), m_ICst(ShiftImm))),
2297                   m_OneNonDBGUse(m_GAnd(m_Reg(MaskSrc), m_ICst(MaskImm))))))
2298       return false;
2299 
2300     if (ShiftImm > Size || ((1ULL << ShiftImm) - 1ULL) != uint64_t(MaskImm))
2301       return false;
2302 
2303     int64_t Immr = Size - ShiftImm;
2304     int64_t Imms = Size - ShiftImm - 1;
2305     unsigned Opc = Size == 32 ? AArch64::BFMWri : AArch64::BFMXri;
2306     emitInstr(Opc, {Dst}, {MaskSrc, ShiftSrc, Immr, Imms}, MIB);
2307     I.eraseFromParent();
2308     return true;
2309   }
2310   case TargetOpcode::G_FENCE: {
2311     if (I.getOperand(1).getImm() == 0)
2312       BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::CompilerBarrier))
2313           .addImm(I.getOperand(0).getImm());
2314     else
2315       BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::DMB))
2316           .addImm(I.getOperand(0).getImm() == 4 ? 0x9 : 0xb);
2317     I.eraseFromParent();
2318     return true;
2319   }
2320   default:
2321     return false;
2322   }
2323 }
2324 
2325 bool AArch64InstructionSelector::select(MachineInstr &I) {
2326   assert(I.getParent() && "Instruction should be in a basic block!");
2327   assert(I.getParent()->getParent() && "Instruction should be in a function!");
2328 
2329   MachineBasicBlock &MBB = *I.getParent();
2330   MachineFunction &MF = *MBB.getParent();
2331   MachineRegisterInfo &MRI = MF.getRegInfo();
2332 
2333   const AArch64Subtarget *Subtarget = &MF.getSubtarget<AArch64Subtarget>();
2334   if (Subtarget->requiresStrictAlign()) {
2335     // We don't support this feature yet.
2336     LLVM_DEBUG(dbgs() << "AArch64 GISel does not support strict-align yet\n");
2337     return false;
2338   }
2339 
2340   MIB.setInstrAndDebugLoc(I);
2341 
2342   unsigned Opcode = I.getOpcode();
  // G_PHI requires the same handling as PHI.
2344   if (!I.isPreISelOpcode() || Opcode == TargetOpcode::G_PHI) {
2345     // Certain non-generic instructions also need some special handling.
2346 
2347     if (Opcode ==  TargetOpcode::LOAD_STACK_GUARD)
2348       return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2349 
2350     if (Opcode == TargetOpcode::PHI || Opcode == TargetOpcode::G_PHI) {
2351       const Register DefReg = I.getOperand(0).getReg();
2352       const LLT DefTy = MRI.getType(DefReg);
2353 
2354       const RegClassOrRegBank &RegClassOrBank =
2355         MRI.getRegClassOrRegBank(DefReg);
2356 
2357       const TargetRegisterClass *DefRC
2358         = RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
2359       if (!DefRC) {
2360         if (!DefTy.isValid()) {
2361           LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
2362           return false;
2363         }
2364         const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
2365         DefRC = getRegClassForTypeOnBank(DefTy, RB);
2366         if (!DefRC) {
2367           LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
2368           return false;
2369         }
2370       }
2371 
2372       I.setDesc(TII.get(TargetOpcode::PHI));
2373 
2374       return RBI.constrainGenericRegister(DefReg, *DefRC, MRI);
2375     }
2376 
2377     if (I.isCopy())
2378       return selectCopy(I, TII, MRI, TRI, RBI);
2379 
2380     return true;
2381   }
2382 
2383 
2384   if (I.getNumOperands() != I.getNumExplicitOperands()) {
2385     LLVM_DEBUG(
2386         dbgs() << "Generic instruction has unexpected implicit operands\n");
2387     return false;
2388   }
2389 
2390   // Try to do some lowering before we start instruction selecting. These
2391   // lowerings are purely transformations on the input G_MIR and so selection
2392   // must continue after any modification of the instruction.
2393   if (preISelLower(I)) {
2394     Opcode = I.getOpcode(); // The opcode may have been modified, refresh it.
2395   }
2396 
2397   // There may be patterns where the importer can't deal with them optimally,
2398   // but does select it to a suboptimal sequence so our custom C++ selection
2399   // code later never has a chance to work on it. Therefore, we have an early
2400   // selection attempt here to give priority to certain selection routines
2401   // over the imported ones.
2402   if (earlySelect(I))
2403     return true;
2404 
2405   if (selectImpl(I, *CoverageInfo))
2406     return true;
2407 
2408   LLT Ty =
2409       I.getOperand(0).isReg() ? MRI.getType(I.getOperand(0).getReg()) : LLT{};
2410 
2411   switch (Opcode) {
2412   case TargetOpcode::G_SBFX:
2413   case TargetOpcode::G_UBFX: {
2414     static const unsigned OpcTable[2][2] = {
2415         {AArch64::UBFMWri, AArch64::UBFMXri},
2416         {AArch64::SBFMWri, AArch64::SBFMXri}};
2417     bool IsSigned = Opcode == TargetOpcode::G_SBFX;
2418     unsigned Size = Ty.getSizeInBits();
2419     unsigned Opc = OpcTable[IsSigned][Size == 64];
2420     auto Cst1 =
2421         getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), MRI);
2422     assert(Cst1 && "Should have gotten a constant for src 1?");
2423     auto Cst2 =
2424         getIConstantVRegValWithLookThrough(I.getOperand(3).getReg(), MRI);
2425     assert(Cst2 && "Should have gotten a constant for src 2?");
2426     auto LSB = Cst1->Value.getZExtValue();
2427     auto Width = Cst2->Value.getZExtValue();
2428     auto BitfieldInst =
2429         MIB.buildInstr(Opc, {I.getOperand(0)}, {I.getOperand(1)})
2430             .addImm(LSB)
2431             .addImm(LSB + Width - 1);
2432     I.eraseFromParent();
2433     return constrainSelectedInstRegOperands(*BitfieldInst, TII, TRI, RBI);
2434   }
2435   case TargetOpcode::G_BRCOND:
2436     return selectCompareBranch(I, MF, MRI);
2437 
2438   case TargetOpcode::G_BRINDIRECT: {
2439     I.setDesc(TII.get(AArch64::BR));
2440     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2441   }
2442 
2443   case TargetOpcode::G_BRJT:
2444     return selectBrJT(I, MRI);
2445 
2446   case AArch64::G_ADD_LOW: {
    // This op may have been separated from its ADRP companion by the localizer
2448     // or some other code motion pass. Given that many CPUs will try to
2449     // macro fuse these operations anyway, select this into a MOVaddr pseudo
2450     // which will later be expanded into an ADRP+ADD pair after scheduling.
2451     MachineInstr *BaseMI = MRI.getVRegDef(I.getOperand(1).getReg());
2452     if (BaseMI->getOpcode() != AArch64::ADRP) {
2453       I.setDesc(TII.get(AArch64::ADDXri));
2454       I.addOperand(MachineOperand::CreateImm(0));
2455       return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2456     }
2457     assert(TM.getCodeModel() == CodeModel::Small &&
2458            "Expected small code model");
2459     auto Op1 = BaseMI->getOperand(1);
2460     auto Op2 = I.getOperand(2);
2461     auto MovAddr = MIB.buildInstr(AArch64::MOVaddr, {I.getOperand(0)}, {})
2462                        .addGlobalAddress(Op1.getGlobal(), Op1.getOffset(),
2463                                          Op1.getTargetFlags())
2464                        .addGlobalAddress(Op2.getGlobal(), Op2.getOffset(),
2465                                          Op2.getTargetFlags());
2466     I.eraseFromParent();
2467     return constrainSelectedInstRegOperands(*MovAddr, TII, TRI, RBI);
2468   }
2469 
2470   case TargetOpcode::G_BSWAP: {
2471     // Handle vector types for G_BSWAP directly.
2472     Register DstReg = I.getOperand(0).getReg();
2473     LLT DstTy = MRI.getType(DstReg);
2474 
2475     // We should only get vector types here; everything else is handled by the
2476     // importer right now.
2477     if (!DstTy.isVector() || DstTy.getSizeInBits() > 128) {
2478       LLVM_DEBUG(dbgs() << "Dst type for G_BSWAP currently unsupported.\n");
2479       return false;
2480     }
2481 
2482     // Only handle 4 and 2 element vectors for now.
2483     // TODO: 16-bit elements.
2484     unsigned NumElts = DstTy.getNumElements();
2485     if (NumElts != 4 && NumElts != 2) {
2486       LLVM_DEBUG(dbgs() << "Unsupported number of elements for G_BSWAP.\n");
2487       return false;
2488     }
2489 
2490     // Choose the correct opcode for the supported types. Right now, that's
2491     // v2s32, v4s32, and v2s64.
2492     unsigned Opc = 0;
2493     unsigned EltSize = DstTy.getElementType().getSizeInBits();
2494     if (EltSize == 32)
2495       Opc = (DstTy.getNumElements() == 2) ? AArch64::REV32v8i8
2496                                           : AArch64::REV32v16i8;
2497     else if (EltSize == 64)
2498       Opc = AArch64::REV64v16i8;
2499 
2500     // We should always get something by the time we get here...
2501     assert(Opc != 0 && "Didn't get an opcode for G_BSWAP?");
2502 
2503     I.setDesc(TII.get(Opc));
2504     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2505   }
2506 
2507   case TargetOpcode::G_FCONSTANT:
2508   case TargetOpcode::G_CONSTANT: {
2509     const bool isFP = Opcode == TargetOpcode::G_FCONSTANT;
2510 
2511     const LLT s8 = LLT::scalar(8);
2512     const LLT s16 = LLT::scalar(16);
2513     const LLT s32 = LLT::scalar(32);
2514     const LLT s64 = LLT::scalar(64);
2515     const LLT s128 = LLT::scalar(128);
2516     const LLT p0 = LLT::pointer(0, 64);
2517 
2518     const Register DefReg = I.getOperand(0).getReg();
2519     const LLT DefTy = MRI.getType(DefReg);
2520     const unsigned DefSize = DefTy.getSizeInBits();
2521     const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
2522 
2523     // FIXME: Redundant check, but even less readable when factored out.
2524     if (isFP) {
2525       if (Ty != s16 && Ty != s32 && Ty != s64 && Ty != s128) {
2526         LLVM_DEBUG(dbgs() << "Unable to materialize FP " << Ty
2527                           << " constant, expected: " << s16 << " or " << s32
2528                           << " or " << s64 << " or " << s128 << '\n');
2529         return false;
2530       }
2531 
2532       if (RB.getID() != AArch64::FPRRegBankID) {
2533         LLVM_DEBUG(dbgs() << "Unable to materialize FP " << Ty
2534                           << " constant on bank: " << RB
2535                           << ", expected: FPR\n");
2536         return false;
2537       }
2538 
2539       // The case when we have 0.0 is covered by tablegen. Reject it here so we
2540       // can be sure tablegen works correctly and isn't rescued by this code.
2541       // 0.0 is not covered by tablegen for FP128. So we will handle this
2542       // scenario in the code here.
2543       if (DefSize != 128 && I.getOperand(1).getFPImm()->isExactlyValue(0.0))
2544         return false;
2545     } else {
2546       // s32 and s64 are covered by tablegen.
2547       if (Ty != p0 && Ty != s8 && Ty != s16) {
2548         LLVM_DEBUG(dbgs() << "Unable to materialize integer " << Ty
2549                           << " constant, expected: " << s32 << ", " << s64
2550                           << ", or " << p0 << '\n');
2551         return false;
2552       }
2553 
2554       if (RB.getID() != AArch64::GPRRegBankID) {
2555         LLVM_DEBUG(dbgs() << "Unable to materialize integer " << Ty
2556                           << " constant on bank: " << RB
2557                           << ", expected: GPR\n");
2558         return false;
2559       }
2560     }
2561 
2562     if (isFP) {
2563       const TargetRegisterClass &FPRRC = *getRegClassForTypeOnBank(DefTy, RB);
2564       // For 16, 64, and 128b values, emit a constant pool load.
2565       switch (DefSize) {
2566       default:
2567         llvm_unreachable("Unexpected destination size for G_FCONSTANT?");
2568       case 32:
2569         // For s32, use a cp load if we have optsize/minsize.
2570         if (!shouldOptForSize(&MF))
2571           break;
2572         LLVM_FALLTHROUGH;
2573       case 16:
2574       case 64:
2575       case 128: {
2576         auto *FPImm = I.getOperand(1).getFPImm();
2577         auto *LoadMI = emitLoadFromConstantPool(FPImm, MIB);
2578         if (!LoadMI) {
2579           LLVM_DEBUG(dbgs() << "Failed to load double constant pool entry\n");
2580           return false;
2581         }
2582         MIB.buildCopy({DefReg}, {LoadMI->getOperand(0).getReg()});
2583         I.eraseFromParent();
2584         return RBI.constrainGenericRegister(DefReg, FPRRC, MRI);
2585       }
2586       }
2587 
2588       // Either emit a FMOV, or emit a copy to emit a normal mov.
2589       assert(DefSize == 32 &&
2590              "Expected constant pool loads for all sizes other than 32!");
2591       const Register DefGPRReg =
2592           MRI.createVirtualRegister(&AArch64::GPR32RegClass);
2593       MachineOperand &RegOp = I.getOperand(0);
2594       RegOp.setReg(DefGPRReg);
2595       MIB.setInsertPt(MIB.getMBB(), std::next(I.getIterator()));
2596       MIB.buildCopy({DefReg}, {DefGPRReg});
2597 
2598       if (!RBI.constrainGenericRegister(DefReg, FPRRC, MRI)) {
2599         LLVM_DEBUG(dbgs() << "Failed to constrain G_FCONSTANT def operand\n");
2600         return false;
2601       }
2602 
2603       MachineOperand &ImmOp = I.getOperand(1);
2604       // FIXME: Is going through int64_t always correct?
2605       ImmOp.ChangeToImmediate(
2606           ImmOp.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
2607     } else if (I.getOperand(1).isCImm()) {
2608       uint64_t Val = I.getOperand(1).getCImm()->getZExtValue();
2609       I.getOperand(1).ChangeToImmediate(Val);
2610     } else if (I.getOperand(1).isImm()) {
2611       uint64_t Val = I.getOperand(1).getImm();
2612       I.getOperand(1).ChangeToImmediate(Val);
2613     }
2614 
2615     const unsigned MovOpc =
2616         DefSize == 64 ? AArch64::MOVi64imm : AArch64::MOVi32imm;
2617     I.setDesc(TII.get(MovOpc));
2618     constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2619     return true;
2620   }
2621   case TargetOpcode::G_EXTRACT: {
2622     Register DstReg = I.getOperand(0).getReg();
2623     Register SrcReg = I.getOperand(1).getReg();
2624     LLT SrcTy = MRI.getType(SrcReg);
2625     LLT DstTy = MRI.getType(DstReg);
2626     (void)DstTy;
2627     unsigned SrcSize = SrcTy.getSizeInBits();
2628 
2629     if (SrcTy.getSizeInBits() > 64) {
2630       // This should be an extract of an s128, which is like a vector extract.
2631       if (SrcTy.getSizeInBits() != 128)
2632         return false;
2633       // Only support extracting 64 bits from an s128 at the moment.
2634       if (DstTy.getSizeInBits() != 64)
2635         return false;
2636 
2637       unsigned Offset = I.getOperand(2).getImm();
2638       if (Offset % 64 != 0)
2639         return false;
2640 
2641       // Check we have the right regbank always.
2642       const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI);
2643       const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
2644       assert(SrcRB.getID() == DstRB.getID() && "Wrong extract regbank!");
2645 
2646       if (SrcRB.getID() == AArch64::GPRRegBankID) {
2647         MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {})
2648             .addUse(SrcReg, 0, Offset == 0 ? AArch64::sube64 : AArch64::subo64);
2649         I.eraseFromParent();
2650         return true;
2651       }
2652 
2653       // Emit the same code as a vector extract.
2654       // Offset must be a multiple of 64.
2655       unsigned LaneIdx = Offset / 64;
2656       MachineInstr *Extract = emitExtractVectorElt(
2657           DstReg, DstRB, LLT::scalar(64), SrcReg, LaneIdx, MIB);
2658       if (!Extract)
2659         return false;
2660       I.eraseFromParent();
2661       return true;
2662     }
2663 
2664     I.setDesc(TII.get(SrcSize == 64 ? AArch64::UBFMXri : AArch64::UBFMWri));
2665     MachineInstrBuilder(MF, I).addImm(I.getOperand(2).getImm() +
2666                                       Ty.getSizeInBits() - 1);
2667 
2668     if (SrcSize < 64) {
2669       assert(SrcSize == 32 && DstTy.getSizeInBits() == 16 &&
2670              "unexpected G_EXTRACT types");
2671       return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2672     }
2673 
2674     DstReg = MRI.createGenericVirtualRegister(LLT::scalar(64));
2675     MIB.setInsertPt(MIB.getMBB(), std::next(I.getIterator()));
2676     MIB.buildInstr(TargetOpcode::COPY, {I.getOperand(0).getReg()}, {})
2677         .addReg(DstReg, 0, AArch64::sub_32);
2678     RBI.constrainGenericRegister(I.getOperand(0).getReg(),
2679                                  AArch64::GPR32RegClass, MRI);
2680     I.getOperand(0).setReg(DstReg);
2681 
2682     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2683   }
2684 
2685   case TargetOpcode::G_INSERT: {
2686     LLT SrcTy = MRI.getType(I.getOperand(2).getReg());
2687     LLT DstTy = MRI.getType(I.getOperand(0).getReg());
2688     unsigned DstSize = DstTy.getSizeInBits();
2689     // Larger inserts are vectors, same-size ones should be something else by
2690     // now (split up or turned into COPYs).
2691     if (Ty.getSizeInBits() > 64 || SrcTy.getSizeInBits() > 32)
2692       return false;
2693 
2694     I.setDesc(TII.get(DstSize == 64 ? AArch64::BFMXri : AArch64::BFMWri));
2695     unsigned LSB = I.getOperand(3).getImm();
2696     unsigned Width = MRI.getType(I.getOperand(2).getReg()).getSizeInBits();
2697     I.getOperand(3).setImm((DstSize - LSB) % DstSize);
2698     MachineInstrBuilder(MF, I).addImm(Width - 1);
2699 
2700     if (DstSize < 64) {
2701       assert(DstSize == 32 && SrcTy.getSizeInBits() == 16 &&
2702              "unexpected G_INSERT types");
2703       return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2704     }
2705 
2706     Register SrcReg = MRI.createGenericVirtualRegister(LLT::scalar(64));
2707     BuildMI(MBB, I.getIterator(), I.getDebugLoc(),
2708             TII.get(AArch64::SUBREG_TO_REG))
2709         .addDef(SrcReg)
2710         .addImm(0)
2711         .addUse(I.getOperand(2).getReg())
2712         .addImm(AArch64::sub_32);
2713     RBI.constrainGenericRegister(I.getOperand(2).getReg(),
2714                                  AArch64::GPR32RegClass, MRI);
2715     I.getOperand(2).setReg(SrcReg);
2716 
2717     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2718   }
2719   case TargetOpcode::G_FRAME_INDEX: {
2720     // allocas and G_FRAME_INDEX are only supported in addrspace(0).
2721     if (Ty != LLT::pointer(0, 64)) {
2722       LLVM_DEBUG(dbgs() << "G_FRAME_INDEX pointer has type: " << Ty
2723                         << ", expected: " << LLT::pointer(0, 64) << '\n');
2724       return false;
2725     }
2726     I.setDesc(TII.get(AArch64::ADDXri));
2727 
2728     // MOs for a #0 shifted immediate.
2729     I.addOperand(MachineOperand::CreateImm(0));
2730     I.addOperand(MachineOperand::CreateImm(0));
2731 
2732     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2733   }
2734 
2735   case TargetOpcode::G_GLOBAL_VALUE: {
2736     auto GV = I.getOperand(1).getGlobal();
2737     if (GV->isThreadLocal())
2738       return selectTLSGlobalValue(I, MRI);
2739 
2740     unsigned OpFlags = STI.ClassifyGlobalReference(GV, TM);
2741     if (OpFlags & AArch64II::MO_GOT) {
2742       I.setDesc(TII.get(AArch64::LOADgot));
2743       I.getOperand(1).setTargetFlags(OpFlags);
2744     } else if (TM.getCodeModel() == CodeModel::Large) {
2745       // Materialize the global using movz/movk instructions.
2746       materializeLargeCMVal(I, GV, OpFlags);
2747       I.eraseFromParent();
2748       return true;
2749     } else if (TM.getCodeModel() == CodeModel::Tiny) {
2750       I.setDesc(TII.get(AArch64::ADR));
2751       I.getOperand(1).setTargetFlags(OpFlags);
2752     } else {
2753       I.setDesc(TII.get(AArch64::MOVaddr));
2754       I.getOperand(1).setTargetFlags(OpFlags | AArch64II::MO_PAGE);
2755       MachineInstrBuilder MIB(MF, I);
2756       MIB.addGlobalAddress(GV, I.getOperand(1).getOffset(),
2757                            OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
2758     }
2759     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2760   }
2761 
2762   case TargetOpcode::G_ZEXTLOAD:
2763   case TargetOpcode::G_LOAD:
2764   case TargetOpcode::G_STORE: {
2765     GLoadStore &LdSt = cast<GLoadStore>(I);
2766     bool IsZExtLoad = I.getOpcode() == TargetOpcode::G_ZEXTLOAD;
2767     LLT PtrTy = MRI.getType(LdSt.getPointerReg());
2768 
2769     if (PtrTy != LLT::pointer(0, 64)) {
2770       LLVM_DEBUG(dbgs() << "Load/Store pointer has type: " << PtrTy
2771                         << ", expected: " << LLT::pointer(0, 64) << '\n');
2772       return false;
2773     }
2774 
2775     uint64_t MemSizeInBytes = LdSt.getMemSize();
2776     unsigned MemSizeInBits = LdSt.getMemSizeInBits();
2777     AtomicOrdering Order = LdSt.getMMO().getSuccessOrdering();
2778 
2779     // Need special instructions for atomics that affect ordering.
2780     if (Order != AtomicOrdering::NotAtomic &&
2781         Order != AtomicOrdering::Unordered &&
2782         Order != AtomicOrdering::Monotonic) {
2783       assert(!isa<GZExtLoad>(LdSt));
2784       if (MemSizeInBytes > 64)
2785         return false;
2786 
2787       if (isa<GLoad>(LdSt)) {
2788         static constexpr unsigned LDAPROpcodes[] = {
2789             AArch64::LDAPRB, AArch64::LDAPRH, AArch64::LDAPRW, AArch64::LDAPRX};
2790         static constexpr unsigned LDAROpcodes[] = {
2791             AArch64::LDARB, AArch64::LDARH, AArch64::LDARW, AArch64::LDARX};
2792         ArrayRef<unsigned> Opcodes =
2793             STI.hasLDAPR() && Order != AtomicOrdering::SequentiallyConsistent
2794                 ? LDAPROpcodes
2795                 : LDAROpcodes;
2796         I.setDesc(TII.get(Opcodes[Log2_32(MemSizeInBytes)]));
2797       } else {
2798         static constexpr unsigned Opcodes[] = {AArch64::STLRB, AArch64::STLRH,
2799                                                AArch64::STLRW, AArch64::STLRX};
2800         Register ValReg = LdSt.getReg(0);
2801         if (MRI.getType(ValReg).getSizeInBits() == 64 && MemSizeInBits != 64) {
2802           // Emit a subreg copy of 32 bits.
2803           Register NewVal = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
2804           MIB.buildInstr(TargetOpcode::COPY, {NewVal}, {})
2805               .addReg(I.getOperand(0).getReg(), 0, AArch64::sub_32);
2806           I.getOperand(0).setReg(NewVal);
2807         }
2808         I.setDesc(TII.get(Opcodes[Log2_32(MemSizeInBytes)]));
2809       }
2810       constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2811       return true;
2812     }
2813 
2814 #ifndef NDEBUG
2815     const Register PtrReg = LdSt.getPointerReg();
2816     const RegisterBank &PtrRB = *RBI.getRegBank(PtrReg, MRI, TRI);
2817     // Check that the pointer register is valid.
2818     assert(PtrRB.getID() == AArch64::GPRRegBankID &&
2819            "Load/Store pointer operand isn't a GPR");
2820     assert(MRI.getType(PtrReg).isPointer() &&
2821            "Load/Store pointer operand isn't a pointer");
2822 #endif
2823 
2824     const Register ValReg = LdSt.getReg(0);
2825     const LLT ValTy = MRI.getType(ValReg);
2826     const RegisterBank &RB = *RBI.getRegBank(ValReg, MRI, TRI);
2827 
2828     // The code below doesn't support truncating stores, so we need to split it
2829     // again.
2830     if (isa<GStore>(LdSt) && ValTy.getSizeInBits() > MemSizeInBits) {
2831       unsigned SubReg;
2832       LLT MemTy = LdSt.getMMO().getMemoryType();
2833       auto *RC = getRegClassForTypeOnBank(MemTy, RB);
2834       if (!getSubRegForClass(RC, TRI, SubReg))
2835         return false;
2836 
2837       // Generate a subreg copy.
2838       auto Copy = MIB.buildInstr(TargetOpcode::COPY, {MemTy}, {})
2839                       .addReg(ValReg, 0, SubReg)
2840                       .getReg(0);
2841       RBI.constrainGenericRegister(Copy, *RC, MRI);
2842       LdSt.getOperand(0).setReg(Copy);
2843     } else if (isa<GLoad>(LdSt) && ValTy.getSizeInBits() > MemSizeInBits) {
2844       // If this is an any-extending load from the FPR bank, split it into a regular
2845       // load + extend.
2846       if (RB.getID() == AArch64::FPRRegBankID) {
2847         unsigned SubReg;
2848         LLT MemTy = LdSt.getMMO().getMemoryType();
2849         auto *RC = getRegClassForTypeOnBank(MemTy, RB);
2850         if (!getSubRegForClass(RC, TRI, SubReg))
2851           return false;
2852         Register OldDst = LdSt.getReg(0);
2853         Register NewDst =
2854             MRI.createGenericVirtualRegister(LdSt.getMMO().getMemoryType());
2855         LdSt.getOperand(0).setReg(NewDst);
2856         MRI.setRegBank(NewDst, RB);
2857         // Generate a SUBREG_TO_REG to extend it.
2858         MIB.setInsertPt(MIB.getMBB(), std::next(LdSt.getIterator()));
2859         MIB.buildInstr(AArch64::SUBREG_TO_REG, {OldDst}, {})
2860             .addImm(0)
2861             .addUse(NewDst)
2862             .addImm(SubReg);
2863         auto SubRegRC = getRegClassForTypeOnBank(MRI.getType(OldDst), RB);
2864         RBI.constrainGenericRegister(OldDst, *SubRegRC, MRI);
2865         MIB.setInstr(LdSt);
2866       }
2867     }
2868 
2869     // Helper lambda for partially selecting I. Either returns the original
2870     // instruction with an updated opcode, or a new instruction.
2871     auto SelectLoadStoreAddressingMode = [&]() -> MachineInstr * {
2872       bool IsStore = isa<GStore>(I);
2873       const unsigned NewOpc =
2874           selectLoadStoreUIOp(I.getOpcode(), RB.getID(), MemSizeInBits);
2875       if (NewOpc == I.getOpcode())
2876         return nullptr;
2877       // Check if we can fold anything into the addressing mode.
2878       auto AddrModeFns =
2879           selectAddrModeIndexed(I.getOperand(1), MemSizeInBytes);
2880       if (!AddrModeFns) {
2881         // Can't fold anything. Use the original instruction.
2882         I.setDesc(TII.get(NewOpc));
2883         I.addOperand(MachineOperand::CreateImm(0));
2884         return &I;
2885       }
2886 
2887       // Folded something. Create a new instruction and return it.
2888       auto NewInst = MIB.buildInstr(NewOpc, {}, {}, I.getFlags());
2889       Register CurValReg = I.getOperand(0).getReg();
2890       IsStore ? NewInst.addUse(CurValReg) : NewInst.addDef(CurValReg);
2891       NewInst.cloneMemRefs(I);
2892       for (auto &Fn : *AddrModeFns)
2893         Fn(NewInst);
2894       I.eraseFromParent();
2895       return &*NewInst;
2896     };
2897 
2898     MachineInstr *LoadStore = SelectLoadStoreAddressingMode();
2899     if (!LoadStore)
2900       return false;
2901 
2902     // If we're storing a 0, use WZR/XZR.
2903     if (Opcode == TargetOpcode::G_STORE) {
2904       auto CVal = getIConstantVRegValWithLookThrough(
2905           LoadStore->getOperand(0).getReg(), MRI);
2906       if (CVal && CVal->Value == 0) {
2907         switch (LoadStore->getOpcode()) {
2908         case AArch64::STRWui:
2909         case AArch64::STRHHui:
2910         case AArch64::STRBBui:
2911           LoadStore->getOperand(0).setReg(AArch64::WZR);
2912           break;
2913         case AArch64::STRXui:
2914           LoadStore->getOperand(0).setReg(AArch64::XZR);
2915           break;
2916         }
2917       }
2918     }
2919 
2920     if (IsZExtLoad) {
2921       // The zextload from a smaller type to i32 should be handled by the
2922       // importer.
2923       if (MRI.getType(LoadStore->getOperand(0).getReg()).getSizeInBits() != 64)
2924         return false;
2925       // If we have a ZEXTLOAD then change the load's type to be a narrower reg
2926       // and zero_extend with SUBREG_TO_REG.
2927       Register LdReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
2928       Register DstReg = LoadStore->getOperand(0).getReg();
2929       LoadStore->getOperand(0).setReg(LdReg);
2930 
2931       MIB.setInsertPt(MIB.getMBB(), std::next(LoadStore->getIterator()));
2932       MIB.buildInstr(AArch64::SUBREG_TO_REG, {DstReg}, {})
2933           .addImm(0)
2934           .addUse(LdReg)
2935           .addImm(AArch64::sub_32);
2936       constrainSelectedInstRegOperands(*LoadStore, TII, TRI, RBI);
2937       return RBI.constrainGenericRegister(DstReg, AArch64::GPR64allRegClass,
2938                                           MRI);
2939     }
2940     return constrainSelectedInstRegOperands(*LoadStore, TII, TRI, RBI);
2941   }
2942 
2943   case TargetOpcode::G_SMULH:
2944   case TargetOpcode::G_UMULH: {
2945     // Reject the various things we don't support yet.
2946     if (unsupportedBinOp(I, RBI, MRI, TRI))
2947       return false;
2948 
2949     const Register DefReg = I.getOperand(0).getReg();
2950     const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
2951 
2952     if (RB.getID() != AArch64::GPRRegBankID) {
2953       LLVM_DEBUG(dbgs() << "G_[SU]MULH on bank: " << RB << ", expected: GPR\n");
2954       return false;
2955     }
2956 
2957     if (Ty != LLT::scalar(64)) {
2958       LLVM_DEBUG(dbgs() << "G_[SU]MULH has type: " << Ty
2959                         << ", expected: " << LLT::scalar(64) << '\n');
2960       return false;
2961     }
2962 
2963     unsigned NewOpc = I.getOpcode() == TargetOpcode::G_SMULH ? AArch64::SMULHrr
2964                                                              : AArch64::UMULHrr;
2965     I.setDesc(TII.get(NewOpc));
2966 
2967     // Now that we selected an opcode, we need to constrain the register
2968     // operands to use appropriate classes.
2969     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2970   }
2971   case TargetOpcode::G_LSHR:
2972   case TargetOpcode::G_ASHR:
2973     if (MRI.getType(I.getOperand(0).getReg()).isVector())
2974       return selectVectorAshrLshr(I, MRI);
2975     LLVM_FALLTHROUGH;
2976   case TargetOpcode::G_SHL:
2977     if (Opcode == TargetOpcode::G_SHL &&
2978         MRI.getType(I.getOperand(0).getReg()).isVector())
2979       return selectVectorSHL(I, MRI);
2980 
2981     // These shifts were legalized to have 64 bit shift amounts because we
2982     // want to take advantage of the selection patterns that assume the
2983     // immediates are s64s, however, selectBinaryOp will assume both operands
2984     // will have the same bit size.
2985     {
2986       Register SrcReg = I.getOperand(1).getReg();
2987       Register ShiftReg = I.getOperand(2).getReg();
2988       const LLT ShiftTy = MRI.getType(ShiftReg);
2989       const LLT SrcTy = MRI.getType(SrcReg);
2990       if (!SrcTy.isVector() && SrcTy.getSizeInBits() == 32 &&
2991           ShiftTy.getSizeInBits() == 64) {
2992         assert(!ShiftTy.isVector() && "unexpected vector shift ty");
2993         // Insert a subregister copy to implement a 64->32 trunc
2994         auto Trunc = MIB.buildInstr(TargetOpcode::COPY, {SrcTy}, {})
2995                          .addReg(ShiftReg, 0, AArch64::sub_32);
2996         MRI.setRegBank(Trunc.getReg(0), RBI.getRegBank(AArch64::GPRRegBankID));
2997         I.getOperand(2).setReg(Trunc.getReg(0));
2998       }
2999     }
3000     LLVM_FALLTHROUGH;
3001   case TargetOpcode::G_OR: {
3002     // Reject the various things we don't support yet.
3003     if (unsupportedBinOp(I, RBI, MRI, TRI))
3004       return false;
3005 
3006     const unsigned OpSize = Ty.getSizeInBits();
3007 
3008     const Register DefReg = I.getOperand(0).getReg();
3009     const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
3010 
3011     const unsigned NewOpc = selectBinaryOp(I.getOpcode(), RB.getID(), OpSize);
3012     if (NewOpc == I.getOpcode())
3013       return false;
3014 
3015     I.setDesc(TII.get(NewOpc));
3016     // FIXME: Should the type be always reset in setDesc?
3017 
3018     // Now that we selected an opcode, we need to constrain the register
3019     // operands to use appropriate classes.
3020     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3021   }
3022 
3023   case TargetOpcode::G_PTR_ADD: {
3024     emitADD(I.getOperand(0).getReg(), I.getOperand(1), I.getOperand(2), MIB);
3025     I.eraseFromParent();
3026     return true;
3027   }
3028   case TargetOpcode::G_SADDO:
3029   case TargetOpcode::G_UADDO:
3030   case TargetOpcode::G_SSUBO:
3031   case TargetOpcode::G_USUBO: {
3032     // Emit the operation and get the correct condition code.
3033     auto OpAndCC = emitOverflowOp(Opcode, I.getOperand(0).getReg(),
3034                                   I.getOperand(2), I.getOperand(3), MIB);
3035 
3036     // Now, put the overflow result in the register given by the first operand
3037     // to the overflow op. CSINC increments the result when the predicate is
3038     // false, so to get the increment when it's true, we need to use the
3039     // inverse. In this case, we want to increment when carry is set.
3040     Register ZReg = AArch64::WZR;
3041     emitCSINC(/*Dst=*/I.getOperand(1).getReg(), /*Src1=*/ZReg, /*Src2=*/ZReg,
3042               getInvertedCondCode(OpAndCC.second), MIB);
3043     I.eraseFromParent();
3044     return true;
3045   }
3046 
3047   case TargetOpcode::G_PTRMASK: {
3048     Register MaskReg = I.getOperand(2).getReg();
3049     Optional<int64_t> MaskVal = getIConstantVRegSExtVal(MaskReg, MRI);
3050     // TODO: Implement arbitrary cases
3051     if (!MaskVal || !isShiftedMask_64(*MaskVal))
3052       return false;
3053 
3054     uint64_t Mask = *MaskVal;
3055     I.setDesc(TII.get(AArch64::ANDXri));
3056     I.getOperand(2).ChangeToImmediate(
3057         AArch64_AM::encodeLogicalImmediate(Mask, 64));
3058 
3059     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3060   }
3061   case TargetOpcode::G_PTRTOINT:
3062   case TargetOpcode::G_TRUNC: {
3063     const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
3064     const LLT SrcTy = MRI.getType(I.getOperand(1).getReg());
3065 
3066     const Register DstReg = I.getOperand(0).getReg();
3067     const Register SrcReg = I.getOperand(1).getReg();
3068 
3069     const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
3070     const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI);
3071 
3072     if (DstRB.getID() != SrcRB.getID()) {
3073       LLVM_DEBUG(
3074           dbgs() << "G_TRUNC/G_PTRTOINT input/output on different banks\n");
3075       return false;
3076     }
3077 
3078     if (DstRB.getID() == AArch64::GPRRegBankID) {
3079       const TargetRegisterClass *DstRC = getRegClassForTypeOnBank(DstTy, DstRB);
3080       if (!DstRC)
3081         return false;
3082 
3083       const TargetRegisterClass *SrcRC = getRegClassForTypeOnBank(SrcTy, SrcRB);
3084       if (!SrcRC)
3085         return false;
3086 
3087       if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) ||
3088           !RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
3089         LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC/G_PTRTOINT\n");
3090         return false;
3091       }
3092 
3093       if (DstRC == SrcRC) {
3094         // Nothing to be done
3095       } else if (Opcode == TargetOpcode::G_TRUNC && DstTy == LLT::scalar(32) &&
3096                  SrcTy == LLT::scalar(64)) {
3097         llvm_unreachable("TableGen can import this case");
3098         return false;
3099       } else if (DstRC == &AArch64::GPR32RegClass &&
3100                  SrcRC == &AArch64::GPR64RegClass) {
3101         I.getOperand(1).setSubReg(AArch64::sub_32);
3102       } else {
3103         LLVM_DEBUG(
3104             dbgs() << "Unhandled mismatched classes in G_TRUNC/G_PTRTOINT\n");
3105         return false;
3106       }
3107 
3108       I.setDesc(TII.get(TargetOpcode::COPY));
3109       return true;
3110     } else if (DstRB.getID() == AArch64::FPRRegBankID) {
3111       if (DstTy == LLT::fixed_vector(4, 16) &&
3112           SrcTy == LLT::fixed_vector(4, 32)) {
3113         I.setDesc(TII.get(AArch64::XTNv4i16));
3114         constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3115         return true;
3116       }
3117 
3118       if (!SrcTy.isVector() && SrcTy.getSizeInBits() == 128) {
3119         MachineInstr *Extract = emitExtractVectorElt(
3120             DstReg, DstRB, LLT::scalar(DstTy.getSizeInBits()), SrcReg, 0, MIB);
3121         if (!Extract)
3122           return false;
3123         I.eraseFromParent();
3124         return true;
3125       }
3126 
3127       // We might have a vector G_PTRTOINT, in which case just emit a COPY.
3128       if (Opcode == TargetOpcode::G_PTRTOINT) {
3129         assert(DstTy.isVector() && "Expected an FPR ptrtoint to be a vector");
3130         I.setDesc(TII.get(TargetOpcode::COPY));
3131         return selectCopy(I, TII, MRI, TRI, RBI);
3132       }
3133     }
3134 
3135     return false;
3136   }
3137 
3138   case TargetOpcode::G_ANYEXT: {
3139     if (selectUSMovFromExtend(I, MRI))
3140       return true;
3141 
3142     const Register DstReg = I.getOperand(0).getReg();
3143     const Register SrcReg = I.getOperand(1).getReg();
3144 
3145     const RegisterBank &RBDst = *RBI.getRegBank(DstReg, MRI, TRI);
3146     if (RBDst.getID() != AArch64::GPRRegBankID) {
3147       LLVM_DEBUG(dbgs() << "G_ANYEXT on bank: " << RBDst
3148                         << ", expected: GPR\n");
3149       return false;
3150     }
3151 
3152     const RegisterBank &RBSrc = *RBI.getRegBank(SrcReg, MRI, TRI);
3153     if (RBSrc.getID() != AArch64::GPRRegBankID) {
3154       LLVM_DEBUG(dbgs() << "G_ANYEXT on bank: " << RBSrc
3155                         << ", expected: GPR\n");
3156       return false;
3157     }
3158 
3159     const unsigned DstSize = MRI.getType(DstReg).getSizeInBits();
3160 
3161     if (DstSize == 0) {
3162       LLVM_DEBUG(dbgs() << "G_ANYEXT operand has no size, not a gvreg?\n");
3163       return false;
3164     }
3165 
3166     if (DstSize != 64 && DstSize > 32) {
3167       LLVM_DEBUG(dbgs() << "G_ANYEXT to size: " << DstSize
3168                         << ", expected: 32 or 64\n");
3169       return false;
3170     }
3171     // At this point G_ANYEXT is just like a plain COPY, but we need
3172     // to explicitly form the 64-bit value if any.
3173     if (DstSize > 32) {
3174       Register ExtSrc = MRI.createVirtualRegister(&AArch64::GPR64allRegClass);
3175       BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::SUBREG_TO_REG))
3176           .addDef(ExtSrc)
3177           .addImm(0)
3178           .addUse(SrcReg)
3179           .addImm(AArch64::sub_32);
3180       I.getOperand(1).setReg(ExtSrc);
3181     }
3182     return selectCopy(I, TII, MRI, TRI, RBI);
3183   }
3184 
3185   case TargetOpcode::G_ZEXT:
3186   case TargetOpcode::G_SEXT_INREG:
3187   case TargetOpcode::G_SEXT: {
3188     if (selectUSMovFromExtend(I, MRI))
3189       return true;
3190 
3191     unsigned Opcode = I.getOpcode();
3192     const bool IsSigned = Opcode != TargetOpcode::G_ZEXT;
3193     const Register DefReg = I.getOperand(0).getReg();
3194     Register SrcReg = I.getOperand(1).getReg();
3195     const LLT DstTy = MRI.getType(DefReg);
3196     const LLT SrcTy = MRI.getType(SrcReg);
3197     unsigned DstSize = DstTy.getSizeInBits();
3198     unsigned SrcSize = SrcTy.getSizeInBits();
3199 
3200     // SEXT_INREG has the same src reg size as dst, the size of the value to be
3201     // extended is encoded in the imm.
3202     if (Opcode == TargetOpcode::G_SEXT_INREG)
3203       SrcSize = I.getOperand(2).getImm();
3204 
3205     if (DstTy.isVector())
3206       return false; // Should be handled by imported patterns.
3207 
3208     assert((*RBI.getRegBank(DefReg, MRI, TRI)).getID() ==
3209                AArch64::GPRRegBankID &&
3210            "Unexpected ext regbank");
3211 
3212     MachineInstr *ExtI;
3213 
3214     // First check if we're extending the result of a load which has a dest type
3215     // smaller than 32 bits, then this zext is redundant. GPR32 is the smallest
3216     // GPR register on AArch64 and all loads which are smaller automatically
3217     // zero-extend the upper bits. E.g.
3218     // %v(s8) = G_LOAD %p, :: (load 1)
3219     // %v2(s32) = G_ZEXT %v(s8)
3220     if (!IsSigned) {
3221       auto *LoadMI = getOpcodeDef(TargetOpcode::G_LOAD, SrcReg, MRI);
3222       bool IsGPR =
3223           RBI.getRegBank(SrcReg, MRI, TRI)->getID() == AArch64::GPRRegBankID;
3224       if (LoadMI && IsGPR) {
3225         const MachineMemOperand *MemOp = *LoadMI->memoperands_begin();
3226         unsigned BytesLoaded = MemOp->getSize();
3227         if (BytesLoaded < 4 && SrcTy.getSizeInBytes() == BytesLoaded)
3228           return selectCopy(I, TII, MRI, TRI, RBI);
3229       }
3230 
3231       // For the 32-bit -> 64-bit case, we can emit a mov (ORRWrs)
3232       // + SUBREG_TO_REG.
3233       //
3234       // If we are zero extending from 32 bits to 64 bits, it's possible that
3235       // the instruction implicitly does the zero extend for us. In that case,
3236       // we only need the SUBREG_TO_REG.
3237       if (IsGPR && SrcSize == 32 && DstSize == 64) {
3238         // Unlike with the G_LOAD case, we don't want to look through copies
3239         // here. (See isDef32.)
3240         MachineInstr *Def = MRI.getVRegDef(SrcReg);
3241         Register SubregToRegSrc = SrcReg;
3242 
3243         // Does the instruction implicitly zero extend?
3244         if (!Def || !isDef32(*Def)) {
3245           // No. Zero out using an OR.
3246           Register OrDst = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
3247           const Register ZReg = AArch64::WZR;
3248           MIB.buildInstr(AArch64::ORRWrs, {OrDst}, {ZReg, SrcReg}).addImm(0);
3249           SubregToRegSrc = OrDst;
3250         }
3251 
3252         MIB.buildInstr(AArch64::SUBREG_TO_REG, {DefReg}, {})
3253             .addImm(0)
3254             .addUse(SubregToRegSrc)
3255             .addImm(AArch64::sub_32);
3256 
3257         if (!RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass,
3258                                           MRI)) {
3259           LLVM_DEBUG(dbgs() << "Failed to constrain G_ZEXT destination\n");
3260           return false;
3261         }
3262 
3263         if (!RBI.constrainGenericRegister(SrcReg, AArch64::GPR32RegClass,
3264                                           MRI)) {
3265           LLVM_DEBUG(dbgs() << "Failed to constrain G_ZEXT source\n");
3266           return false;
3267         }
3268 
3269         I.eraseFromParent();
3270         return true;
3271       }
3272     }
3273 
3274     if (DstSize == 64) {
3275       if (Opcode != TargetOpcode::G_SEXT_INREG) {
3276         // FIXME: Can we avoid manually doing this?
3277         if (!RBI.constrainGenericRegister(SrcReg, AArch64::GPR32RegClass,
3278                                           MRI)) {
3279           LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(Opcode)
3280                             << " operand\n");
3281           return false;
3282         }
3283         SrcReg = MIB.buildInstr(AArch64::SUBREG_TO_REG,
3284                                 {&AArch64::GPR64RegClass}, {})
3285                      .addImm(0)
3286                      .addUse(SrcReg)
3287                      .addImm(AArch64::sub_32)
3288                      .getReg(0);
3289       }
3290 
3291       ExtI = MIB.buildInstr(IsSigned ? AArch64::SBFMXri : AArch64::UBFMXri,
3292                              {DefReg}, {SrcReg})
3293                   .addImm(0)
3294                   .addImm(SrcSize - 1);
3295     } else if (DstSize <= 32) {
3296       ExtI = MIB.buildInstr(IsSigned ? AArch64::SBFMWri : AArch64::UBFMWri,
3297                              {DefReg}, {SrcReg})
3298                   .addImm(0)
3299                   .addImm(SrcSize - 1);
3300     } else {
3301       return false;
3302     }
3303 
3304     constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
3305     I.eraseFromParent();
3306     return true;
3307   }
3308 
3309   case TargetOpcode::G_SITOFP:
3310   case TargetOpcode::G_UITOFP:
3311   case TargetOpcode::G_FPTOSI:
3312   case TargetOpcode::G_FPTOUI: {
3313     const LLT DstTy = MRI.getType(I.getOperand(0).getReg()),
3314               SrcTy = MRI.getType(I.getOperand(1).getReg());
3315     const unsigned NewOpc = selectFPConvOpc(Opcode, DstTy, SrcTy);
3316     if (NewOpc == Opcode)
3317       return false;
3318 
3319     I.setDesc(TII.get(NewOpc));
3320     constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3321     I.setFlags(MachineInstr::NoFPExcept);
3322 
3323     return true;
3324   }
3325 
3326   case TargetOpcode::G_FREEZE:
3327     return selectCopy(I, TII, MRI, TRI, RBI);
3328 
3329   case TargetOpcode::G_INTTOPTR:
3330     // The importer is currently unable to import pointer types since they
3331     // didn't exist in SelectionDAG.
3332     return selectCopy(I, TII, MRI, TRI, RBI);
3333 
3334   case TargetOpcode::G_BITCAST:
3335     // Imported SelectionDAG rules can handle every bitcast except those that
3336     // bitcast from a type to the same type. Ideally, these shouldn't occur
3337     // but we might not run an optimizer that deletes them. The other exception
3338     // is bitcasts involving pointer types, as SelectionDAG has no knowledge
3339     // of them.
3340     return selectCopy(I, TII, MRI, TRI, RBI);
3341 
3342   case TargetOpcode::G_SELECT: {
3343     auto &Sel = cast<GSelect>(I);
3344     const Register CondReg = Sel.getCondReg();
3345     const Register TReg = Sel.getTrueReg();
3346     const Register FReg = Sel.getFalseReg();
3347 
3348     if (tryOptSelect(Sel))
3349       return true;
3350 
3351     // Make sure to use an unused vreg instead of wzr, so that the peephole
3352     // optimizations will be able to optimize these.
3353     Register DeadVReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
3354     auto TstMI = MIB.buildInstr(AArch64::ANDSWri, {DeadVReg}, {CondReg})
3355                      .addImm(AArch64_AM::encodeLogicalImmediate(1, 32));
3356     constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI);
3357     if (!emitSelect(Sel.getReg(0), TReg, FReg, AArch64CC::NE, MIB))
3358       return false;
3359     Sel.eraseFromParent();
3360     return true;
3361   }
3362   case TargetOpcode::G_ICMP: {
3363     if (Ty.isVector())
3364       return selectVectorICmp(I, MRI);
3365 
3366     if (Ty != LLT::scalar(32)) {
3367       LLVM_DEBUG(dbgs() << "G_ICMP result has type: " << Ty
3368                         << ", expected: " << LLT::scalar(32) << '\n');
3369       return false;
3370     }
3371 
3372     auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(1).getPredicate());
3373     const AArch64CC::CondCode InvCC =
3374         changeICMPPredToAArch64CC(CmpInst::getInversePredicate(Pred));
3375     emitIntegerCompare(I.getOperand(2), I.getOperand(3), I.getOperand(1), MIB);
3376     emitCSINC(/*Dst=*/I.getOperand(0).getReg(), /*Src1=*/AArch64::WZR,
3377               /*Src2=*/AArch64::WZR, InvCC, MIB);
3378     I.eraseFromParent();
3379     return true;
3380   }
3381 
3382   case TargetOpcode::G_FCMP: {
3383     CmpInst::Predicate Pred =
3384         static_cast<CmpInst::Predicate>(I.getOperand(1).getPredicate());
3385     if (!emitFPCompare(I.getOperand(2).getReg(), I.getOperand(3).getReg(), MIB,
3386                        Pred) ||
3387         !emitCSetForFCmp(I.getOperand(0).getReg(), Pred, MIB))
3388       return false;
3389     I.eraseFromParent();
3390     return true;
3391   }
3392   case TargetOpcode::G_VASTART:
3393     return STI.isTargetDarwin() ? selectVaStartDarwin(I, MF, MRI)
3394                                 : selectVaStartAAPCS(I, MF, MRI);
3395   case TargetOpcode::G_INTRINSIC:
3396     return selectIntrinsic(I, MRI);
3397   case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
3398     return selectIntrinsicWithSideEffects(I, MRI);
3399   case TargetOpcode::G_IMPLICIT_DEF: {
3400     I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
3401     const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
3402     const Register DstReg = I.getOperand(0).getReg();
3403     const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
3404     const TargetRegisterClass *DstRC = getRegClassForTypeOnBank(DstTy, DstRB);
3405     RBI.constrainGenericRegister(DstReg, *DstRC, MRI);
3406     return true;
3407   }
3408   case TargetOpcode::G_BLOCK_ADDR: {
3409     if (TM.getCodeModel() == CodeModel::Large) {
3410       materializeLargeCMVal(I, I.getOperand(1).getBlockAddress(), 0);
3411       I.eraseFromParent();
3412       return true;
3413     } else {
3414       I.setDesc(TII.get(AArch64::MOVaddrBA));
3415       auto MovMI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::MOVaddrBA),
3416                            I.getOperand(0).getReg())
3417                        .addBlockAddress(I.getOperand(1).getBlockAddress(),
3418                                         /* Offset */ 0, AArch64II::MO_PAGE)
3419                        .addBlockAddress(
3420                            I.getOperand(1).getBlockAddress(), /* Offset */ 0,
3421                            AArch64II::MO_NC | AArch64II::MO_PAGEOFF);
3422       I.eraseFromParent();
3423       return constrainSelectedInstRegOperands(*MovMI, TII, TRI, RBI);
3424     }
3425   }
3426   case AArch64::G_DUP: {
3427     // When the scalar of G_DUP is an s8/s16 gpr, they can't be selected by
3428     // imported patterns. Do it manually here. Avoiding generating s16 gpr is
3429     // difficult because at RBS we may end up pessimizing the fpr case if we
3430     // decided to add an anyextend to fix this. Manual selection is the most
3431     // robust solution for now.
3432     if (RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI)->getID() !=
3433         AArch64::GPRRegBankID)
3434       return false; // We expect the fpr regbank case to be imported.
3435     LLT VecTy = MRI.getType(I.getOperand(0).getReg());
3436     if (VecTy == LLT::fixed_vector(8, 8))
3437       I.setDesc(TII.get(AArch64::DUPv8i8gpr));
3438     else if (VecTy == LLT::fixed_vector(16, 8))
3439       I.setDesc(TII.get(AArch64::DUPv16i8gpr));
3440     else if (VecTy == LLT::fixed_vector(4, 16))
3441       I.setDesc(TII.get(AArch64::DUPv4i16gpr));
3442     else if (VecTy == LLT::fixed_vector(8, 16))
3443       I.setDesc(TII.get(AArch64::DUPv8i16gpr));
3444     else
3445       return false;
3446     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3447   }
3448   case TargetOpcode::G_INTRINSIC_TRUNC:
3449     return selectIntrinsicTrunc(I, MRI);
3450   case TargetOpcode::G_INTRINSIC_ROUND:
3451     return selectIntrinsicRound(I, MRI);
3452   case TargetOpcode::G_BUILD_VECTOR:
3453     return selectBuildVector(I, MRI);
3454   case TargetOpcode::G_MERGE_VALUES:
3455     return selectMergeValues(I, MRI);
3456   case TargetOpcode::G_UNMERGE_VALUES:
3457     return selectUnmergeValues(I, MRI);
3458   case TargetOpcode::G_SHUFFLE_VECTOR:
3459     return selectShuffleVector(I, MRI);
3460   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
3461     return selectExtractElt(I, MRI);
3462   case TargetOpcode::G_INSERT_VECTOR_ELT:
3463     return selectInsertElt(I, MRI);
3464   case TargetOpcode::G_CONCAT_VECTORS:
3465     return selectConcatVectors(I, MRI);
3466   case TargetOpcode::G_JUMP_TABLE:
3467     return selectJumpTable(I, MRI);
3468   case TargetOpcode::G_VECREDUCE_FADD:
3469   case TargetOpcode::G_VECREDUCE_ADD:
3470     return selectReduction(I, MRI);
3471   case TargetOpcode::G_MEMCPY:
3472   case TargetOpcode::G_MEMCPY_INLINE:
3473   case TargetOpcode::G_MEMMOVE:
3474   case TargetOpcode::G_MEMSET:
3475     assert(STI.hasMOPS() && "Shouldn't get here without +mops feature");
3476     return selectMOPS(I, MRI);
3477   }
3478 
3479   return false;
3480 }
3481 
3482 bool AArch64InstructionSelector::selectReduction(MachineInstr &I,
3483                                                  MachineRegisterInfo &MRI) {
3484   Register VecReg = I.getOperand(1).getReg();
3485   LLT VecTy = MRI.getType(VecReg);
3486   if (I.getOpcode() == TargetOpcode::G_VECREDUCE_ADD) {
3487     // For <2 x i32> ADDPv2i32 generates an FPR64 value, so we need to emit
3488     // a subregister copy afterwards.
3489     if (VecTy == LLT::fixed_vector(2, 32)) {
3490       Register DstReg = I.getOperand(0).getReg();
3491       auto AddP = MIB.buildInstr(AArch64::ADDPv2i32, {&AArch64::FPR64RegClass},
3492                                  {VecReg, VecReg});
3493       auto Copy = MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {})
3494                       .addReg(AddP.getReg(0), 0, AArch64::ssub)
3495                       .getReg(0);
3496       RBI.constrainGenericRegister(Copy, AArch64::FPR32RegClass, MRI);
3497       I.eraseFromParent();
3498       return constrainSelectedInstRegOperands(*AddP, TII, TRI, RBI);
3499     }
3500 
3501     unsigned Opc = 0;
3502     if (VecTy == LLT::fixed_vector(16, 8))
3503       Opc = AArch64::ADDVv16i8v;
3504     else if (VecTy == LLT::fixed_vector(8, 16))
3505       Opc = AArch64::ADDVv8i16v;
3506     else if (VecTy == LLT::fixed_vector(4, 32))
3507       Opc = AArch64::ADDVv4i32v;
3508     else if (VecTy == LLT::fixed_vector(2, 64))
3509       Opc = AArch64::ADDPv2i64p;
3510     else {
3511       LLVM_DEBUG(dbgs() << "Unhandled type for add reduction");
3512       return false;
3513     }
3514     I.setDesc(TII.get(Opc));
3515     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3516   }
3517 
3518   if (I.getOpcode() == TargetOpcode::G_VECREDUCE_FADD) {
3519     unsigned Opc = 0;
3520     if (VecTy == LLT::fixed_vector(2, 32))
3521       Opc = AArch64::FADDPv2i32p;
3522     else if (VecTy == LLT::fixed_vector(2, 64))
3523       Opc = AArch64::FADDPv2i64p;
3524     else {
3525       LLVM_DEBUG(dbgs() << "Unhandled type for fadd reduction");
3526       return false;
3527     }
3528     I.setDesc(TII.get(Opc));
3529     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3530   }
3531   return false;
3532 }
3533 
3534 bool AArch64InstructionSelector::selectMOPS(MachineInstr &GI,
3535                                             MachineRegisterInfo &MRI) {
3536   unsigned Mopcode;
3537   switch (GI.getOpcode()) {
3538   case TargetOpcode::G_MEMCPY:
3539   case TargetOpcode::G_MEMCPY_INLINE:
3540     Mopcode = AArch64::MOPSMemoryCopyPseudo;
3541     break;
3542   case TargetOpcode::G_MEMMOVE:
3543     Mopcode = AArch64::MOPSMemoryMovePseudo;
3544     break;
3545   case TargetOpcode::G_MEMSET:
3546     // For tagged memset see llvm.aarch64.mops.memset.tag
3547     Mopcode = AArch64::MOPSMemorySetPseudo;
3548     break;
3549   }
3550 
3551   auto &DstPtr = GI.getOperand(0);
3552   auto &SrcOrVal = GI.getOperand(1);
3553   auto &Size = GI.getOperand(2);
3554 
3555   // Create copies of the registers that can be clobbered.
3556   const Register DstPtrCopy = MRI.cloneVirtualRegister(DstPtr.getReg());
3557   const Register SrcValCopy = MRI.cloneVirtualRegister(SrcOrVal.getReg());
3558   const Register SizeCopy = MRI.cloneVirtualRegister(Size.getReg());
3559 
3560   const bool IsSet = Mopcode == AArch64::MOPSMemorySetPseudo;
3561   const auto &SrcValRegClass =
3562       IsSet ? AArch64::GPR64RegClass : AArch64::GPR64commonRegClass;
3563 
3564   // Constrain to specific registers
3565   RBI.constrainGenericRegister(DstPtrCopy, AArch64::GPR64commonRegClass, MRI);
3566   RBI.constrainGenericRegister(SrcValCopy, SrcValRegClass, MRI);
3567   RBI.constrainGenericRegister(SizeCopy, AArch64::GPR64RegClass, MRI);
3568 
3569   MIB.buildCopy(DstPtrCopy, DstPtr);
3570   MIB.buildCopy(SrcValCopy, SrcOrVal);
3571   MIB.buildCopy(SizeCopy, Size);
3572 
3573   // New instruction uses the copied registers because it must update them.
3574   // The defs are not used since they don't exist in G_MEM*. They are still
3575   // tied.
3576   // Note: order of operands is different from G_MEMSET, G_MEMCPY, G_MEMMOVE
3577   Register DefDstPtr = MRI.createVirtualRegister(&AArch64::GPR64commonRegClass);
3578   Register DefSize = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
3579   if (IsSet) {
3580     MIB.buildInstr(Mopcode, {DefDstPtr, DefSize},
3581                    {DstPtrCopy, SizeCopy, SrcValCopy});
3582   } else {
3583     Register DefSrcPtr = MRI.createVirtualRegister(&SrcValRegClass);
3584     MIB.buildInstr(Mopcode, {DefDstPtr, DefSrcPtr, DefSize},
3585                    {DstPtrCopy, SrcValCopy, SizeCopy});
3586   }
3587 
3588   GI.eraseFromParent();
3589   return true;
3590 }
3591 
3592 bool AArch64InstructionSelector::selectBrJT(MachineInstr &I,
3593                                             MachineRegisterInfo &MRI) {
3594   assert(I.getOpcode() == TargetOpcode::G_BRJT && "Expected G_BRJT");
3595   Register JTAddr = I.getOperand(0).getReg();
3596   unsigned JTI = I.getOperand(1).getIndex();
3597   Register Index = I.getOperand(2).getReg();
3598 
3599   Register TargetReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
3600   Register ScratchReg = MRI.createVirtualRegister(&AArch64::GPR64spRegClass);
3601 
3602   MF->getInfo<AArch64FunctionInfo>()->setJumpTableEntryInfo(JTI, 4, nullptr);
3603   auto JumpTableInst = MIB.buildInstr(AArch64::JumpTableDest32,
3604                                       {TargetReg, ScratchReg}, {JTAddr, Index})
3605                            .addJumpTableIndex(JTI);
3606   // Build the indirect branch.
3607   MIB.buildInstr(AArch64::BR, {}, {TargetReg});
3608   I.eraseFromParent();
3609   return constrainSelectedInstRegOperands(*JumpTableInst, TII, TRI, RBI);
3610 }
3611 
3612 bool AArch64InstructionSelector::selectJumpTable(MachineInstr &I,
3613                                                  MachineRegisterInfo &MRI) {
3614   assert(I.getOpcode() == TargetOpcode::G_JUMP_TABLE && "Expected jump table");
3615   assert(I.getOperand(1).isJTI() && "Jump table op should have a JTI!");
3616 
3617   Register DstReg = I.getOperand(0).getReg();
3618   unsigned JTI = I.getOperand(1).getIndex();
3619   // We generate a MOVaddrJT which will get expanded to an ADRP + ADD later.
3620   auto MovMI =
3621     MIB.buildInstr(AArch64::MOVaddrJT, {DstReg}, {})
3622           .addJumpTableIndex(JTI, AArch64II::MO_PAGE)
3623           .addJumpTableIndex(JTI, AArch64II::MO_NC | AArch64II::MO_PAGEOFF);
3624   I.eraseFromParent();
3625   return constrainSelectedInstRegOperands(*MovMI, TII, TRI, RBI);
3626 }
3627 
3628 bool AArch64InstructionSelector::selectTLSGlobalValue(
3629     MachineInstr &I, MachineRegisterInfo &MRI) {
3630   if (!STI.isTargetMachO())
3631     return false;
3632   MachineFunction &MF = *I.getParent()->getParent();
3633   MF.getFrameInfo().setAdjustsStack(true);
3634 
3635   const auto &GlobalOp = I.getOperand(1);
3636   assert(GlobalOp.getOffset() == 0 &&
3637          "Shouldn't have an offset on TLS globals!");
3638   const GlobalValue &GV = *GlobalOp.getGlobal();
3639 
3640   auto LoadGOT =
3641       MIB.buildInstr(AArch64::LOADgot, {&AArch64::GPR64commonRegClass}, {})
3642           .addGlobalAddress(&GV, 0, AArch64II::MO_TLS);
3643 
3644   auto Load = MIB.buildInstr(AArch64::LDRXui, {&AArch64::GPR64commonRegClass},
3645                              {LoadGOT.getReg(0)})
3646                   .addImm(0);
3647 
3648   MIB.buildCopy(Register(AArch64::X0), LoadGOT.getReg(0));
3649   // TLS calls preserve all registers except those that absolutely must be
3650   // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
3651   // silly).
3652   MIB.buildInstr(getBLRCallOpcode(MF), {}, {Load})
3653       .addUse(AArch64::X0, RegState::Implicit)
3654       .addDef(AArch64::X0, RegState::Implicit)
3655       .addRegMask(TRI.getTLSCallPreservedMask());
3656 
3657   MIB.buildCopy(I.getOperand(0).getReg(), Register(AArch64::X0));
3658   RBI.constrainGenericRegister(I.getOperand(0).getReg(), AArch64::GPR64RegClass,
3659                                MRI);
3660   I.eraseFromParent();
3661   return true;
3662 }
3663 
3664 bool AArch64InstructionSelector::selectIntrinsicTrunc(
3665     MachineInstr &I, MachineRegisterInfo &MRI) const {
3666   const LLT SrcTy = MRI.getType(I.getOperand(0).getReg());
3667 
3668   // Select the correct opcode.
3669   unsigned Opc = 0;
3670   if (!SrcTy.isVector()) {
3671     switch (SrcTy.getSizeInBits()) {
3672     default:
3673     case 16:
3674       Opc = AArch64::FRINTZHr;
3675       break;
3676     case 32:
3677       Opc = AArch64::FRINTZSr;
3678       break;
3679     case 64:
3680       Opc = AArch64::FRINTZDr;
3681       break;
3682     }
3683   } else {
3684     unsigned NumElts = SrcTy.getNumElements();
3685     switch (SrcTy.getElementType().getSizeInBits()) {
3686     default:
3687       break;
3688     case 16:
3689       if (NumElts == 4)
3690         Opc = AArch64::FRINTZv4f16;
3691       else if (NumElts == 8)
3692         Opc = AArch64::FRINTZv8f16;
3693       break;
3694     case 32:
3695       if (NumElts == 2)
3696         Opc = AArch64::FRINTZv2f32;
3697       else if (NumElts == 4)
3698         Opc = AArch64::FRINTZv4f32;
3699       break;
3700     case 64:
3701       if (NumElts == 2)
3702         Opc = AArch64::FRINTZv2f64;
3703       break;
3704     }
3705   }
3706 
3707   if (!Opc) {
3708     // Didn't get an opcode above, bail.
3709     LLVM_DEBUG(dbgs() << "Unsupported type for G_INTRINSIC_TRUNC!\n");
3710     return false;
3711   }
3712 
3713   // Legalization would have set us up perfectly for this; we just need to
3714   // set the opcode and move on.
3715   I.setDesc(TII.get(Opc));
3716   return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3717 }
3718 
3719 bool AArch64InstructionSelector::selectIntrinsicRound(
3720     MachineInstr &I, MachineRegisterInfo &MRI) const {
3721   const LLT SrcTy = MRI.getType(I.getOperand(0).getReg());
3722 
3723   // Select the correct opcode.
3724   unsigned Opc = 0;
3725   if (!SrcTy.isVector()) {
3726     switch (SrcTy.getSizeInBits()) {
3727     default:
3728     case 16:
3729       Opc = AArch64::FRINTAHr;
3730       break;
3731     case 32:
3732       Opc = AArch64::FRINTASr;
3733       break;
3734     case 64:
3735       Opc = AArch64::FRINTADr;
3736       break;
3737     }
3738   } else {
3739     unsigned NumElts = SrcTy.getNumElements();
3740     switch (SrcTy.getElementType().getSizeInBits()) {
3741     default:
3742       break;
3743     case 16:
3744       if (NumElts == 4)
3745         Opc = AArch64::FRINTAv4f16;
3746       else if (NumElts == 8)
3747         Opc = AArch64::FRINTAv8f16;
3748       break;
3749     case 32:
3750       if (NumElts == 2)
3751         Opc = AArch64::FRINTAv2f32;
3752       else if (NumElts == 4)
3753         Opc = AArch64::FRINTAv4f32;
3754       break;
3755     case 64:
3756       if (NumElts == 2)
3757         Opc = AArch64::FRINTAv2f64;
3758       break;
3759     }
3760   }
3761 
3762   if (!Opc) {
3763     // Didn't get an opcode above, bail.
3764     LLVM_DEBUG(dbgs() << "Unsupported type for G_INTRINSIC_ROUND!\n");
3765     return false;
3766   }
3767 
3768   // Legalization would have set us up perfectly for this; we just need to
3769   // set the opcode and move on.
3770   I.setDesc(TII.get(Opc));
3771   return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3772 }
3773 
3774 bool AArch64InstructionSelector::selectVectorICmp(
3775     MachineInstr &I, MachineRegisterInfo &MRI) {
3776   Register DstReg = I.getOperand(0).getReg();
3777   LLT DstTy = MRI.getType(DstReg);
3778   Register SrcReg = I.getOperand(2).getReg();
3779   Register Src2Reg = I.getOperand(3).getReg();
3780   LLT SrcTy = MRI.getType(SrcReg);
3781 
3782   unsigned SrcEltSize = SrcTy.getElementType().getSizeInBits();
3783   unsigned NumElts = DstTy.getNumElements();
3784 
3785   // First index is element size, 0 == 8b, 1 == 16b, 2 == 32b, 3 == 64b
3786   // Second index is num elts, 0 == v2, 1 == v4, 2 == v8, 3 == v16
3787   // Third index is cc opcode:
3788   // 0 == eq
3789   // 1 == ugt
3790   // 2 == uge
3791   // 3 == ult
3792   // 4 == ule
3793   // 5 == sgt
3794   // 6 == sge
3795   // 7 == slt
3796   // 8 == sle
3797   // ne is done by negating 'eq' result.
3798 
3799   // This table below assumes that for some comparisons the operands will be
3800   // commuted.
3801   // ult op == commute + ugt op
3802   // ule op == commute + uge op
3803   // slt op == commute + sgt op
3804   // sle op == commute + sge op
3805   unsigned PredIdx = 0;
3806   bool SwapOperands = false;
3807   CmpInst::Predicate Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();
3808   switch (Pred) {
3809   case CmpInst::ICMP_NE:
3810   case CmpInst::ICMP_EQ:
3811     PredIdx = 0;
3812     break;
3813   case CmpInst::ICMP_UGT:
3814     PredIdx = 1;
3815     break;
3816   case CmpInst::ICMP_UGE:
3817     PredIdx = 2;
3818     break;
3819   case CmpInst::ICMP_ULT:
3820     PredIdx = 3;
3821     SwapOperands = true;
3822     break;
3823   case CmpInst::ICMP_ULE:
3824     PredIdx = 4;
3825     SwapOperands = true;
3826     break;
3827   case CmpInst::ICMP_SGT:
3828     PredIdx = 5;
3829     break;
3830   case CmpInst::ICMP_SGE:
3831     PredIdx = 6;
3832     break;
3833   case CmpInst::ICMP_SLT:
3834     PredIdx = 7;
3835     SwapOperands = true;
3836     break;
3837   case CmpInst::ICMP_SLE:
3838     PredIdx = 8;
3839     SwapOperands = true;
3840     break;
3841   default:
3842     llvm_unreachable("Unhandled icmp predicate");
3843     return false;
3844   }
3845 
3846   // This table obviously should be tablegen'd when we have our GISel native
3847   // tablegen selector.
3848 
3849   static const unsigned OpcTable[4][4][9] = {
3850       {
3851           {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3852            0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3853            0 /* invalid */},
3854           {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3855            0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3856            0 /* invalid */},
3857           {AArch64::CMEQv8i8, AArch64::CMHIv8i8, AArch64::CMHSv8i8,
3858            AArch64::CMHIv8i8, AArch64::CMHSv8i8, AArch64::CMGTv8i8,
3859            AArch64::CMGEv8i8, AArch64::CMGTv8i8, AArch64::CMGEv8i8},
3860           {AArch64::CMEQv16i8, AArch64::CMHIv16i8, AArch64::CMHSv16i8,
3861            AArch64::CMHIv16i8, AArch64::CMHSv16i8, AArch64::CMGTv16i8,
3862            AArch64::CMGEv16i8, AArch64::CMGTv16i8, AArch64::CMGEv16i8}
3863       },
3864       {
3865           {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3866            0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3867            0 /* invalid */},
3868           {AArch64::CMEQv4i16, AArch64::CMHIv4i16, AArch64::CMHSv4i16,
3869            AArch64::CMHIv4i16, AArch64::CMHSv4i16, AArch64::CMGTv4i16,
3870            AArch64::CMGEv4i16, AArch64::CMGTv4i16, AArch64::CMGEv4i16},
3871           {AArch64::CMEQv8i16, AArch64::CMHIv8i16, AArch64::CMHSv8i16,
3872            AArch64::CMHIv8i16, AArch64::CMHSv8i16, AArch64::CMGTv8i16,
3873            AArch64::CMGEv8i16, AArch64::CMGTv8i16, AArch64::CMGEv8i16},
3874           {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3875            0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3876            0 /* invalid */}
3877       },
3878       {
3879           {AArch64::CMEQv2i32, AArch64::CMHIv2i32, AArch64::CMHSv2i32,
3880            AArch64::CMHIv2i32, AArch64::CMHSv2i32, AArch64::CMGTv2i32,
3881            AArch64::CMGEv2i32, AArch64::CMGTv2i32, AArch64::CMGEv2i32},
3882           {AArch64::CMEQv4i32, AArch64::CMHIv4i32, AArch64::CMHSv4i32,
3883            AArch64::CMHIv4i32, AArch64::CMHSv4i32, AArch64::CMGTv4i32,
3884            AArch64::CMGEv4i32, AArch64::CMGTv4i32, AArch64::CMGEv4i32},
3885           {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3886            0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3887            0 /* invalid */},
3888           {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3889            0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3890            0 /* invalid */}
3891       },
3892       {
3893           {AArch64::CMEQv2i64, AArch64::CMHIv2i64, AArch64::CMHSv2i64,
3894            AArch64::CMHIv2i64, AArch64::CMHSv2i64, AArch64::CMGTv2i64,
3895            AArch64::CMGEv2i64, AArch64::CMGTv2i64, AArch64::CMGEv2i64},
3896           {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3897            0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3898            0 /* invalid */},
3899           {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3900            0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3901            0 /* invalid */},
3902           {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3903            0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3904            0 /* invalid */}
3905       },
3906   };
3907   unsigned EltIdx = Log2_32(SrcEltSize / 8);
3908   unsigned NumEltsIdx = Log2_32(NumElts / 2);
3909   unsigned Opc = OpcTable[EltIdx][NumEltsIdx][PredIdx];
3910   if (!Opc) {
3911     LLVM_DEBUG(dbgs() << "Could not map G_ICMP to cmp opcode");
3912     return false;
3913   }
3914 
3915   const RegisterBank &VecRB = *RBI.getRegBank(SrcReg, MRI, TRI);
3916   const TargetRegisterClass *SrcRC =
3917       getRegClassForTypeOnBank(SrcTy, VecRB, true);
3918   if (!SrcRC) {
3919     LLVM_DEBUG(dbgs() << "Could not determine source register class.\n");
3920     return false;
3921   }
3922 
3923   unsigned NotOpc = Pred == ICmpInst::ICMP_NE ? AArch64::NOTv8i8 : 0;
3924   if (SrcTy.getSizeInBits() == 128)
3925     NotOpc = NotOpc ? AArch64::NOTv16i8 : 0;
3926 
3927   if (SwapOperands)
3928     std::swap(SrcReg, Src2Reg);
3929 
3930   auto Cmp = MIB.buildInstr(Opc, {SrcRC}, {SrcReg, Src2Reg});
3931   constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI);
3932 
3933   // Invert if we had a 'ne' cc.
3934   if (NotOpc) {
3935     Cmp = MIB.buildInstr(NotOpc, {DstReg}, {Cmp});
3936     constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI);
3937   } else {
3938     MIB.buildCopy(DstReg, Cmp.getReg(0));
3939   }
3940   RBI.constrainGenericRegister(DstReg, *SrcRC, MRI);
3941   I.eraseFromParent();
3942   return true;
3943 }
3944 
3945 MachineInstr *AArch64InstructionSelector::emitScalarToVector(
3946     unsigned EltSize, const TargetRegisterClass *DstRC, Register Scalar,
3947     MachineIRBuilder &MIRBuilder) const {
3948   auto Undef = MIRBuilder.buildInstr(TargetOpcode::IMPLICIT_DEF, {DstRC}, {});
3949 
3950   auto BuildFn = [&](unsigned SubregIndex) {
3951     auto Ins =
3952         MIRBuilder
3953             .buildInstr(TargetOpcode::INSERT_SUBREG, {DstRC}, {Undef, Scalar})
3954             .addImm(SubregIndex);
3955     constrainSelectedInstRegOperands(*Undef, TII, TRI, RBI);
3956     constrainSelectedInstRegOperands(*Ins, TII, TRI, RBI);
3957     return &*Ins;
3958   };
3959 
3960   switch (EltSize) {
3961   case 16:
3962     return BuildFn(AArch64::hsub);
3963   case 32:
3964     return BuildFn(AArch64::ssub);
3965   case 64:
3966     return BuildFn(AArch64::dsub);
3967   default:
3968     return nullptr;
3969   }
3970 }
3971 
3972 bool AArch64InstructionSelector::selectMergeValues(
3973     MachineInstr &I, MachineRegisterInfo &MRI) {
3974   assert(I.getOpcode() == TargetOpcode::G_MERGE_VALUES && "unexpected opcode");
3975   const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
3976   const LLT SrcTy = MRI.getType(I.getOperand(1).getReg());
3977   assert(!DstTy.isVector() && !SrcTy.isVector() && "invalid merge operation");
3978   const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI);
3979 
3980   if (I.getNumOperands() != 3)
3981     return false;
3982 
3983   // Merging 2 s64s into an s128.
3984   if (DstTy == LLT::scalar(128)) {
3985     if (SrcTy.getSizeInBits() != 64)
3986       return false;
3987     Register DstReg = I.getOperand(0).getReg();
3988     Register Src1Reg = I.getOperand(1).getReg();
3989     Register Src2Reg = I.getOperand(2).getReg();
3990     auto Tmp = MIB.buildInstr(TargetOpcode::IMPLICIT_DEF, {DstTy}, {});
3991     MachineInstr *InsMI =
3992         emitLaneInsert(None, Tmp.getReg(0), Src1Reg, /* LaneIdx */ 0, RB, MIB);
3993     if (!InsMI)
3994       return false;
3995     MachineInstr *Ins2MI = emitLaneInsert(DstReg, InsMI->getOperand(0).getReg(),
3996                                           Src2Reg, /* LaneIdx */ 1, RB, MIB);
3997     if (!Ins2MI)
3998       return false;
3999     constrainSelectedInstRegOperands(*InsMI, TII, TRI, RBI);
4000     constrainSelectedInstRegOperands(*Ins2MI, TII, TRI, RBI);
4001     I.eraseFromParent();
4002     return true;
4003   }
4004 
4005   if (RB.getID() != AArch64::GPRRegBankID)
4006     return false;
4007 
4008   if (DstTy.getSizeInBits() != 64 || SrcTy.getSizeInBits() != 32)
4009     return false;
4010 
4011   auto *DstRC = &AArch64::GPR64RegClass;
4012   Register SubToRegDef = MRI.createVirtualRegister(DstRC);
4013   MachineInstr &SubRegMI = *BuildMI(*I.getParent(), I, I.getDebugLoc(),
4014                                     TII.get(TargetOpcode::SUBREG_TO_REG))
4015                                 .addDef(SubToRegDef)
4016                                 .addImm(0)
4017                                 .addUse(I.getOperand(1).getReg())
4018                                 .addImm(AArch64::sub_32);
4019   Register SubToRegDef2 = MRI.createVirtualRegister(DstRC);
4020   // Need to anyext the second scalar before we can use bfm
4021   MachineInstr &SubRegMI2 = *BuildMI(*I.getParent(), I, I.getDebugLoc(),
4022                                     TII.get(TargetOpcode::SUBREG_TO_REG))
4023                                 .addDef(SubToRegDef2)
4024                                 .addImm(0)
4025                                 .addUse(I.getOperand(2).getReg())
4026                                 .addImm(AArch64::sub_32);
4027   MachineInstr &BFM =
4028       *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::BFMXri))
4029            .addDef(I.getOperand(0).getReg())
4030            .addUse(SubToRegDef)
4031            .addUse(SubToRegDef2)
4032            .addImm(32)
4033            .addImm(31);
4034   constrainSelectedInstRegOperands(SubRegMI, TII, TRI, RBI);
4035   constrainSelectedInstRegOperands(SubRegMI2, TII, TRI, RBI);
4036   constrainSelectedInstRegOperands(BFM, TII, TRI, RBI);
4037   I.eraseFromParent();
4038   return true;
4039 }
4040 
4041 static bool getLaneCopyOpcode(unsigned &CopyOpc, unsigned &ExtractSubReg,
4042                               const unsigned EltSize) {
4043   // Choose a lane copy opcode and subregister based off of the size of the
4044   // vector's elements.
4045   switch (EltSize) {
4046   case 8:
4047     CopyOpc = AArch64::DUPi8;
4048     ExtractSubReg = AArch64::bsub;
4049     break;
4050   case 16:
4051     CopyOpc = AArch64::DUPi16;
4052     ExtractSubReg = AArch64::hsub;
4053     break;
4054   case 32:
4055     CopyOpc = AArch64::DUPi32;
4056     ExtractSubReg = AArch64::ssub;
4057     break;
4058   case 64:
4059     CopyOpc = AArch64::DUPi64;
4060     ExtractSubReg = AArch64::dsub;
4061     break;
4062   default:
4063     // Unknown size, bail out.
4064     LLVM_DEBUG(dbgs() << "Elt size '" << EltSize << "' unsupported.\n");
4065     return false;
4066   }
4067   return true;
4068 }
4069 
4070 MachineInstr *AArch64InstructionSelector::emitExtractVectorElt(
4071     Optional<Register> DstReg, const RegisterBank &DstRB, LLT ScalarTy,
4072     Register VecReg, unsigned LaneIdx, MachineIRBuilder &MIRBuilder) const {
4073   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
4074   unsigned CopyOpc = 0;
4075   unsigned ExtractSubReg = 0;
4076   if (!getLaneCopyOpcode(CopyOpc, ExtractSubReg, ScalarTy.getSizeInBits())) {
4077     LLVM_DEBUG(
4078         dbgs() << "Couldn't determine lane copy opcode for instruction.\n");
4079     return nullptr;
4080   }
4081 
4082   const TargetRegisterClass *DstRC =
4083       getRegClassForTypeOnBank(ScalarTy, DstRB, true);
4084   if (!DstRC) {
4085     LLVM_DEBUG(dbgs() << "Could not determine destination register class.\n");
4086     return nullptr;
4087   }
4088 
4089   const RegisterBank &VecRB = *RBI.getRegBank(VecReg, MRI, TRI);
4090   const LLT &VecTy = MRI.getType(VecReg);
4091   const TargetRegisterClass *VecRC =
4092       getRegClassForTypeOnBank(VecTy, VecRB, true);
4093   if (!VecRC) {
4094     LLVM_DEBUG(dbgs() << "Could not determine source register class.\n");
4095     return nullptr;
4096   }
4097 
4098   // The register that we're going to copy into.
4099   Register InsertReg = VecReg;
4100   if (!DstReg)
4101     DstReg = MRI.createVirtualRegister(DstRC);
4102   // If the lane index is 0, we just use a subregister COPY.
4103   if (LaneIdx == 0) {
4104     auto Copy = MIRBuilder.buildInstr(TargetOpcode::COPY, {*DstReg}, {})
4105                     .addReg(VecReg, 0, ExtractSubReg);
4106     RBI.constrainGenericRegister(*DstReg, *DstRC, MRI);
4107     return &*Copy;
4108   }
4109 
4110   // Lane copies require 128-bit wide registers. If we're dealing with an
4111   // unpacked vector, then we need to move up to that width. Insert an implicit
4112   // def and a subregister insert to get us there.
4113   if (VecTy.getSizeInBits() != 128) {
4114     MachineInstr *ScalarToVector = emitScalarToVector(
4115         VecTy.getSizeInBits(), &AArch64::FPR128RegClass, VecReg, MIRBuilder);
4116     if (!ScalarToVector)
4117       return nullptr;
4118     InsertReg = ScalarToVector->getOperand(0).getReg();
4119   }
4120 
4121   MachineInstr *LaneCopyMI =
4122       MIRBuilder.buildInstr(CopyOpc, {*DstReg}, {InsertReg}).addImm(LaneIdx);
4123   constrainSelectedInstRegOperands(*LaneCopyMI, TII, TRI, RBI);
4124 
4125   // Make sure that we actually constrain the initial copy.
4126   RBI.constrainGenericRegister(*DstReg, *DstRC, MRI);
4127   return LaneCopyMI;
4128 }
4129 
4130 bool AArch64InstructionSelector::selectExtractElt(
4131     MachineInstr &I, MachineRegisterInfo &MRI) {
4132   assert(I.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT &&
4133          "unexpected opcode!");
4134   Register DstReg = I.getOperand(0).getReg();
4135   const LLT NarrowTy = MRI.getType(DstReg);
4136   const Register SrcReg = I.getOperand(1).getReg();
4137   const LLT WideTy = MRI.getType(SrcReg);
4138   (void)WideTy;
4139   assert(WideTy.getSizeInBits() >= NarrowTy.getSizeInBits() &&
4140          "source register size too small!");
4141   assert(!NarrowTy.isVector() && "cannot extract vector into vector!");
4142 
4143   // Need the lane index to determine the correct copy opcode.
4144   MachineOperand &LaneIdxOp = I.getOperand(2);
4145   assert(LaneIdxOp.isReg() && "Lane index operand was not a register?");
4146 
4147   if (RBI.getRegBank(DstReg, MRI, TRI)->getID() != AArch64::FPRRegBankID) {
4148     LLVM_DEBUG(dbgs() << "Cannot extract into GPR.\n");
4149     return false;
4150   }
4151 
4152   // Find the index to extract from.
4153   auto VRegAndVal = getIConstantVRegValWithLookThrough(LaneIdxOp.getReg(), MRI);
4154   if (!VRegAndVal)
4155     return false;
4156   unsigned LaneIdx = VRegAndVal->Value.getSExtValue();
4157 
4158 
4159   const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
4160   MachineInstr *Extract = emitExtractVectorElt(DstReg, DstRB, NarrowTy, SrcReg,
4161                                                LaneIdx, MIB);
4162   if (!Extract)
4163     return false;
4164 
4165   I.eraseFromParent();
4166   return true;
4167 }
4168 
4169 bool AArch64InstructionSelector::selectSplitVectorUnmerge(
4170     MachineInstr &I, MachineRegisterInfo &MRI) {
4171   unsigned NumElts = I.getNumOperands() - 1;
4172   Register SrcReg = I.getOperand(NumElts).getReg();
4173   const LLT NarrowTy = MRI.getType(I.getOperand(0).getReg());
4174   const LLT SrcTy = MRI.getType(SrcReg);
4175 
4176   assert(NarrowTy.isVector() && "Expected an unmerge into vectors");
4177   if (SrcTy.getSizeInBits() > 128) {
4178     LLVM_DEBUG(dbgs() << "Unexpected vector type for vec split unmerge");
4179     return false;
4180   }
4181 
4182   // We implement a split vector operation by treating the sub-vectors as
4183   // scalars and extracting them.
4184   const RegisterBank &DstRB =
4185       *RBI.getRegBank(I.getOperand(0).getReg(), MRI, TRI);
4186   for (unsigned OpIdx = 0; OpIdx < NumElts; ++OpIdx) {
4187     Register Dst = I.getOperand(OpIdx).getReg();
4188     MachineInstr *Extract =
4189         emitExtractVectorElt(Dst, DstRB, NarrowTy, SrcReg, OpIdx, MIB);
4190     if (!Extract)
4191       return false;
4192   }
4193   I.eraseFromParent();
4194   return true;
4195 }
4196 
4197 bool AArch64InstructionSelector::selectUnmergeValues(MachineInstr &I,
4198                                                      MachineRegisterInfo &MRI) {
4199   assert(I.getOpcode() == TargetOpcode::G_UNMERGE_VALUES &&
4200          "unexpected opcode");
4201 
4202   // TODO: Handle unmerging into GPRs and from scalars to scalars.
4203   if (RBI.getRegBank(I.getOperand(0).getReg(), MRI, TRI)->getID() !=
4204           AArch64::FPRRegBankID ||
4205       RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI)->getID() !=
4206           AArch64::FPRRegBankID) {
4207     LLVM_DEBUG(dbgs() << "Unmerging vector-to-gpr and scalar-to-scalar "
4208                          "currently unsupported.\n");
4209     return false;
4210   }
4211 
4212   // The last operand is the vector source register, and every other operand is
4213   // a register to unpack into.
4214   unsigned NumElts = I.getNumOperands() - 1;
4215   Register SrcReg = I.getOperand(NumElts).getReg();
4216   const LLT NarrowTy = MRI.getType(I.getOperand(0).getReg());
4217   const LLT WideTy = MRI.getType(SrcReg);
4218   (void)WideTy;
4219   assert((WideTy.isVector() || WideTy.getSizeInBits() == 128) &&
4220          "can only unmerge from vector or s128 types!");
4221   assert(WideTy.getSizeInBits() > NarrowTy.getSizeInBits() &&
4222          "source register size too small!");
4223 
4224   if (!NarrowTy.isScalar())
4225     return selectSplitVectorUnmerge(I, MRI);
4226 
4227   // Choose a lane copy opcode and subregister based off of the size of the
4228   // vector's elements.
4229   unsigned CopyOpc = 0;
4230   unsigned ExtractSubReg = 0;
4231   if (!getLaneCopyOpcode(CopyOpc, ExtractSubReg, NarrowTy.getSizeInBits()))
4232     return false;
4233 
4234   // Set up for the lane copies.
4235   MachineBasicBlock &MBB = *I.getParent();
4236 
4237   // Stores the registers we'll be copying from.
4238   SmallVector<Register, 4> InsertRegs;
4239 
4240   // We'll use the first register twice, so we only need NumElts-1 registers.
4241   unsigned NumInsertRegs = NumElts - 1;
4242 
4243   // If our elements fit into exactly 128 bits, then we can copy from the source
4244   // directly. Otherwise, we need to do a bit of setup with some subregister
4245   // inserts.
4246   if (NarrowTy.getSizeInBits() * NumElts == 128) {
4247     InsertRegs = SmallVector<Register, 4>(NumInsertRegs, SrcReg);
4248   } else {
4249     // No. We have to perform subregister inserts. For each insert, create an
4250     // implicit def and a subregister insert, and save the register we create.
4251     const TargetRegisterClass *RC = getRegClassForTypeOnBank(
4252         LLT::fixed_vector(NumElts, WideTy.getScalarSizeInBits()),
4253         *RBI.getRegBank(SrcReg, MRI, TRI));
4254     unsigned SubReg = 0;
4255     bool Found = getSubRegForClass(RC, TRI, SubReg);
4256     (void)Found;
4257     assert(Found && "expected to find last operand's subeg idx");
4258     for (unsigned Idx = 0; Idx < NumInsertRegs; ++Idx) {
4259       Register ImpDefReg = MRI.createVirtualRegister(&AArch64::FPR128RegClass);
4260       MachineInstr &ImpDefMI =
4261           *BuildMI(MBB, I, I.getDebugLoc(), TII.get(TargetOpcode::IMPLICIT_DEF),
4262                    ImpDefReg);
4263 
4264       // Now, create the subregister insert from SrcReg.
4265       Register InsertReg = MRI.createVirtualRegister(&AArch64::FPR128RegClass);
4266       MachineInstr &InsMI =
4267           *BuildMI(MBB, I, I.getDebugLoc(),
4268                    TII.get(TargetOpcode::INSERT_SUBREG), InsertReg)
4269                .addUse(ImpDefReg)
4270                .addUse(SrcReg)
4271                .addImm(SubReg);
4272 
4273       constrainSelectedInstRegOperands(ImpDefMI, TII, TRI, RBI);
4274       constrainSelectedInstRegOperands(InsMI, TII, TRI, RBI);
4275 
4276       // Save the register so that we can copy from it after.
4277       InsertRegs.push_back(InsertReg);
4278     }
4279   }
4280 
4281   // Now that we've created any necessary subregister inserts, we can
4282   // create the copies.
4283   //
4284   // Perform the first copy separately as a subregister copy.
4285   Register CopyTo = I.getOperand(0).getReg();
4286   auto FirstCopy = MIB.buildInstr(TargetOpcode::COPY, {CopyTo}, {})
4287                        .addReg(InsertRegs[0], 0, ExtractSubReg);
4288   constrainSelectedInstRegOperands(*FirstCopy, TII, TRI, RBI);
4289 
4290   // Now, perform the remaining copies as vector lane copies.
4291   unsigned LaneIdx = 1;
4292   for (Register InsReg : InsertRegs) {
4293     Register CopyTo = I.getOperand(LaneIdx).getReg();
4294     MachineInstr &CopyInst =
4295         *BuildMI(MBB, I, I.getDebugLoc(), TII.get(CopyOpc), CopyTo)
4296              .addUse(InsReg)
4297              .addImm(LaneIdx);
4298     constrainSelectedInstRegOperands(CopyInst, TII, TRI, RBI);
4299     ++LaneIdx;
4300   }
4301 
4302   // Separately constrain the first copy's destination. Because of the
4303   // limitation in constrainOperandRegClass, we can't guarantee that this will
4304   // actually be constrained. So, do it ourselves using the second operand.
4305   const TargetRegisterClass *RC =
4306       MRI.getRegClassOrNull(I.getOperand(1).getReg());
4307   if (!RC) {
4308     LLVM_DEBUG(dbgs() << "Couldn't constrain copy destination.\n");
4309     return false;
4310   }
4311 
4312   RBI.constrainGenericRegister(CopyTo, *RC, MRI);
4313   I.eraseFromParent();
4314   return true;
4315 }
4316 
4317 bool AArch64InstructionSelector::selectConcatVectors(
4318     MachineInstr &I, MachineRegisterInfo &MRI)  {
4319   assert(I.getOpcode() == TargetOpcode::G_CONCAT_VECTORS &&
4320          "Unexpected opcode");
4321   Register Dst = I.getOperand(0).getReg();
4322   Register Op1 = I.getOperand(1).getReg();
4323   Register Op2 = I.getOperand(2).getReg();
4324   MachineInstr *ConcatMI = emitVectorConcat(Dst, Op1, Op2, MIB);
4325   if (!ConcatMI)
4326     return false;
4327   I.eraseFromParent();
4328   return true;
4329 }
4330 
4331 unsigned
4332 AArch64InstructionSelector::emitConstantPoolEntry(const Constant *CPVal,
4333                                                   MachineFunction &MF) const {
4334   Type *CPTy = CPVal->getType();
4335   Align Alignment = MF.getDataLayout().getPrefTypeAlign(CPTy);
4336 
4337   MachineConstantPool *MCP = MF.getConstantPool();
4338   return MCP->getConstantPoolIndex(CPVal, Alignment);
4339 }
4340 
4341 MachineInstr *AArch64InstructionSelector::emitLoadFromConstantPool(
4342     const Constant *CPVal, MachineIRBuilder &MIRBuilder) const {
4343   auto &MF = MIRBuilder.getMF();
4344   unsigned CPIdx = emitConstantPoolEntry(CPVal, MF);
4345 
4346   auto Adrp =
4347       MIRBuilder.buildInstr(AArch64::ADRP, {&AArch64::GPR64RegClass}, {})
4348           .addConstantPoolIndex(CPIdx, 0, AArch64II::MO_PAGE);
4349 
4350   MachineInstr *LoadMI = nullptr;
4351   MachinePointerInfo PtrInfo = MachinePointerInfo::getConstantPool(MF);
4352   unsigned Size = MIRBuilder.getDataLayout().getTypeStoreSize(CPVal->getType());
4353   switch (Size) {
4354   case 16:
4355     LoadMI =
4356         &*MIRBuilder
4357               .buildInstr(AArch64::LDRQui, {&AArch64::FPR128RegClass}, {Adrp})
4358               .addConstantPoolIndex(CPIdx, 0,
4359                                     AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
4360     break;
4361   case 8:
4362     LoadMI =
4363         &*MIRBuilder
4364               .buildInstr(AArch64::LDRDui, {&AArch64::FPR64RegClass}, {Adrp})
4365               .addConstantPoolIndex(CPIdx, 0,
4366                                     AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
4367     break;
4368   case 4:
4369     LoadMI =
4370         &*MIRBuilder
4371               .buildInstr(AArch64::LDRSui, {&AArch64::FPR32RegClass}, {Adrp})
4372               .addConstantPoolIndex(CPIdx, 0,
4373                                     AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
4374     break;
4375   case 2:
4376     LoadMI =
4377         &*MIRBuilder
4378               .buildInstr(AArch64::LDRHui, {&AArch64::FPR16RegClass}, {Adrp})
4379               .addConstantPoolIndex(CPIdx, 0,
4380                                     AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
4381     break;
4382   default:
4383     LLVM_DEBUG(dbgs() << "Could not load from constant pool of type "
4384                       << *CPVal->getType());
4385     return nullptr;
4386   }
4387   LoadMI->addMemOperand(MF, MF.getMachineMemOperand(PtrInfo,
4388                                                     MachineMemOperand::MOLoad,
4389                                                     Size, Align(Size)));
4390   constrainSelectedInstRegOperands(*Adrp, TII, TRI, RBI);
4391   constrainSelectedInstRegOperands(*LoadMI, TII, TRI, RBI);
4392   return LoadMI;
4393 }
4394 
4395 /// Return an <Opcode, SubregIndex> pair to do an vector elt insert of a given
4396 /// size and RB.
4397 static std::pair<unsigned, unsigned>
4398 getInsertVecEltOpInfo(const RegisterBank &RB, unsigned EltSize) {
4399   unsigned Opc, SubregIdx;
4400   if (RB.getID() == AArch64::GPRRegBankID) {
4401     if (EltSize == 16) {
4402       Opc = AArch64::INSvi16gpr;
4403       SubregIdx = AArch64::ssub;
4404     } else if (EltSize == 32) {
4405       Opc = AArch64::INSvi32gpr;
4406       SubregIdx = AArch64::ssub;
4407     } else if (EltSize == 64) {
4408       Opc = AArch64::INSvi64gpr;
4409       SubregIdx = AArch64::dsub;
4410     } else {
4411       llvm_unreachable("invalid elt size!");
4412     }
4413   } else {
4414     if (EltSize == 8) {
4415       Opc = AArch64::INSvi8lane;
4416       SubregIdx = AArch64::bsub;
4417     } else if (EltSize == 16) {
4418       Opc = AArch64::INSvi16lane;
4419       SubregIdx = AArch64::hsub;
4420     } else if (EltSize == 32) {
4421       Opc = AArch64::INSvi32lane;
4422       SubregIdx = AArch64::ssub;
4423     } else if (EltSize == 64) {
4424       Opc = AArch64::INSvi64lane;
4425       SubregIdx = AArch64::dsub;
4426     } else {
4427       llvm_unreachable("invalid elt size!");
4428     }
4429   }
4430   return std::make_pair(Opc, SubregIdx);
4431 }
4432 
4433 MachineInstr *AArch64InstructionSelector::emitInstr(
4434     unsigned Opcode, std::initializer_list<llvm::DstOp> DstOps,
4435     std::initializer_list<llvm::SrcOp> SrcOps, MachineIRBuilder &MIRBuilder,
4436     const ComplexRendererFns &RenderFns) const {
4437   assert(Opcode && "Expected an opcode?");
4438   assert(!isPreISelGenericOpcode(Opcode) &&
4439          "Function should only be used to produce selected instructions!");
4440   auto MI = MIRBuilder.buildInstr(Opcode, DstOps, SrcOps);
4441   if (RenderFns)
4442     for (auto &Fn : *RenderFns)
4443       Fn(MI);
4444   constrainSelectedInstRegOperands(*MI, TII, TRI, RBI);
4445   return &*MI;
4446 }
4447 
4448 MachineInstr *AArch64InstructionSelector::emitAddSub(
4449     const std::array<std::array<unsigned, 2>, 5> &AddrModeAndSizeToOpcode,
4450     Register Dst, MachineOperand &LHS, MachineOperand &RHS,
4451     MachineIRBuilder &MIRBuilder) const {
4452   MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4453   assert(LHS.isReg() && RHS.isReg() && "Expected register operands?");
4454   auto Ty = MRI.getType(LHS.getReg());
4455   assert(!Ty.isVector() && "Expected a scalar or pointer?");
4456   unsigned Size = Ty.getSizeInBits();
4457   assert((Size == 32 || Size == 64) && "Expected a 32-bit or 64-bit type only");
4458   bool Is32Bit = Size == 32;
4459 
4460   // INSTRri form with positive arithmetic immediate.
4461   if (auto Fns = selectArithImmed(RHS))
4462     return emitInstr(AddrModeAndSizeToOpcode[0][Is32Bit], {Dst}, {LHS},
4463                      MIRBuilder, Fns);
4464 
4465   // INSTRri form with negative arithmetic immediate.
4466   if (auto Fns = selectNegArithImmed(RHS))
4467     return emitInstr(AddrModeAndSizeToOpcode[3][Is32Bit], {Dst}, {LHS},
4468                      MIRBuilder, Fns);
4469 
4470   // INSTRrx form.
4471   if (auto Fns = selectArithExtendedRegister(RHS))
4472     return emitInstr(AddrModeAndSizeToOpcode[4][Is32Bit], {Dst}, {LHS},
4473                      MIRBuilder, Fns);
4474 
4475   // INSTRrs form.
4476   if (auto Fns = selectShiftedRegister(RHS))
4477     return emitInstr(AddrModeAndSizeToOpcode[1][Is32Bit], {Dst}, {LHS},
4478                      MIRBuilder, Fns);
4479   return emitInstr(AddrModeAndSizeToOpcode[2][Is32Bit], {Dst}, {LHS, RHS},
4480                    MIRBuilder);
4481 }
4482 
4483 MachineInstr *
4484 AArch64InstructionSelector::emitADD(Register DefReg, MachineOperand &LHS,
4485                                     MachineOperand &RHS,
4486                                     MachineIRBuilder &MIRBuilder) const {
4487   const std::array<std::array<unsigned, 2>, 5> OpcTable{
4488       {{AArch64::ADDXri, AArch64::ADDWri},
4489        {AArch64::ADDXrs, AArch64::ADDWrs},
4490        {AArch64::ADDXrr, AArch64::ADDWrr},
4491        {AArch64::SUBXri, AArch64::SUBWri},
4492        {AArch64::ADDXrx, AArch64::ADDWrx}}};
4493   return emitAddSub(OpcTable, DefReg, LHS, RHS, MIRBuilder);
4494 }
4495 
4496 MachineInstr *
4497 AArch64InstructionSelector::emitADDS(Register Dst, MachineOperand &LHS,
4498                                      MachineOperand &RHS,
4499                                      MachineIRBuilder &MIRBuilder) const {
4500   const std::array<std::array<unsigned, 2>, 5> OpcTable{
4501       {{AArch64::ADDSXri, AArch64::ADDSWri},
4502        {AArch64::ADDSXrs, AArch64::ADDSWrs},
4503        {AArch64::ADDSXrr, AArch64::ADDSWrr},
4504        {AArch64::SUBSXri, AArch64::SUBSWri},
4505        {AArch64::ADDSXrx, AArch64::ADDSWrx}}};
4506   return emitAddSub(OpcTable, Dst, LHS, RHS, MIRBuilder);
4507 }
4508 
4509 MachineInstr *
4510 AArch64InstructionSelector::emitSUBS(Register Dst, MachineOperand &LHS,
4511                                      MachineOperand &RHS,
4512                                      MachineIRBuilder &MIRBuilder) const {
4513   const std::array<std::array<unsigned, 2>, 5> OpcTable{
4514       {{AArch64::SUBSXri, AArch64::SUBSWri},
4515        {AArch64::SUBSXrs, AArch64::SUBSWrs},
4516        {AArch64::SUBSXrr, AArch64::SUBSWrr},
4517        {AArch64::ADDSXri, AArch64::ADDSWri},
4518        {AArch64::SUBSXrx, AArch64::SUBSWrx}}};
4519   return emitAddSub(OpcTable, Dst, LHS, RHS, MIRBuilder);
4520 }
4521 
4522 MachineInstr *
4523 AArch64InstructionSelector::emitCMN(MachineOperand &LHS, MachineOperand &RHS,
4524                                     MachineIRBuilder &MIRBuilder) const {
4525   MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4526   bool Is32Bit = (MRI.getType(LHS.getReg()).getSizeInBits() == 32);
4527   auto RC = Is32Bit ? &AArch64::GPR32RegClass : &AArch64::GPR64RegClass;
4528   return emitADDS(MRI.createVirtualRegister(RC), LHS, RHS, MIRBuilder);
4529 }
4530 
4531 MachineInstr *
4532 AArch64InstructionSelector::emitTST(MachineOperand &LHS, MachineOperand &RHS,
4533                                     MachineIRBuilder &MIRBuilder) const {
4534   assert(LHS.isReg() && RHS.isReg() && "Expected register operands?");
4535   MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4536   LLT Ty = MRI.getType(LHS.getReg());
4537   unsigned RegSize = Ty.getSizeInBits();
4538   bool Is32Bit = (RegSize == 32);
4539   const unsigned OpcTable[3][2] = {{AArch64::ANDSXri, AArch64::ANDSWri},
4540                                    {AArch64::ANDSXrs, AArch64::ANDSWrs},
4541                                    {AArch64::ANDSXrr, AArch64::ANDSWrr}};
4542   // ANDS needs a logical immediate for its immediate form. Check if we can
4543   // fold one in.
4544   if (auto ValAndVReg = getIConstantVRegValWithLookThrough(RHS.getReg(), MRI)) {
4545     int64_t Imm = ValAndVReg->Value.getSExtValue();
4546 
4547     if (AArch64_AM::isLogicalImmediate(Imm, RegSize)) {
4548       auto TstMI = MIRBuilder.buildInstr(OpcTable[0][Is32Bit], {Ty}, {LHS});
4549       TstMI.addImm(AArch64_AM::encodeLogicalImmediate(Imm, RegSize));
4550       constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI);
4551       return &*TstMI;
4552     }
4553   }
4554 
4555   if (auto Fns = selectLogicalShiftedRegister(RHS))
4556     return emitInstr(OpcTable[1][Is32Bit], {Ty}, {LHS}, MIRBuilder, Fns);
4557   return emitInstr(OpcTable[2][Is32Bit], {Ty}, {LHS, RHS}, MIRBuilder);
4558 }
4559 
4560 MachineInstr *AArch64InstructionSelector::emitIntegerCompare(
4561     MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate,
4562     MachineIRBuilder &MIRBuilder) const {
4563   assert(LHS.isReg() && RHS.isReg() && "Expected LHS and RHS to be registers!");
4564   assert(Predicate.isPredicate() && "Expected predicate?");
4565   MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4566   LLT CmpTy = MRI.getType(LHS.getReg());
4567   assert(!CmpTy.isVector() && "Expected scalar or pointer");
4568   unsigned Size = CmpTy.getSizeInBits();
4569   (void)Size;
4570   assert((Size == 32 || Size == 64) && "Expected a 32-bit or 64-bit LHS/RHS?");
4571   // Fold the compare into a cmn or tst if possible.
4572   if (auto FoldCmp = tryFoldIntegerCompare(LHS, RHS, Predicate, MIRBuilder))
4573     return FoldCmp;
4574   auto Dst = MRI.cloneVirtualRegister(LHS.getReg());
4575   return emitSUBS(Dst, LHS, RHS, MIRBuilder);
4576 }
4577 
4578 MachineInstr *AArch64InstructionSelector::emitCSetForFCmp(
4579     Register Dst, CmpInst::Predicate Pred, MachineIRBuilder &MIRBuilder) const {
4580   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
4581 #ifndef NDEBUG
4582   LLT Ty = MRI.getType(Dst);
4583   assert(!Ty.isVector() && Ty.getSizeInBits() == 32 &&
4584          "Expected a 32-bit scalar register?");
4585 #endif
4586   const Register ZReg = AArch64::WZR;
4587   AArch64CC::CondCode CC1, CC2;
4588   changeFCMPPredToAArch64CC(Pred, CC1, CC2);
4589   auto InvCC1 = AArch64CC::getInvertedCondCode(CC1);
4590   if (CC2 == AArch64CC::AL)
4591     return emitCSINC(/*Dst=*/Dst, /*Src1=*/ZReg, /*Src2=*/ZReg, InvCC1,
4592                      MIRBuilder);
4593   const TargetRegisterClass *RC = &AArch64::GPR32RegClass;
4594   Register Def1Reg = MRI.createVirtualRegister(RC);
4595   Register Def2Reg = MRI.createVirtualRegister(RC);
4596   auto InvCC2 = AArch64CC::getInvertedCondCode(CC2);
4597   emitCSINC(/*Dst=*/Def1Reg, /*Src1=*/ZReg, /*Src2=*/ZReg, InvCC1, MIRBuilder);
4598   emitCSINC(/*Dst=*/Def2Reg, /*Src1=*/ZReg, /*Src2=*/ZReg, InvCC2, MIRBuilder);
4599   auto OrMI = MIRBuilder.buildInstr(AArch64::ORRWrr, {Dst}, {Def1Reg, Def2Reg});
4600   constrainSelectedInstRegOperands(*OrMI, TII, TRI, RBI);
4601   return &*OrMI;
4602 }
4603 
4604 MachineInstr *
4605 AArch64InstructionSelector::emitFPCompare(Register LHS, Register RHS,
4606                                           MachineIRBuilder &MIRBuilder,
4607                                           Optional<CmpInst::Predicate> Pred) const {
4608   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
4609   LLT Ty = MRI.getType(LHS);
4610   if (Ty.isVector())
4611     return nullptr;
4612   unsigned OpSize = Ty.getSizeInBits();
4613   if (OpSize != 32 && OpSize != 64)
4614     return nullptr;
4615 
4616   // If this is a compare against +0.0, then we don't have
4617   // to explicitly materialize a constant.
4618   const ConstantFP *FPImm = getConstantFPVRegVal(RHS, MRI);
4619   bool ShouldUseImm = FPImm && (FPImm->isZero() && !FPImm->isNegative());
4620 
4621   auto IsEqualityPred = [](CmpInst::Predicate P) {
4622     return P == CmpInst::FCMP_OEQ || P == CmpInst::FCMP_ONE ||
4623            P == CmpInst::FCMP_UEQ || P == CmpInst::FCMP_UNE;
4624   };
4625   if (!ShouldUseImm && Pred && IsEqualityPred(*Pred)) {
4626     // Try commutating the operands.
4627     const ConstantFP *LHSImm = getConstantFPVRegVal(LHS, MRI);
4628     if (LHSImm && (LHSImm->isZero() && !LHSImm->isNegative())) {
4629       ShouldUseImm = true;
4630       std::swap(LHS, RHS);
4631     }
4632   }
4633   unsigned CmpOpcTbl[2][2] = {{AArch64::FCMPSrr, AArch64::FCMPDrr},
4634                               {AArch64::FCMPSri, AArch64::FCMPDri}};
4635   unsigned CmpOpc = CmpOpcTbl[ShouldUseImm][OpSize == 64];
4636 
4637   // Partially build the compare. Decide if we need to add a use for the
4638   // third operand based off whether or not we're comparing against 0.0.
4639   auto CmpMI = MIRBuilder.buildInstr(CmpOpc).addUse(LHS);
4640   CmpMI.setMIFlags(MachineInstr::NoFPExcept);
4641   if (!ShouldUseImm)
4642     CmpMI.addUse(RHS);
4643   constrainSelectedInstRegOperands(*CmpMI, TII, TRI, RBI);
4644   return &*CmpMI;
4645 }
4646 
4647 MachineInstr *AArch64InstructionSelector::emitVectorConcat(
4648     Optional<Register> Dst, Register Op1, Register Op2,
4649     MachineIRBuilder &MIRBuilder) const {
4650   // We implement a vector concat by:
4651   // 1. Use scalar_to_vector to insert the lower vector into the larger dest
4652   // 2. Insert the upper vector into the destination's upper element
4653   // TODO: some of this code is common with G_BUILD_VECTOR handling.
4654   MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4655 
4656   const LLT Op1Ty = MRI.getType(Op1);
4657   const LLT Op2Ty = MRI.getType(Op2);
4658 
4659   if (Op1Ty != Op2Ty) {
4660     LLVM_DEBUG(dbgs() << "Could not do vector concat of differing vector tys");
4661     return nullptr;
4662   }
4663   assert(Op1Ty.isVector() && "Expected a vector for vector concat");
4664 
4665   if (Op1Ty.getSizeInBits() >= 128) {
4666     LLVM_DEBUG(dbgs() << "Vector concat not supported for full size vectors");
4667     return nullptr;
4668   }
4669 
4670   // At the moment we just support 64 bit vector concats.
4671   if (Op1Ty.getSizeInBits() != 64) {
4672     LLVM_DEBUG(dbgs() << "Vector concat supported for 64b vectors");
4673     return nullptr;
4674   }
4675 
4676   const LLT ScalarTy = LLT::scalar(Op1Ty.getSizeInBits());
4677   const RegisterBank &FPRBank = *RBI.getRegBank(Op1, MRI, TRI);
4678   const TargetRegisterClass *DstRC =
4679       getRegClassForTypeOnBank(Op1Ty.multiplyElements(2), FPRBank);
4680 
4681   MachineInstr *WidenedOp1 =
4682       emitScalarToVector(ScalarTy.getSizeInBits(), DstRC, Op1, MIRBuilder);
4683   MachineInstr *WidenedOp2 =
4684       emitScalarToVector(ScalarTy.getSizeInBits(), DstRC, Op2, MIRBuilder);
4685   if (!WidenedOp1 || !WidenedOp2) {
4686     LLVM_DEBUG(dbgs() << "Could not emit a vector from scalar value");
4687     return nullptr;
4688   }
4689 
4690   // Now do the insert of the upper element.
4691   unsigned InsertOpc, InsSubRegIdx;
4692   std::tie(InsertOpc, InsSubRegIdx) =
4693       getInsertVecEltOpInfo(FPRBank, ScalarTy.getSizeInBits());
4694 
4695   if (!Dst)
4696     Dst = MRI.createVirtualRegister(DstRC);
4697   auto InsElt =
4698       MIRBuilder
4699           .buildInstr(InsertOpc, {*Dst}, {WidenedOp1->getOperand(0).getReg()})
4700           .addImm(1) /* Lane index */
4701           .addUse(WidenedOp2->getOperand(0).getReg())
4702           .addImm(0);
4703   constrainSelectedInstRegOperands(*InsElt, TII, TRI, RBI);
4704   return &*InsElt;
4705 }
4706 
4707 MachineInstr *
4708 AArch64InstructionSelector::emitCSINC(Register Dst, Register Src1,
4709                                       Register Src2, AArch64CC::CondCode Pred,
4710                                       MachineIRBuilder &MIRBuilder) const {
4711   auto &MRI = *MIRBuilder.getMRI();
4712   const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Dst);
4713   // If we used a register class, then this won't necessarily have an LLT.
4714   // Compute the size based off whether or not we have a class or bank.
4715   unsigned Size;
4716   if (const auto *RC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>())
4717     Size = TRI.getRegSizeInBits(*RC);
4718   else
4719     Size = MRI.getType(Dst).getSizeInBits();
4720   // Some opcodes use s1.
4721   assert(Size <= 64 && "Expected 64 bits or less only!");
4722   static const unsigned OpcTable[2] = {AArch64::CSINCWr, AArch64::CSINCXr};
4723   unsigned Opc = OpcTable[Size == 64];
4724   auto CSINC = MIRBuilder.buildInstr(Opc, {Dst}, {Src1, Src2}).addImm(Pred);
4725   constrainSelectedInstRegOperands(*CSINC, TII, TRI, RBI);
4726   return &*CSINC;
4727 }
4728 
4729 std::pair<MachineInstr *, AArch64CC::CondCode>
4730 AArch64InstructionSelector::emitOverflowOp(unsigned Opcode, Register Dst,
4731                                            MachineOperand &LHS,
4732                                            MachineOperand &RHS,
4733                                            MachineIRBuilder &MIRBuilder) const {
4734   switch (Opcode) {
4735   default:
4736     llvm_unreachable("Unexpected opcode!");
4737   case TargetOpcode::G_SADDO:
4738     return std::make_pair(emitADDS(Dst, LHS, RHS, MIRBuilder), AArch64CC::VS);
4739   case TargetOpcode::G_UADDO:
4740     return std::make_pair(emitADDS(Dst, LHS, RHS, MIRBuilder), AArch64CC::HS);
4741   case TargetOpcode::G_SSUBO:
4742     return std::make_pair(emitSUBS(Dst, LHS, RHS, MIRBuilder), AArch64CC::VS);
4743   case TargetOpcode::G_USUBO:
4744     return std::make_pair(emitSUBS(Dst, LHS, RHS, MIRBuilder), AArch64CC::LO);
4745   }
4746 }
4747 
4748 /// Returns true if @p Val is a tree of AND/OR/CMP operations that can be
4749 /// expressed as a conjunction.
4750 /// \param CanNegate    Set to true if we can negate the whole sub-tree just by
4751 ///                     changing the conditions on the CMP tests.
4752 ///                     (this means we can call emitConjunctionRec() with
4753 ///                      Negate==true on this sub-tree)
4754 /// \param MustBeFirst  Set to true if this subtree needs to be negated and we
4755 ///                     cannot do the negation naturally. We are required to
4756 ///                     emit the subtree first in this case.
4757 /// \param WillNegate   Is true if are called when the result of this
4758 ///                     subexpression must be negated. This happens when the
4759 ///                     outer expression is an OR. We can use this fact to know
4760 ///                     that we have a double negation (or (or ...) ...) that
4761 ///                     can be implemented for free.
4762 static bool canEmitConjunction(Register Val, bool &CanNegate, bool &MustBeFirst,
4763                                bool WillNegate, MachineRegisterInfo &MRI,
4764                                unsigned Depth = 0) {
4765   if (!MRI.hasOneNonDBGUse(Val))
4766     return false;
4767   MachineInstr *ValDef = MRI.getVRegDef(Val);
4768   unsigned Opcode = ValDef->getOpcode();
4769   if (isa<GAnyCmp>(ValDef)) {
4770     CanNegate = true;
4771     MustBeFirst = false;
4772     return true;
4773   }
4774   // Protect against exponential runtime and stack overflow.
4775   if (Depth > 6)
4776     return false;
4777   if (Opcode == TargetOpcode::G_AND || Opcode == TargetOpcode::G_OR) {
4778     bool IsOR = Opcode == TargetOpcode::G_OR;
4779     Register O0 = ValDef->getOperand(1).getReg();
4780     Register O1 = ValDef->getOperand(2).getReg();
4781     bool CanNegateL;
4782     bool MustBeFirstL;
4783     if (!canEmitConjunction(O0, CanNegateL, MustBeFirstL, IsOR, MRI, Depth + 1))
4784       return false;
4785     bool CanNegateR;
4786     bool MustBeFirstR;
4787     if (!canEmitConjunction(O1, CanNegateR, MustBeFirstR, IsOR, MRI, Depth + 1))
4788       return false;
4789 
4790     if (MustBeFirstL && MustBeFirstR)
4791       return false;
4792 
4793     if (IsOR) {
4794       // For an OR expression we need to be able to naturally negate at least
4795       // one side or we cannot do the transformation at all.
4796       if (!CanNegateL && !CanNegateR)
4797         return false;
4798       // If we the result of the OR will be negated and we can naturally negate
4799       // the leaves, then this sub-tree as a whole negates naturally.
4800       CanNegate = WillNegate && CanNegateL && CanNegateR;
4801       // If we cannot naturally negate the whole sub-tree, then this must be
4802       // emitted first.
4803       MustBeFirst = !CanNegate;
4804     } else {
4805       assert(Opcode == TargetOpcode::G_AND && "Must be G_AND");
4806       // We cannot naturally negate an AND operation.
4807       CanNegate = false;
4808       MustBeFirst = MustBeFirstL || MustBeFirstR;
4809     }
4810     return true;
4811   }
4812   return false;
4813 }
4814 
4815 MachineInstr *AArch64InstructionSelector::emitConditionalComparison(
4816     Register LHS, Register RHS, CmpInst::Predicate CC,
4817     AArch64CC::CondCode Predicate, AArch64CC::CondCode OutCC,
4818     MachineIRBuilder &MIB) const {
4819   // TODO: emit CMN as an optimization.
4820   auto &MRI = *MIB.getMRI();
4821   LLT OpTy = MRI.getType(LHS);
4822   assert(OpTy.getSizeInBits() == 32 || OpTy.getSizeInBits() == 64);
4823   unsigned CCmpOpc;
4824   if (CmpInst::isIntPredicate(CC)) {
4825     CCmpOpc = OpTy.getSizeInBits() == 32 ? AArch64::CCMPWr : AArch64::CCMPXr;
4826   } else {
4827     switch (OpTy.getSizeInBits()) {
4828     case 16:
4829       CCmpOpc = AArch64::FCCMPHrr;
4830       break;
4831     case 32:
4832       CCmpOpc = AArch64::FCCMPSrr;
4833       break;
4834     case 64:
4835       CCmpOpc = AArch64::FCCMPDrr;
4836       break;
4837     default:
4838       return nullptr;
4839     }
4840   }
4841   AArch64CC::CondCode InvOutCC = AArch64CC::getInvertedCondCode(OutCC);
4842   unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(InvOutCC);
4843   auto CCmp =
4844       MIB.buildInstr(CCmpOpc, {}, {LHS, RHS}).addImm(NZCV).addImm(Predicate);
4845   constrainSelectedInstRegOperands(*CCmp, TII, TRI, RBI);
4846   return &*CCmp;
4847 }
4848 
4849 MachineInstr *AArch64InstructionSelector::emitConjunctionRec(
4850     Register Val, AArch64CC::CondCode &OutCC, bool Negate, Register CCOp,
4851     AArch64CC::CondCode Predicate, MachineIRBuilder &MIB) const {
4852   // We're at a tree leaf, produce a conditional comparison operation.
4853   auto &MRI = *MIB.getMRI();
4854   MachineInstr *ValDef = MRI.getVRegDef(Val);
4855   unsigned Opcode = ValDef->getOpcode();
4856   if (auto *Cmp = dyn_cast<GAnyCmp>(ValDef)) {
4857     Register LHS = Cmp->getLHSReg();
4858     Register RHS = Cmp->getRHSReg();
4859     CmpInst::Predicate CC = Cmp->getCond();
4860     if (Negate)
4861       CC = CmpInst::getInversePredicate(CC);
4862     if (isa<GICmp>(Cmp)) {
4863       OutCC = changeICMPPredToAArch64CC(CC);
4864     } else {
4865       // Handle special FP cases.
4866       AArch64CC::CondCode ExtraCC;
4867       changeFPCCToANDAArch64CC(CC, OutCC, ExtraCC);
4868       // Some floating point conditions can't be tested with a single condition
4869       // code. Construct an additional comparison in this case.
4870       if (ExtraCC != AArch64CC::AL) {
4871         MachineInstr *ExtraCmp;
4872         if (!CCOp)
4873           ExtraCmp = emitFPCompare(LHS, RHS, MIB, CC);
4874         else
4875           ExtraCmp =
4876               emitConditionalComparison(LHS, RHS, CC, Predicate, ExtraCC, MIB);
4877         CCOp = ExtraCmp->getOperand(0).getReg();
4878         Predicate = ExtraCC;
4879       }
4880     }
4881 
4882     // Produce a normal comparison if we are first in the chain
4883     if (!CCOp) {
4884       auto Dst = MRI.cloneVirtualRegister(LHS);
4885       if (isa<GICmp>(Cmp))
4886         return emitSUBS(Dst, Cmp->getOperand(2), Cmp->getOperand(3), MIB);
4887       return emitFPCompare(Cmp->getOperand(2).getReg(),
4888                            Cmp->getOperand(3).getReg(), MIB);
4889     }
4890     // Otherwise produce a ccmp.
4891     return emitConditionalComparison(LHS, RHS, CC, Predicate, OutCC, MIB);
4892   }
4893   assert(MRI.hasOneNonDBGUse(Val) && "Valid conjunction/disjunction tree");
4894 
4895   bool IsOR = Opcode == TargetOpcode::G_OR;
4896 
4897   Register LHS = ValDef->getOperand(1).getReg();
4898   bool CanNegateL;
4899   bool MustBeFirstL;
4900   bool ValidL = canEmitConjunction(LHS, CanNegateL, MustBeFirstL, IsOR, MRI);
4901   assert(ValidL && "Valid conjunction/disjunction tree");
4902   (void)ValidL;
4903 
4904   Register RHS = ValDef->getOperand(2).getReg();
4905   bool CanNegateR;
4906   bool MustBeFirstR;
4907   bool ValidR = canEmitConjunction(RHS, CanNegateR, MustBeFirstR, IsOR, MRI);
4908   assert(ValidR && "Valid conjunction/disjunction tree");
4909   (void)ValidR;
4910 
4911   // Swap sub-tree that must come first to the right side.
4912   if (MustBeFirstL) {
4913     assert(!MustBeFirstR && "Valid conjunction/disjunction tree");
4914     std::swap(LHS, RHS);
4915     std::swap(CanNegateL, CanNegateR);
4916     std::swap(MustBeFirstL, MustBeFirstR);
4917   }
4918 
4919   bool NegateR;
4920   bool NegateAfterR;
4921   bool NegateL;
4922   bool NegateAfterAll;
4923   if (Opcode == TargetOpcode::G_OR) {
4924     // Swap the sub-tree that we can negate naturally to the left.
4925     if (!CanNegateL) {
4926       assert(CanNegateR && "at least one side must be negatable");
4927       assert(!MustBeFirstR && "invalid conjunction/disjunction tree");
4928       assert(!Negate);
4929       std::swap(LHS, RHS);
4930       NegateR = false;
4931       NegateAfterR = true;
4932     } else {
4933       // Negate the left sub-tree if possible, otherwise negate the result.
4934       NegateR = CanNegateR;
4935       NegateAfterR = !CanNegateR;
4936     }
4937     NegateL = true;
4938     NegateAfterAll = !Negate;
4939   } else {
4940     assert(Opcode == TargetOpcode::G_AND &&
4941            "Valid conjunction/disjunction tree");
4942     assert(!Negate && "Valid conjunction/disjunction tree");
4943 
4944     NegateL = false;
4945     NegateR = false;
4946     NegateAfterR = false;
4947     NegateAfterAll = false;
4948   }
4949 
4950   // Emit sub-trees.
4951   AArch64CC::CondCode RHSCC;
4952   MachineInstr *CmpR =
4953       emitConjunctionRec(RHS, RHSCC, NegateR, CCOp, Predicate, MIB);
4954   if (NegateAfterR)
4955     RHSCC = AArch64CC::getInvertedCondCode(RHSCC);
4956   MachineInstr *CmpL = emitConjunctionRec(
4957       LHS, OutCC, NegateL, CmpR->getOperand(0).getReg(), RHSCC, MIB);
4958   if (NegateAfterAll)
4959     OutCC = AArch64CC::getInvertedCondCode(OutCC);
4960   return CmpL;
4961 }
4962 
4963 MachineInstr *AArch64InstructionSelector::emitConjunction(
4964     Register Val, AArch64CC::CondCode &OutCC, MachineIRBuilder &MIB) const {
4965   bool DummyCanNegate;
4966   bool DummyMustBeFirst;
4967   if (!canEmitConjunction(Val, DummyCanNegate, DummyMustBeFirst, false,
4968                           *MIB.getMRI()))
4969     return nullptr;
4970   return emitConjunctionRec(Val, OutCC, false, Register(), AArch64CC::AL, MIB);
4971 }
4972 
4973 bool AArch64InstructionSelector::tryOptSelectConjunction(GSelect &SelI,
4974                                                          MachineInstr &CondMI) {
4975   AArch64CC::CondCode AArch64CC;
4976   MachineInstr *ConjMI = emitConjunction(SelI.getCondReg(), AArch64CC, MIB);
4977   if (!ConjMI)
4978     return false;
4979 
4980   emitSelect(SelI.getReg(0), SelI.getTrueReg(), SelI.getFalseReg(), AArch64CC, MIB);
4981   SelI.eraseFromParent();
4982   return true;
4983 }
4984 
4985 bool AArch64InstructionSelector::tryOptSelect(GSelect &I) {
4986   MachineRegisterInfo &MRI = *MIB.getMRI();
4987   // We want to recognize this pattern:
4988   //
4989   // $z = G_FCMP pred, $x, $y
4990   // ...
4991   // $w = G_SELECT $z, $a, $b
4992   //
4993   // Where the value of $z is *only* ever used by the G_SELECT (possibly with
4994   // some copies/truncs in between.)
4995   //
4996   // If we see this, then we can emit something like this:
4997   //
4998   // fcmp $x, $y
4999   // fcsel $w, $a, $b, pred
5000   //
5001   // Rather than emitting both of the rather long sequences in the standard
5002   // G_FCMP/G_SELECT select methods.
5003 
5004   // First, check if the condition is defined by a compare.
5005   MachineInstr *CondDef = MRI.getVRegDef(I.getOperand(1).getReg());
5006 
5007   // We can only fold if all of the defs have one use.
5008   Register CondDefReg = CondDef->getOperand(0).getReg();
5009   if (!MRI.hasOneNonDBGUse(CondDefReg)) {
5010     // Unless it's another select.
5011     for (const MachineInstr &UI : MRI.use_nodbg_instructions(CondDefReg)) {
5012       if (CondDef == &UI)
5013         continue;
5014       if (UI.getOpcode() != TargetOpcode::G_SELECT)
5015         return false;
5016     }
5017   }
5018 
5019   // Is the condition defined by a compare?
5020   unsigned CondOpc = CondDef->getOpcode();
5021   if (CondOpc != TargetOpcode::G_ICMP && CondOpc != TargetOpcode::G_FCMP) {
5022     if (tryOptSelectConjunction(I, *CondDef))
5023       return true;
5024     return false;
5025   }
5026 
5027   AArch64CC::CondCode CondCode;
5028   if (CondOpc == TargetOpcode::G_ICMP) {
5029     auto Pred =
5030         static_cast<CmpInst::Predicate>(CondDef->getOperand(1).getPredicate());
5031     CondCode = changeICMPPredToAArch64CC(Pred);
5032     emitIntegerCompare(CondDef->getOperand(2), CondDef->getOperand(3),
5033                        CondDef->getOperand(1), MIB);
5034   } else {
5035     // Get the condition code for the select.
5036     auto Pred =
5037         static_cast<CmpInst::Predicate>(CondDef->getOperand(1).getPredicate());
5038     AArch64CC::CondCode CondCode2;
5039     changeFCMPPredToAArch64CC(Pred, CondCode, CondCode2);
5040 
5041     // changeFCMPPredToAArch64CC sets CondCode2 to AL when we require two
5042     // instructions to emit the comparison.
5043     // TODO: Handle FCMP_UEQ and FCMP_ONE. After that, this check will be
5044     // unnecessary.
5045     if (CondCode2 != AArch64CC::AL)
5046       return false;
5047 
5048     if (!emitFPCompare(CondDef->getOperand(2).getReg(),
5049                        CondDef->getOperand(3).getReg(), MIB)) {
5050       LLVM_DEBUG(dbgs() << "Couldn't emit compare for select!\n");
5051       return false;
5052     }
5053   }
5054 
5055   // Emit the select.
5056   emitSelect(I.getOperand(0).getReg(), I.getOperand(2).getReg(),
5057              I.getOperand(3).getReg(), CondCode, MIB);
5058   I.eraseFromParent();
5059   return true;
5060 }
5061 
5062 MachineInstr *AArch64InstructionSelector::tryFoldIntegerCompare(
5063     MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate,
5064     MachineIRBuilder &MIRBuilder) const {
5065   assert(LHS.isReg() && RHS.isReg() && Predicate.isPredicate() &&
5066          "Unexpected MachineOperand");
5067   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
5068   // We want to find this sort of thing:
5069   // x = G_SUB 0, y
5070   // G_ICMP z, x
5071   //
5072   // In this case, we can fold the G_SUB into the G_ICMP using a CMN instead.
5073   // e.g:
5074   //
5075   // cmn z, y
5076 
5077   // Check if the RHS or LHS of the G_ICMP is defined by a SUB
5078   MachineInstr *LHSDef = getDefIgnoringCopies(LHS.getReg(), MRI);
5079   MachineInstr *RHSDef = getDefIgnoringCopies(RHS.getReg(), MRI);
5080   auto P = static_cast<CmpInst::Predicate>(Predicate.getPredicate());
5081   // Given this:
5082   //
5083   // x = G_SUB 0, y
5084   // G_ICMP x, z
5085   //
5086   // Produce this:
5087   //
5088   // cmn y, z
5089   if (isCMN(LHSDef, P, MRI))
5090     return emitCMN(LHSDef->getOperand(2), RHS, MIRBuilder);
5091 
5092   // Same idea here, but with the RHS of the compare instead:
5093   //
5094   // Given this:
5095   //
5096   // x = G_SUB 0, y
5097   // G_ICMP z, x
5098   //
5099   // Produce this:
5100   //
5101   // cmn z, y
5102   if (isCMN(RHSDef, P, MRI))
5103     return emitCMN(LHS, RHSDef->getOperand(2), MIRBuilder);
5104 
5105   // Given this:
5106   //
5107   // z = G_AND x, y
5108   // G_ICMP z, 0
5109   //
5110   // Produce this if the compare is signed:
5111   //
5112   // tst x, y
5113   if (!CmpInst::isUnsigned(P) && LHSDef &&
5114       LHSDef->getOpcode() == TargetOpcode::G_AND) {
5115     // Make sure that the RHS is 0.
5116     auto ValAndVReg = getIConstantVRegValWithLookThrough(RHS.getReg(), MRI);
5117     if (!ValAndVReg || ValAndVReg->Value != 0)
5118       return nullptr;
5119 
5120     return emitTST(LHSDef->getOperand(1),
5121                    LHSDef->getOperand(2), MIRBuilder);
5122   }
5123 
5124   return nullptr;
5125 }
5126 
5127 bool AArch64InstructionSelector::selectShuffleVector(
5128     MachineInstr &I, MachineRegisterInfo &MRI) {
5129   const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
5130   Register Src1Reg = I.getOperand(1).getReg();
5131   const LLT Src1Ty = MRI.getType(Src1Reg);
5132   Register Src2Reg = I.getOperand(2).getReg();
5133   const LLT Src2Ty = MRI.getType(Src2Reg);
5134   ArrayRef<int> Mask = I.getOperand(3).getShuffleMask();
5135 
5136   MachineBasicBlock &MBB = *I.getParent();
5137   MachineFunction &MF = *MBB.getParent();
5138   LLVMContext &Ctx = MF.getFunction().getContext();
5139 
5140   // G_SHUFFLE_VECTOR is weird in that the source operands can be scalars, if
5141   // it's originated from a <1 x T> type. Those should have been lowered into
5142   // G_BUILD_VECTOR earlier.
5143   if (!Src1Ty.isVector() || !Src2Ty.isVector()) {
5144     LLVM_DEBUG(dbgs() << "Could not select a \"scalar\" G_SHUFFLE_VECTOR\n");
5145     return false;
5146   }
5147 
5148   unsigned BytesPerElt = DstTy.getElementType().getSizeInBits() / 8;
5149 
5150   SmallVector<Constant *, 64> CstIdxs;
5151   for (int Val : Mask) {
5152     // For now, any undef indexes we'll just assume to be 0. This should be
5153     // optimized in future, e.g. to select DUP etc.
5154     Val = Val < 0 ? 0 : Val;
5155     for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
5156       unsigned Offset = Byte + Val * BytesPerElt;
5157       CstIdxs.emplace_back(ConstantInt::get(Type::getInt8Ty(Ctx), Offset));
5158     }
5159   }
5160 
5161   // Use a constant pool to load the index vector for TBL.
5162   Constant *CPVal = ConstantVector::get(CstIdxs);
5163   MachineInstr *IndexLoad = emitLoadFromConstantPool(CPVal, MIB);
5164   if (!IndexLoad) {
5165     LLVM_DEBUG(dbgs() << "Could not load from a constant pool");
5166     return false;
5167   }
5168 
5169   if (DstTy.getSizeInBits() != 128) {
5170     assert(DstTy.getSizeInBits() == 64 && "Unexpected shuffle result ty");
5171     // This case can be done with TBL1.
5172     MachineInstr *Concat = emitVectorConcat(None, Src1Reg, Src2Reg, MIB);
5173     if (!Concat) {
5174       LLVM_DEBUG(dbgs() << "Could not do vector concat for tbl1");
5175       return false;
5176     }
5177 
5178     // The constant pool load will be 64 bits, so need to convert to FPR128 reg.
5179     IndexLoad = emitScalarToVector(64, &AArch64::FPR128RegClass,
5180                                    IndexLoad->getOperand(0).getReg(), MIB);
5181 
5182     auto TBL1 = MIB.buildInstr(
5183         AArch64::TBLv16i8One, {&AArch64::FPR128RegClass},
5184         {Concat->getOperand(0).getReg(), IndexLoad->getOperand(0).getReg()});
5185     constrainSelectedInstRegOperands(*TBL1, TII, TRI, RBI);
5186 
5187     auto Copy =
5188         MIB.buildInstr(TargetOpcode::COPY, {I.getOperand(0).getReg()}, {})
5189             .addReg(TBL1.getReg(0), 0, AArch64::dsub);
5190     RBI.constrainGenericRegister(Copy.getReg(0), AArch64::FPR64RegClass, MRI);
5191     I.eraseFromParent();
5192     return true;
5193   }
5194 
5195   // For TBL2 we need to emit a REG_SEQUENCE to tie together two consecutive
5196   // Q registers for regalloc.
5197   SmallVector<Register, 2> Regs = {Src1Reg, Src2Reg};
5198   auto RegSeq = createQTuple(Regs, MIB);
5199   auto TBL2 = MIB.buildInstr(AArch64::TBLv16i8Two, {I.getOperand(0)},
5200                              {RegSeq, IndexLoad->getOperand(0)});
5201   constrainSelectedInstRegOperands(*TBL2, TII, TRI, RBI);
5202   I.eraseFromParent();
5203   return true;
5204 }
5205 
5206 MachineInstr *AArch64InstructionSelector::emitLaneInsert(
5207     Optional<Register> DstReg, Register SrcReg, Register EltReg,
5208     unsigned LaneIdx, const RegisterBank &RB,
5209     MachineIRBuilder &MIRBuilder) const {
5210   MachineInstr *InsElt = nullptr;
5211   const TargetRegisterClass *DstRC = &AArch64::FPR128RegClass;
5212   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
5213 
5214   // Create a register to define with the insert if one wasn't passed in.
5215   if (!DstReg)
5216     DstReg = MRI.createVirtualRegister(DstRC);
5217 
5218   unsigned EltSize = MRI.getType(EltReg).getSizeInBits();
5219   unsigned Opc = getInsertVecEltOpInfo(RB, EltSize).first;
5220 
5221   if (RB.getID() == AArch64::FPRRegBankID) {
5222     auto InsSub = emitScalarToVector(EltSize, DstRC, EltReg, MIRBuilder);
5223     InsElt = MIRBuilder.buildInstr(Opc, {*DstReg}, {SrcReg})
5224                  .addImm(LaneIdx)
5225                  .addUse(InsSub->getOperand(0).getReg())
5226                  .addImm(0);
5227   } else {
5228     InsElt = MIRBuilder.buildInstr(Opc, {*DstReg}, {SrcReg})
5229                  .addImm(LaneIdx)
5230                  .addUse(EltReg);
5231   }
5232 
5233   constrainSelectedInstRegOperands(*InsElt, TII, TRI, RBI);
5234   return InsElt;
5235 }
5236 
5237 bool AArch64InstructionSelector::selectUSMovFromExtend(
5238     MachineInstr &MI, MachineRegisterInfo &MRI) {
5239   if (MI.getOpcode() != TargetOpcode::G_SEXT &&
5240       MI.getOpcode() != TargetOpcode::G_ZEXT &&
5241       MI.getOpcode() != TargetOpcode::G_ANYEXT)
5242     return false;
5243   bool IsSigned = MI.getOpcode() == TargetOpcode::G_SEXT;
5244   const Register DefReg = MI.getOperand(0).getReg();
5245   const LLT DstTy = MRI.getType(DefReg);
5246   unsigned DstSize = DstTy.getSizeInBits();
5247 
5248   if (DstSize != 32 && DstSize != 64)
5249     return false;
5250 
5251   MachineInstr *Extract = getOpcodeDef(TargetOpcode::G_EXTRACT_VECTOR_ELT,
5252                                        MI.getOperand(1).getReg(), MRI);
5253   int64_t Lane;
5254   if (!Extract || !mi_match(Extract->getOperand(2).getReg(), MRI, m_ICst(Lane)))
5255     return false;
5256   Register Src0 = Extract->getOperand(1).getReg();
5257 
5258   const LLT &VecTy = MRI.getType(Src0);
5259 
5260   if (VecTy.getSizeInBits() != 128) {
5261     const MachineInstr *ScalarToVector = emitScalarToVector(
5262         VecTy.getSizeInBits(), &AArch64::FPR128RegClass, Src0, MIB);
5263     assert(ScalarToVector && "Didn't expect emitScalarToVector to fail!");
5264     Src0 = ScalarToVector->getOperand(0).getReg();
5265   }
5266 
5267   unsigned Opcode;
5268   if (DstSize == 64 && VecTy.getScalarSizeInBits() == 32)
5269     Opcode = IsSigned ? AArch64::SMOVvi32to64 : AArch64::UMOVvi32;
5270   else if (DstSize == 64 && VecTy.getScalarSizeInBits() == 16)
5271     Opcode = IsSigned ? AArch64::SMOVvi16to64 : AArch64::UMOVvi16;
5272   else if (DstSize == 64 && VecTy.getScalarSizeInBits() == 8)
5273     Opcode = IsSigned ? AArch64::SMOVvi8to64 : AArch64::UMOVvi8;
5274   else if (DstSize == 32 && VecTy.getScalarSizeInBits() == 16)
5275     Opcode = IsSigned ? AArch64::SMOVvi16to32 : AArch64::UMOVvi16;
5276   else if (DstSize == 32 && VecTy.getScalarSizeInBits() == 8)
5277     Opcode = IsSigned ? AArch64::SMOVvi8to32 : AArch64::UMOVvi8;
5278   else
5279     llvm_unreachable("Unexpected type combo for S/UMov!");
5280 
5281   // We may need to generate one of these, depending on the type and sign of the
5282   // input:
5283   //  DstReg = SMOV Src0, Lane;
5284   //  NewReg = UMOV Src0, Lane; DstReg = SUBREG_TO_REG NewReg, sub_32;
5285   MachineInstr *ExtI = nullptr;
5286   if (DstSize == 64 && !IsSigned) {
5287     Register NewReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
5288     MIB.buildInstr(Opcode, {NewReg}, {Src0}).addImm(Lane);
5289     ExtI = MIB.buildInstr(AArch64::SUBREG_TO_REG, {DefReg}, {})
5290                .addImm(0)
5291                .addUse(NewReg)
5292                .addImm(AArch64::sub_32);
5293     RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass, MRI);
5294   } else
5295     ExtI = MIB.buildInstr(Opcode, {DefReg}, {Src0}).addImm(Lane);
5296 
5297   constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
5298   MI.eraseFromParent();
5299   return true;
5300 }
5301 
5302 bool AArch64InstructionSelector::selectInsertElt(MachineInstr &I,
5303                                                  MachineRegisterInfo &MRI) {
5304   assert(I.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT);
5305 
5306   // Get information on the destination.
5307   Register DstReg = I.getOperand(0).getReg();
5308   const LLT DstTy = MRI.getType(DstReg);
5309   unsigned VecSize = DstTy.getSizeInBits();
5310 
5311   // Get information on the element we want to insert into the destination.
5312   Register EltReg = I.getOperand(2).getReg();
5313   const LLT EltTy = MRI.getType(EltReg);
5314   unsigned EltSize = EltTy.getSizeInBits();
5315   if (EltSize < 16 || EltSize > 64)
5316     return false; // Don't support all element types yet.
5317 
5318   // Find the definition of the index. Bail out if it's not defined by a
5319   // G_CONSTANT.
5320   Register IdxReg = I.getOperand(3).getReg();
5321   auto VRegAndVal = getIConstantVRegValWithLookThrough(IdxReg, MRI);
5322   if (!VRegAndVal)
5323     return false;
5324   unsigned LaneIdx = VRegAndVal->Value.getSExtValue();
5325 
5326   // Perform the lane insert.
5327   Register SrcReg = I.getOperand(1).getReg();
5328   const RegisterBank &EltRB = *RBI.getRegBank(EltReg, MRI, TRI);
5329 
5330   if (VecSize < 128) {
5331     // If the vector we're inserting into is smaller than 128 bits, widen it
5332     // to 128 to do the insert.
5333     MachineInstr *ScalarToVec =
5334         emitScalarToVector(VecSize, &AArch64::FPR128RegClass, SrcReg, MIB);
5335     if (!ScalarToVec)
5336       return false;
5337     SrcReg = ScalarToVec->getOperand(0).getReg();
5338   }
5339 
5340   // Create an insert into a new FPR128 register.
5341   // Note that if our vector is already 128 bits, we end up emitting an extra
5342   // register.
5343   MachineInstr *InsMI =
5344       emitLaneInsert(None, SrcReg, EltReg, LaneIdx, EltRB, MIB);
5345 
5346   if (VecSize < 128) {
5347     // If we had to widen to perform the insert, then we have to demote back to
5348     // the original size to get the result we want.
5349     Register DemoteVec = InsMI->getOperand(0).getReg();
5350     const TargetRegisterClass *RC =
5351         getRegClassForTypeOnBank(DstTy, *RBI.getRegBank(DemoteVec, MRI, TRI));
5352     if (RC != &AArch64::FPR32RegClass && RC != &AArch64::FPR64RegClass) {
5353       LLVM_DEBUG(dbgs() << "Unsupported register class!\n");
5354       return false;
5355     }
5356     unsigned SubReg = 0;
5357     if (!getSubRegForClass(RC, TRI, SubReg))
5358       return false;
5359     if (SubReg != AArch64::ssub && SubReg != AArch64::dsub) {
5360       LLVM_DEBUG(dbgs() << "Unsupported destination size! (" << VecSize
5361                         << "\n");
5362       return false;
5363     }
5364     MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {})
5365         .addReg(DemoteVec, 0, SubReg);
5366     RBI.constrainGenericRegister(DstReg, *RC, MRI);
5367   } else {
5368     // No widening needed.
5369     InsMI->getOperand(0).setReg(DstReg);
5370     constrainSelectedInstRegOperands(*InsMI, TII, TRI, RBI);
5371   }
5372 
5373   I.eraseFromParent();
5374   return true;
5375 }
5376 
5377 MachineInstr *
5378 AArch64InstructionSelector::emitConstantVector(Register Dst, Constant *CV,
5379                                                MachineIRBuilder &MIRBuilder,
5380                                                MachineRegisterInfo &MRI) {
5381   LLT DstTy = MRI.getType(Dst);
5382   unsigned DstSize = DstTy.getSizeInBits();
5383   if (CV->isNullValue()) {
5384     if (DstSize == 128) {
5385       auto Mov =
5386           MIRBuilder.buildInstr(AArch64::MOVIv2d_ns, {Dst}, {}).addImm(0);
5387       constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI);
5388       return &*Mov;
5389     }
5390 
5391     if (DstSize == 64) {
5392       auto Mov =
5393           MIRBuilder
5394               .buildInstr(AArch64::MOVIv2d_ns, {&AArch64::FPR128RegClass}, {})
5395               .addImm(0);
5396       auto Copy = MIRBuilder.buildInstr(TargetOpcode::COPY, {Dst}, {})
5397                       .addReg(Mov.getReg(0), 0, AArch64::dsub);
5398       RBI.constrainGenericRegister(Dst, AArch64::FPR64RegClass, MRI);
5399       return &*Copy;
5400     }
5401   }
5402 
5403   auto *CPLoad = emitLoadFromConstantPool(CV, MIRBuilder);
5404   if (!CPLoad) {
5405     LLVM_DEBUG(dbgs() << "Could not generate cp load for constant vector!");
5406     return nullptr;
5407   }
5408 
5409   auto Copy = MIRBuilder.buildCopy(Dst, CPLoad->getOperand(0));
5410   RBI.constrainGenericRegister(
5411       Dst, *MRI.getRegClass(CPLoad->getOperand(0).getReg()), MRI);
5412   return &*Copy;
5413 }
5414 
5415 bool AArch64InstructionSelector::tryOptConstantBuildVec(
5416     MachineInstr &I, LLT DstTy, MachineRegisterInfo &MRI) {
5417   assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR);
5418   unsigned DstSize = DstTy.getSizeInBits();
5419   assert(DstSize <= 128 && "Unexpected build_vec type!");
5420   if (DstSize < 32)
5421     return false;
5422   // Check if we're building a constant vector, in which case we want to
5423   // generate a constant pool load instead of a vector insert sequence.
5424   SmallVector<Constant *, 16> Csts;
5425   for (unsigned Idx = 1; Idx < I.getNumOperands(); ++Idx) {
5426     // Try to find G_CONSTANT or G_FCONSTANT
5427     auto *OpMI =
5428         getOpcodeDef(TargetOpcode::G_CONSTANT, I.getOperand(Idx).getReg(), MRI);
5429     if (OpMI)
5430       Csts.emplace_back(
5431           const_cast<ConstantInt *>(OpMI->getOperand(1).getCImm()));
5432     else if ((OpMI = getOpcodeDef(TargetOpcode::G_FCONSTANT,
5433                                   I.getOperand(Idx).getReg(), MRI)))
5434       Csts.emplace_back(
5435           const_cast<ConstantFP *>(OpMI->getOperand(1).getFPImm()));
5436     else
5437       return false;
5438   }
5439   Constant *CV = ConstantVector::get(Csts);
5440   if (!emitConstantVector(I.getOperand(0).getReg(), CV, MIB, MRI))
5441     return false;
5442   I.eraseFromParent();
5443   return true;
5444 }
5445 
5446 bool AArch64InstructionSelector::tryOptBuildVecToSubregToReg(
5447     MachineInstr &I, MachineRegisterInfo &MRI) {
5448   // Given:
5449   //  %vec = G_BUILD_VECTOR %elt, %undef, %undef, ... %undef
5450   //
5451   // Select the G_BUILD_VECTOR as a SUBREG_TO_REG from %elt.
5452   Register Dst = I.getOperand(0).getReg();
5453   Register EltReg = I.getOperand(1).getReg();
5454   LLT EltTy = MRI.getType(EltReg);
5455   // If the index isn't on the same bank as its elements, then this can't be a
5456   // SUBREG_TO_REG.
5457   const RegisterBank &EltRB = *RBI.getRegBank(EltReg, MRI, TRI);
5458   const RegisterBank &DstRB = *RBI.getRegBank(Dst, MRI, TRI);
5459   if (EltRB != DstRB)
5460     return false;
5461   if (any_of(make_range(I.operands_begin() + 2, I.operands_end()),
5462              [&MRI](const MachineOperand &Op) {
5463                return !getOpcodeDef(TargetOpcode::G_IMPLICIT_DEF, Op.getReg(),
5464                                     MRI);
5465              }))
5466     return false;
5467   unsigned SubReg;
5468   const TargetRegisterClass *EltRC = getRegClassForTypeOnBank(EltTy, EltRB);
5469   if (!EltRC)
5470     return false;
5471   const TargetRegisterClass *DstRC =
5472       getRegClassForTypeOnBank(MRI.getType(Dst), DstRB);
5473   if (!DstRC)
5474     return false;
5475   if (!getSubRegForClass(EltRC, TRI, SubReg))
5476     return false;
5477   auto SubregToReg = MIB.buildInstr(AArch64::SUBREG_TO_REG, {Dst}, {})
5478                          .addImm(0)
5479                          .addUse(EltReg)
5480                          .addImm(SubReg);
5481   I.eraseFromParent();
5482   constrainSelectedInstRegOperands(*SubregToReg, TII, TRI, RBI);
5483   return RBI.constrainGenericRegister(Dst, *DstRC, MRI);
5484 }
5485 
5486 bool AArch64InstructionSelector::selectBuildVector(MachineInstr &I,
5487                                                    MachineRegisterInfo &MRI) {
5488   assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR);
5489   // Until we port more of the optimized selections, for now just use a vector
5490   // insert sequence.
5491   const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
5492   const LLT EltTy = MRI.getType(I.getOperand(1).getReg());
5493   unsigned EltSize = EltTy.getSizeInBits();
5494 
5495   if (tryOptConstantBuildVec(I, DstTy, MRI))
5496     return true;
5497   if (tryOptBuildVecToSubregToReg(I, MRI))
5498     return true;
5499 
5500   if (EltSize < 16 || EltSize > 64)
5501     return false; // Don't support all element types yet.
5502   const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI);
5503 
5504   const TargetRegisterClass *DstRC = &AArch64::FPR128RegClass;
5505   MachineInstr *ScalarToVec =
5506       emitScalarToVector(DstTy.getElementType().getSizeInBits(), DstRC,
5507                          I.getOperand(1).getReg(), MIB);
5508   if (!ScalarToVec)
5509     return false;
5510 
5511   Register DstVec = ScalarToVec->getOperand(0).getReg();
5512   unsigned DstSize = DstTy.getSizeInBits();
5513 
5514   // Keep track of the last MI we inserted. Later on, we might be able to save
5515   // a copy using it.
5516   MachineInstr *PrevMI = nullptr;
5517   for (unsigned i = 2, e = DstSize / EltSize + 1; i < e; ++i) {
5518     // Note that if we don't do a subregister copy, we can end up making an
5519     // extra register.
5520     PrevMI = &*emitLaneInsert(None, DstVec, I.getOperand(i).getReg(), i - 1, RB,
5521                               MIB);
5522     DstVec = PrevMI->getOperand(0).getReg();
5523   }
5524 
5525   // If DstTy's size in bits is less than 128, then emit a subregister copy
5526   // from DstVec to the last register we've defined.
5527   if (DstSize < 128) {
5528     // Force this to be FPR using the destination vector.
5529     const TargetRegisterClass *RC =
5530         getRegClassForTypeOnBank(DstTy, *RBI.getRegBank(DstVec, MRI, TRI));
5531     if (!RC)
5532       return false;
5533     if (RC != &AArch64::FPR32RegClass && RC != &AArch64::FPR64RegClass) {
5534       LLVM_DEBUG(dbgs() << "Unsupported register class!\n");
5535       return false;
5536     }
5537 
5538     unsigned SubReg = 0;
5539     if (!getSubRegForClass(RC, TRI, SubReg))
5540       return false;
5541     if (SubReg != AArch64::ssub && SubReg != AArch64::dsub) {
5542       LLVM_DEBUG(dbgs() << "Unsupported destination size! (" << DstSize
5543                         << "\n");
5544       return false;
5545     }
5546 
5547     Register Reg = MRI.createVirtualRegister(RC);
5548     Register DstReg = I.getOperand(0).getReg();
5549 
5550     MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {}).addReg(DstVec, 0, SubReg);
5551     MachineOperand &RegOp = I.getOperand(1);
5552     RegOp.setReg(Reg);
5553     RBI.constrainGenericRegister(DstReg, *RC, MRI);
5554   } else {
5555     // We don't need a subregister copy. Save a copy by re-using the
5556     // destination register on the final insert.
5557     assert(PrevMI && "PrevMI was null?");
5558     PrevMI->getOperand(0).setReg(I.getOperand(0).getReg());
5559     constrainSelectedInstRegOperands(*PrevMI, TII, TRI, RBI);
5560   }
5561 
5562   I.eraseFromParent();
5563   return true;
5564 }
5565 
5566 bool AArch64InstructionSelector::selectVectorLoadIntrinsic(unsigned Opc,
5567                                                            unsigned NumVecs,
5568                                                            MachineInstr &I) {
5569   assert(I.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS);
5570   assert(Opc && "Expected an opcode?");
5571   assert(NumVecs > 1 && NumVecs < 5 && "Only support 2, 3, or 4 vectors");
5572   auto &MRI = *MIB.getMRI();
5573   LLT Ty = MRI.getType(I.getOperand(0).getReg());
5574   unsigned Size = Ty.getSizeInBits();
5575   assert((Size == 64 || Size == 128) &&
5576          "Destination must be 64 bits or 128 bits?");
5577   unsigned SubReg = Size == 64 ? AArch64::dsub0 : AArch64::qsub0;
5578   auto Ptr = I.getOperand(I.getNumOperands() - 1).getReg();
5579   assert(MRI.getType(Ptr).isPointer() && "Expected a pointer type?");
5580   auto Load = MIB.buildInstr(Opc, {Ty}, {Ptr});
5581   Load.cloneMemRefs(I);
5582   constrainSelectedInstRegOperands(*Load, TII, TRI, RBI);
5583   Register SelectedLoadDst = Load->getOperand(0).getReg();
5584   for (unsigned Idx = 0; Idx < NumVecs; ++Idx) {
5585     auto Vec = MIB.buildInstr(TargetOpcode::COPY, {I.getOperand(Idx)}, {})
5586                    .addReg(SelectedLoadDst, 0, SubReg + Idx);
5587     // Emit the subreg copies and immediately select them.
5588     // FIXME: We should refactor our copy code into an emitCopy helper and
5589     // clean up uses of this pattern elsewhere in the selector.
5590     selectCopy(*Vec, TII, MRI, TRI, RBI);
5591   }
5592   return true;
5593 }
5594 
/// Manually select the side-effecting intrinsic \p I.
///
/// Handles the exclusive pair loads (ldxp / ldaxp), trap / debugtrap /
/// ubsantrap, the NEON ld2 / ld4 / st2 intrinsics and the MOPS tagging memset.
/// Every handled case emits its replacement through MIB and then falls out of
/// the switch, where \p I is erased.
///
/// \returns true if \p I was selected and erased, false (with \p I untouched)
/// for any intrinsic not listed in the switch.
bool AArch64InstructionSelector::selectIntrinsicWithSideEffects(
    MachineInstr &I, MachineRegisterInfo &MRI) {
  // Find the intrinsic ID.
  unsigned IntrinID = I.getIntrinsicID();

  // Types used below to pick the NEON load/store opcode.
  const LLT S8 = LLT::scalar(8);
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT S64 = LLT::scalar(64);
  const LLT P0 = LLT::pointer(0, 64);
  // Select the instruction.
  switch (IntrinID) {
  default:
    return false;
  case Intrinsic::aarch64_ldxp:
  case Intrinsic::aarch64_ldaxp: {
    // Operands 0 and 1 are the two loaded values and operand 3 is the
    // address.
    auto NewI = MIB.buildInstr(
        IntrinID == Intrinsic::aarch64_ldxp ? AArch64::LDXPX : AArch64::LDAXPX,
        {I.getOperand(0).getReg(), I.getOperand(1).getReg()},
        {I.getOperand(3)});
    NewI.cloneMemRefs(I);
    constrainSelectedInstRegOperands(*NewI, TII, TRI, RBI);
    break;
  }
  // The three trap flavours all become BRK and differ only in the immediate.
  case Intrinsic::trap:
    MIB.buildInstr(AArch64::BRK, {}, {}).addImm(1);
    break;
  case Intrinsic::debugtrap:
    MIB.buildInstr(AArch64::BRK, {}, {}).addImm(0xF000);
    break;
  case Intrinsic::ubsantrap:
    // The immediate is the intrinsic's own immediate operand with 'U' placed
    // in bits 8 and up.
    MIB.buildInstr(AArch64::BRK, {}, {})
        .addImm(I.getOperand(1).getImm() | ('U' << 8));
    break;
  case Intrinsic::aarch64_neon_ld2: {
    // Pick the LD2 variant from the type of the first result. A single 64-bit
    // scalar or pointer uses the two-register LD1 form instead.
    LLT Ty = MRI.getType(I.getOperand(0).getReg());
    unsigned Opc = 0;
    if (Ty == LLT::fixed_vector(8, S8))
      Opc = AArch64::LD2Twov8b;
    else if (Ty == LLT::fixed_vector(16, S8))
      Opc = AArch64::LD2Twov16b;
    else if (Ty == LLT::fixed_vector(4, S16))
      Opc = AArch64::LD2Twov4h;
    else if (Ty == LLT::fixed_vector(8, S16))
      Opc = AArch64::LD2Twov8h;
    else if (Ty == LLT::fixed_vector(2, S32))
      Opc = AArch64::LD2Twov2s;
    else if (Ty == LLT::fixed_vector(4, S32))
      Opc = AArch64::LD2Twov4s;
    else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
      Opc = AArch64::LD2Twov2d;
    else if (Ty == S64 || Ty == P0)
      Opc = AArch64::LD1Twov1d;
    else
      llvm_unreachable("Unexpected type for ld2!");
    // Emits the load and the per-result copies; I is erased after the switch.
    selectVectorLoadIntrinsic(Opc, 2, I);
    break;
  }
  case Intrinsic::aarch64_neon_ld4: {
    // Same scheme as ld2, with the four-register opcodes.
    LLT Ty = MRI.getType(I.getOperand(0).getReg());
    unsigned Opc = 0;
    if (Ty == LLT::fixed_vector(8, S8))
      Opc = AArch64::LD4Fourv8b;
    else if (Ty == LLT::fixed_vector(16, S8))
      Opc = AArch64::LD4Fourv16b;
    else if (Ty == LLT::fixed_vector(4, S16))
      Opc = AArch64::LD4Fourv4h;
    else if (Ty == LLT::fixed_vector(8, S16))
      Opc = AArch64::LD4Fourv8h;
    else if (Ty == LLT::fixed_vector(2, S32))
      Opc = AArch64::LD4Fourv2s;
    else if (Ty == LLT::fixed_vector(4, S32))
      Opc = AArch64::LD4Fourv4s;
    else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
      Opc = AArch64::LD4Fourv2d;
    else if (Ty == S64 || Ty == P0)
      Opc = AArch64::LD1Fourv1d;
    else
      llvm_unreachable("Unexpected type for ld4!");
    selectVectorLoadIntrinsic(Opc, 4, I);
    break;
  }
  case Intrinsic::aarch64_neon_st2: {
    // Operands 1 and 2 are the two vectors to store and operand 3 is the
    // address. The opcode is chosen from the type of the first vector.
    Register Src1 = I.getOperand(1).getReg();
    Register Src2 = I.getOperand(2).getReg();
    Register Ptr = I.getOperand(3).getReg();
    LLT Ty = MRI.getType(Src1);
    unsigned Opc;
    if (Ty == LLT::fixed_vector(8, S8))
      Opc = AArch64::ST2Twov8b;
    else if (Ty == LLT::fixed_vector(16, S8))
      Opc = AArch64::ST2Twov16b;
    else if (Ty == LLT::fixed_vector(4, S16))
      Opc = AArch64::ST2Twov4h;
    else if (Ty == LLT::fixed_vector(8, S16))
      Opc = AArch64::ST2Twov8h;
    else if (Ty == LLT::fixed_vector(2, S32))
      Opc = AArch64::ST2Twov2s;
    else if (Ty == LLT::fixed_vector(4, S32))
      Opc = AArch64::ST2Twov4s;
    else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
      Opc = AArch64::ST2Twov2d;
    else if (Ty == S64 || Ty == P0)
      Opc = AArch64::ST1Twov1d;
    else
      llvm_unreachable("Unexpected type for st2!");
    // The store takes both sources as one tuple register: a Q tuple for
    // 128-bit vectors, a D tuple otherwise.
    SmallVector<Register, 2> Regs = {Src1, Src2};
    Register Tuple = Ty.getSizeInBits() == 128 ? createQTuple(Regs, MIB)
                                               : createDTuple(Regs, MIB);
    auto Store = MIB.buildInstr(Opc, {}, {Tuple, Ptr});
    Store.cloneMemRefs(I);
    constrainSelectedInstRegOperands(*Store, TII, TRI, RBI);
    break;
  }
  case Intrinsic::aarch64_mops_memset_tag: {
    // Transform
    //    %dst:gpr(p0) = \
    //      G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aarch64.mops.memset.tag),
    //      \ %dst:gpr(p0), %val:gpr(s64), %n:gpr(s64)
    // where %dst is updated, into
    //    (%Rd:GPR64common, %Rn:GPR64) = \
    //      MOPSMemorySetTaggingPseudo \
    //      %Rd:GPR64common, %Rn:GPR64, %Rm:GPR64
    // where Rd and Rn are tied.
    // It is expected that %val has been extended to s64 in legalization.
    // Note that the order of the size/value operands is swapped.

    Register DstDef = I.getOperand(0).getReg();
    // I.getOperand(1) is the intrinsic function
    Register DstUse = I.getOperand(2).getReg();
    Register ValUse = I.getOperand(3).getReg();
    Register SizeUse = I.getOperand(4).getReg();

    // MOPSMemorySetTaggingPseudo has two defs; the intrinsic call has only one.
    // Therefore an additional virtual register is required for the updated size
    // operand. This value is not accessible via the semantics of the intrinsic.
    Register SizeDef = MRI.createGenericVirtualRegister(LLT::scalar(64));

    // Uses are passed as (dst, size, value), i.e. size before value.
    auto Memset = MIB.buildInstr(AArch64::MOPSMemorySetTaggingPseudo,
                                 {DstDef, SizeDef}, {DstUse, SizeUse, ValUse});
    Memset.cloneMemRefs(I);
    constrainSelectedInstRegOperands(*Memset, TII, TRI, RBI);
    break;
  }
  }

  // Every handled case ends up here: the generic instruction is now dead.
  I.eraseFromParent();
  return true;
}
5744 
/// Manually select the intrinsic \p I for the IDs listed in the switch:
/// aarch64_crypto_sha1h, ptrauth_sign, frameaddress / returnaddress and
/// swift_async_context_addr.
///
/// Each handled case erases \p I and returns true once its replacement has
/// been emitted through MIB.
///
/// \returns false for any other intrinsic, and also for sha1h with operands
/// that are not 32 bits wide and for ptrauth_sign with a key above 3.
bool AArch64InstructionSelector::selectIntrinsic(MachineInstr &I,
                                                 MachineRegisterInfo &MRI) {
  unsigned IntrinID = I.getIntrinsicID();

  switch (IntrinID) {
  default:
    break;
  case Intrinsic::aarch64_crypto_sha1h: {
    // Operand 0 is the result and operand 2 the source (operand 1 is not read
    // here).
    Register DstReg = I.getOperand(0).getReg();
    Register SrcReg = I.getOperand(2).getReg();

    // FIXME: Should this be an assert?
    if (MRI.getType(DstReg).getSizeInBits() != 32 ||
        MRI.getType(SrcReg).getSizeInBits() != 32)
      return false;

    // The operation has to happen on FPRs. Set up some new FPR registers for
    // the source and destination if they are on GPRs.
    if (RBI.getRegBank(SrcReg, MRI, TRI)->getID() != AArch64::FPRRegBankID) {
      SrcReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass);
      MIB.buildCopy({SrcReg}, {I.getOperand(2)});

      // Make sure the copy ends up getting constrained properly.
      RBI.constrainGenericRegister(I.getOperand(2).getReg(),
                                   AArch64::GPR32RegClass, MRI);
    }

    if (RBI.getRegBank(DstReg, MRI, TRI)->getID() != AArch64::FPRRegBankID)
      DstReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass);

    // Actually insert the instruction.
    auto SHA1Inst = MIB.buildInstr(AArch64::SHA1Hrr, {DstReg}, {SrcReg});
    constrainSelectedInstRegOperands(*SHA1Inst, TII, TRI, RBI);

    // Did we create a new register for the destination?
    if (DstReg != I.getOperand(0).getReg()) {
      // Yep. Copy the result of the instruction back into the original
      // destination.
      MIB.buildCopy({I.getOperand(0)}, {DstReg});
      RBI.constrainGenericRegister(I.getOperand(0).getReg(),
                                   AArch64::GPR32RegClass, MRI);
    }

    I.eraseFromParent();
    return true;
  }
  case Intrinsic::ptrauth_sign: {
    // Operand 2 is the value to sign, operand 3 the key as an immediate and
    // operand 4 the discriminator register.
    Register DstReg = I.getOperand(0).getReg();
    Register ValReg = I.getOperand(2).getReg();
    uint64_t Key = I.getOperand(3).getImm();
    Register DiscReg = I.getOperand(4).getReg();
    auto DiscVal = getIConstantVRegVal(DiscReg, MRI);
    bool IsDiscZero = DiscVal && DiscVal->isNullValue();

    // Only four keys have an entry in the opcode table below.
    if (Key > 3)
      return false;

    // Row 0: opcodes that take a discriminator operand. Row 1: the variants
    // used when the discriminator is a known zero. Columns are indexed by key.
    unsigned Opcodes[][4] = {
        {AArch64::PACIA, AArch64::PACIB, AArch64::PACDA, AArch64::PACDB},
        {AArch64::PACIZA, AArch64::PACIZB, AArch64::PACDZA, AArch64::PACDZB}};
    unsigned Opcode = Opcodes[IsDiscZero][Key];

    auto PAC = MIB.buildInstr(Opcode, {DstReg}, {ValReg});

    // The discriminator is only added as an operand when it is not a known
    // zero.
    if (!IsDiscZero) {
      PAC.addUse(DiscReg);
      RBI.constrainGenericRegister(DiscReg, AArch64::GPR64spRegClass, MRI);
    }

    RBI.constrainGenericRegister(DstReg, AArch64::GPR64RegClass, MRI);
    I.eraseFromParent();
    return true;
  }
  case Intrinsic::frameaddress:
  case Intrinsic::returnaddress: {
    MachineFunction &MF = *I.getParent()->getParent();
    MachineFrameInfo &MFI = MF.getFrameInfo();

    // Operand 2 is the number of frames to walk up.
    unsigned Depth = I.getOperand(2).getImm();
    Register DstReg = I.getOperand(0).getReg();
    RBI.constrainGenericRegister(DstReg, AArch64::GPR64RegClass, MRI);

    // Return address of the current function: read it from the live-in copy
    // of LR, which is created once and cached in MFReturnAddr.
    if (Depth == 0 && IntrinID == Intrinsic::returnaddress) {
      if (!MFReturnAddr) {
        // Insert the copy from LR/X30 into the entry block, before it can be
        // clobbered by anything.
        MFI.setReturnAddressIsTaken(true);
        MFReturnAddr = getFunctionLiveInPhysReg(
            MF, TII, AArch64::LR, AArch64::GPR64RegClass, I.getDebugLoc());
      }

      // With PAuth, XPACI is applied straight from the saved value into
      // DstReg. Without it, the value is routed through LR around XPACLRI.
      if (STI.hasPAuth()) {
        MIB.buildInstr(AArch64::XPACI, {DstReg}, {MFReturnAddr});
      } else {
        MIB.buildCopy({Register(AArch64::LR)}, {MFReturnAddr});
        MIB.buildInstr(AArch64::XPACLRI);
        MIB.buildCopy({DstReg}, {Register(AArch64::LR)});
      }

      I.eraseFromParent();
      return true;
    }

    // Every remaining case starts from FP and loads (immediate 0) through it
    // once per requested level.
    MFI.setFrameAddressIsTaken(true);
    Register FrameAddr(AArch64::FP);
    while (Depth--) {
      Register NextFrame = MRI.createVirtualRegister(&AArch64::GPR64spRegClass);
      auto Ldr =
          MIB.buildInstr(AArch64::LDRXui, {NextFrame}, {FrameAddr}).addImm(0);
      constrainSelectedInstRegOperands(*Ldr, TII, TRI, RBI);
      FrameAddr = NextFrame;
    }

    if (IntrinID == Intrinsic::frameaddress)
      MIB.buildCopy({DstReg}, {FrameAddr});
    else {
      MFI.setReturnAddressIsTaken(true);

      // returnaddress with Depth > 0: load with immediate 1 from the frame
      // reached above (presumably the slot right after the saved frame
      // pointer — confirm against LDRXui's offset scaling), then apply the
      // same XPACI / XPACLRI choice as in the Depth == 0 case.
      if (STI.hasPAuth()) {
        Register TmpReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
        MIB.buildInstr(AArch64::LDRXui, {TmpReg}, {FrameAddr}).addImm(1);
        MIB.buildInstr(AArch64::XPACI, {DstReg}, {TmpReg});
      } else {
        MIB.buildInstr(AArch64::LDRXui, {Register(AArch64::LR)}, {FrameAddr})
            .addImm(1);
        MIB.buildInstr(AArch64::XPACLRI);
        MIB.buildCopy({DstReg}, {Register(AArch64::LR)});
      }
    }

    I.eraseFromParent();
    return true;
  }
  case Intrinsic::swift_async_context_addr:
    // The result is FP - 8.
    auto Sub = MIB.buildInstr(AArch64::SUBXri, {I.getOperand(0).getReg()},
                              {Register(AArch64::FP)})
                   .addImm(8)
                   .addImm(0);
    constrainSelectedInstRegOperands(*Sub, TII, TRI, RBI);

    // Record on the function that FP is used and that it has a Swift async
    // context.
    MF->getFrameInfo().setFrameAddressIsTaken(true);
    MF->getInfo<AArch64FunctionInfo>()->setHasSwiftAsyncContext(true);
    I.eraseFromParent();
    return true;
  }
  return false;
}
5892 
5893 InstructionSelector::ComplexRendererFns
5894 AArch64InstructionSelector::selectShiftA_32(const MachineOperand &Root) const {
5895   auto MaybeImmed = getImmedFromMO(Root);
5896   if (MaybeImmed == None || *MaybeImmed > 31)
5897     return None;
5898   uint64_t Enc = (32 - *MaybeImmed) & 0x1f;
5899   return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}};
5900 }
5901 
5902 InstructionSelector::ComplexRendererFns
5903 AArch64InstructionSelector::selectShiftB_32(const MachineOperand &Root) const {
5904   auto MaybeImmed = getImmedFromMO(Root);
5905   if (MaybeImmed == None || *MaybeImmed > 31)
5906     return None;
5907   uint64_t Enc = 31 - *MaybeImmed;
5908   return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}};
5909 }
5910 
5911 InstructionSelector::ComplexRendererFns
5912 AArch64InstructionSelector::selectShiftA_64(const MachineOperand &Root) const {
5913   auto MaybeImmed = getImmedFromMO(Root);
5914   if (MaybeImmed == None || *MaybeImmed > 63)
5915     return None;
5916   uint64_t Enc = (64 - *MaybeImmed) & 0x3f;
5917   return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}};
5918 }
5919 
5920 InstructionSelector::ComplexRendererFns
5921 AArch64InstructionSelector::selectShiftB_64(const MachineOperand &Root) const {
5922   auto MaybeImmed = getImmedFromMO(Root);
5923   if (MaybeImmed == None || *MaybeImmed > 63)
5924     return None;
5925   uint64_t Enc = 63 - *MaybeImmed;
5926   return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}};
5927 }
5928 
5929 /// Helper to select an immediate value that can be represented as a 12-bit
5930 /// value shifted left by either 0 or 12. If it is possible to do so, return
5931 /// the immediate and shift value. If not, return None.
5932 ///
5933 /// Used by selectArithImmed and selectNegArithImmed.
5934 InstructionSelector::ComplexRendererFns
5935 AArch64InstructionSelector::select12BitValueWithLeftShift(
5936     uint64_t Immed) const {
5937   unsigned ShiftAmt;
5938   if (Immed >> 12 == 0) {
5939     ShiftAmt = 0;
5940   } else if ((Immed & 0xfff) == 0 && Immed >> 24 == 0) {
5941     ShiftAmt = 12;
5942     Immed = Immed >> 12;
5943   } else
5944     return None;
5945 
5946   unsigned ShVal = AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt);
5947   return {{
5948       [=](MachineInstrBuilder &MIB) { MIB.addImm(Immed); },
5949       [=](MachineInstrBuilder &MIB) { MIB.addImm(ShVal); },
5950   }};
5951 }
5952 
5953 /// SelectArithImmed - Select an immediate value that can be represented as
5954 /// a 12-bit value shifted left by either 0 or 12.  If so, return true with
5955 /// Val set to the 12-bit value and Shift set to the shifter operand.
5956 InstructionSelector::ComplexRendererFns
5957 AArch64InstructionSelector::selectArithImmed(MachineOperand &Root) const {
5958   // This function is called from the addsub_shifted_imm ComplexPattern,
5959   // which lists [imm] as the list of opcode it's interested in, however
5960   // we still need to check whether the operand is actually an immediate
5961   // here because the ComplexPattern opcode list is only used in
5962   // root-level opcode matching.
5963   auto MaybeImmed = getImmedFromMO(Root);
5964   if (MaybeImmed == None)
5965     return None;
5966   return select12BitValueWithLeftShift(*MaybeImmed);
5967 }
5968 
5969 /// SelectNegArithImmed - As above, but negates the value before trying to
5970 /// select it.
5971 InstructionSelector::ComplexRendererFns
5972 AArch64InstructionSelector::selectNegArithImmed(MachineOperand &Root) const {
5973   // We need a register here, because we need to know if we have a 64 or 32
5974   // bit immediate.
5975   if (!Root.isReg())
5976     return None;
5977   auto MaybeImmed = getImmedFromMO(Root);
5978   if (MaybeImmed == None)
5979     return None;
5980   uint64_t Immed = *MaybeImmed;
5981 
5982   // This negation is almost always valid, but "cmp wN, #0" and "cmn wN, #0"
5983   // have the opposite effect on the C flag, so this pattern mustn't match under
5984   // those circumstances.
5985   if (Immed == 0)
5986     return None;
5987 
5988   // Check if we're dealing with a 32-bit type on the root or a 64-bit type on
5989   // the root.
5990   MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
5991   if (MRI.getType(Root.getReg()).getSizeInBits() == 32)
5992     Immed = ~((uint32_t)Immed) + 1;
5993   else
5994     Immed = ~Immed + 1ULL;
5995 
5996   if (Immed & 0xFFFFFFFFFF000000ULL)
5997     return None;
5998 
5999   Immed &= 0xFFFFFFULL;
6000   return select12BitValueWithLeftShift(Immed);
6001 }
6002 
6003 /// Return true if it is worth folding MI into an extended register. That is,
6004 /// if it's safe to pull it into the addressing mode of a load or store as a
6005 /// shift.
6006 bool AArch64InstructionSelector::isWorthFoldingIntoExtendedReg(
6007     MachineInstr &MI, const MachineRegisterInfo &MRI) const {
6008   // Always fold if there is one use, or if we're optimizing for size.
6009   Register DefReg = MI.getOperand(0).getReg();
6010   if (MRI.hasOneNonDBGUse(DefReg) ||
6011       MI.getParent()->getParent()->getFunction().hasOptSize())
6012     return true;
6013 
6014   // It's better to avoid folding and recomputing shifts when we don't have a
6015   // fastpath.
6016   if (!STI.hasLSLFast())
6017     return false;
6018 
6019   // We have a fastpath, so folding a shift in and potentially computing it
6020   // many times may be beneficial. Check if this is only used in memory ops.
6021   // If it is, then we should fold.
6022   return all_of(MRI.use_nodbg_instructions(DefReg),
6023                 [](MachineInstr &Use) { return Use.mayLoadOrStore(); });
6024 }
6025 
6026 static bool isSignExtendShiftType(AArch64_AM::ShiftExtendType Type) {
6027   switch (Type) {
6028   case AArch64_AM::SXTB:
6029   case AArch64_AM::SXTH:
6030   case AArch64_AM::SXTW:
6031     return true;
6032   default:
6033     return false;
6034   }
6035 }
6036 
6037 InstructionSelector::ComplexRendererFns
6038 AArch64InstructionSelector::selectExtendedSHL(
6039     MachineOperand &Root, MachineOperand &Base, MachineOperand &Offset,
6040     unsigned SizeInBytes, bool WantsExt) const {
6041   assert(Base.isReg() && "Expected base to be a register operand");
6042   assert(Offset.isReg() && "Expected offset to be a register operand");
6043 
6044   MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
6045   MachineInstr *OffsetInst = MRI.getVRegDef(Offset.getReg());
6046 
6047   unsigned OffsetOpc = OffsetInst->getOpcode();
6048   bool LookedThroughZExt = false;
6049   if (OffsetOpc != TargetOpcode::G_SHL && OffsetOpc != TargetOpcode::G_MUL) {
6050     // Try to look through a ZEXT.
6051     if (OffsetOpc != TargetOpcode::G_ZEXT || !WantsExt)
6052       return None;
6053 
6054     OffsetInst = MRI.getVRegDef(OffsetInst->getOperand(1).getReg());
6055     OffsetOpc = OffsetInst->getOpcode();
6056     LookedThroughZExt = true;
6057 
6058     if (OffsetOpc != TargetOpcode::G_SHL && OffsetOpc != TargetOpcode::G_MUL)
6059       return None;
6060   }
6061   // Make sure that the memory op is a valid size.
6062   int64_t LegalShiftVal = Log2_32(SizeInBytes);
6063   if (LegalShiftVal == 0)
6064     return None;
6065   if (!isWorthFoldingIntoExtendedReg(*OffsetInst, MRI))
6066     return None;
6067 
6068   // Now, try to find the specific G_CONSTANT. Start by assuming that the
6069   // register we will offset is the LHS, and the register containing the
6070   // constant is the RHS.
6071   Register OffsetReg = OffsetInst->getOperand(1).getReg();
6072   Register ConstantReg = OffsetInst->getOperand(2).getReg();
6073   auto ValAndVReg = getIConstantVRegValWithLookThrough(ConstantReg, MRI);
6074   if (!ValAndVReg) {
6075     // We didn't get a constant on the RHS. If the opcode is a shift, then
6076     // we're done.
6077     if (OffsetOpc == TargetOpcode::G_SHL)
6078       return None;
6079 
6080     // If we have a G_MUL, we can use either register. Try looking at the RHS.
6081     std::swap(OffsetReg, ConstantReg);
6082     ValAndVReg = getIConstantVRegValWithLookThrough(ConstantReg, MRI);
6083     if (!ValAndVReg)
6084       return None;
6085   }
6086 
6087   // The value must fit into 3 bits, and must be positive. Make sure that is
6088   // true.
6089   int64_t ImmVal = ValAndVReg->Value.getSExtValue();
6090 
6091   // Since we're going to pull this into a shift, the constant value must be
6092   // a power of 2. If we got a multiply, then we need to check this.
6093   if (OffsetOpc == TargetOpcode::G_MUL) {
6094     if (!isPowerOf2_32(ImmVal))
6095       return None;
6096 
6097     // Got a power of 2. So, the amount we'll shift is the log base-2 of that.
6098     ImmVal = Log2_32(ImmVal);
6099   }
6100 
6101   if ((ImmVal & 0x7) != ImmVal)
6102     return None;
6103 
6104   // We are only allowed to shift by LegalShiftVal. This shift value is built
6105   // into the instruction, so we can't just use whatever we want.
6106   if (ImmVal != LegalShiftVal)
6107     return None;
6108 
6109   unsigned SignExtend = 0;
6110   if (WantsExt) {
6111     // Check if the offset is defined by an extend, unless we looked through a
6112     // G_ZEXT earlier.
6113     if (!LookedThroughZExt) {
6114       MachineInstr *ExtInst = getDefIgnoringCopies(OffsetReg, MRI);
6115       auto Ext = getExtendTypeForInst(*ExtInst, MRI, true);
6116       if (Ext == AArch64_AM::InvalidShiftExtend)
6117         return None;
6118 
6119       SignExtend = isSignExtendShiftType(Ext) ? 1 : 0;
6120       // We only support SXTW for signed extension here.
6121       if (SignExtend && Ext != AArch64_AM::SXTW)
6122         return None;
6123       OffsetReg = ExtInst->getOperand(1).getReg();
6124     }
6125 
6126     // Need a 32-bit wide register here.
6127     MachineIRBuilder MIB(*MRI.getVRegDef(Root.getReg()));
6128     OffsetReg = moveScalarRegClass(OffsetReg, AArch64::GPR32RegClass, MIB);
6129   }
6130 
6131   // We can use the LHS of the GEP as the base, and the LHS of the shift as an
6132   // offset. Signify that we are shifting by setting the shift flag to 1.
6133   return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(Base.getReg()); },
6134            [=](MachineInstrBuilder &MIB) { MIB.addUse(OffsetReg); },
6135            [=](MachineInstrBuilder &MIB) {
6136              // Need to add both immediates here to make sure that they are both
6137              // added to the instruction.
6138              MIB.addImm(SignExtend);
6139              MIB.addImm(1);
6140            }}};
6141 }
6142 
6143 /// This is used for computing addresses like this:
6144 ///
6145 /// ldr x1, [x2, x3, lsl #3]
6146 ///
6147 /// Where x2 is the base register, and x3 is an offset register. The shift-left
6148 /// is a constant value specific to this load instruction. That is, we'll never
6149 /// see anything other than a 3 here (which corresponds to the size of the
6150 /// element being loaded.)
6151 InstructionSelector::ComplexRendererFns
6152 AArch64InstructionSelector::selectAddrModeShiftedExtendXReg(
6153     MachineOperand &Root, unsigned SizeInBytes) const {
6154   if (!Root.isReg())
6155     return None;
6156   MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
6157 
6158   // We want to find something like this:
6159   //
6160   // val = G_CONSTANT LegalShiftVal
6161   // shift = G_SHL off_reg val
6162   // ptr = G_PTR_ADD base_reg shift
6163   // x = G_LOAD ptr
6164   //
6165   // And fold it into this addressing mode:
6166   //
6167   // ldr x, [base_reg, off_reg, lsl #LegalShiftVal]
6168 
6169   // Check if we can find the G_PTR_ADD.
6170   MachineInstr *PtrAdd =
6171       getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI);
6172   if (!PtrAdd || !isWorthFoldingIntoExtendedReg(*PtrAdd, MRI))
6173     return None;
6174 
6175   // Now, try to match an opcode which will match our specific offset.
6176   // We want a G_SHL or a G_MUL.
6177   MachineInstr *OffsetInst =
6178       getDefIgnoringCopies(PtrAdd->getOperand(2).getReg(), MRI);
6179   return selectExtendedSHL(Root, PtrAdd->getOperand(1),
6180                            OffsetInst->getOperand(0), SizeInBytes,
6181                            /*WantsExt=*/false);
6182 }
6183 
6184 /// This is used for computing addresses like this:
6185 ///
6186 /// ldr x1, [x2, x3]
6187 ///
6188 /// Where x2 is the base register, and x3 is an offset register.
6189 ///
6190 /// When possible (or profitable) to fold a G_PTR_ADD into the address calculation,
6191 /// this will do so. Otherwise, it will return None.
6192 InstructionSelector::ComplexRendererFns
6193 AArch64InstructionSelector::selectAddrModeRegisterOffset(
6194     MachineOperand &Root) const {
6195   MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
6196 
6197   // We need a GEP.
6198   MachineInstr *Gep = MRI.getVRegDef(Root.getReg());
6199   if (Gep->getOpcode() != TargetOpcode::G_PTR_ADD)
6200     return None;
6201 
6202   // If this is used more than once, let's not bother folding.
6203   // TODO: Check if they are memory ops. If they are, then we can still fold
6204   // without having to recompute anything.
6205   if (!MRI.hasOneNonDBGUse(Gep->getOperand(0).getReg()))
6206     return None;
6207 
6208   // Base is the GEP's LHS, offset is its RHS.
6209   return {{[=](MachineInstrBuilder &MIB) {
6210              MIB.addUse(Gep->getOperand(1).getReg());
6211            },
6212            [=](MachineInstrBuilder &MIB) {
6213              MIB.addUse(Gep->getOperand(2).getReg());
6214            },
6215            [=](MachineInstrBuilder &MIB) {
6216              // Need to add both immediates here to make sure that they are both
6217              // added to the instruction.
6218              MIB.addImm(0);
6219              MIB.addImm(0);
6220            }}};
6221 }
6222 
6223 /// This is intended to be equivalent to selectAddrModeXRO in
6224 /// AArch64ISelDAGtoDAG. It's used for selecting X register offset loads.
6225 InstructionSelector::ComplexRendererFns
6226 AArch64InstructionSelector::selectAddrModeXRO(MachineOperand &Root,
6227                                               unsigned SizeInBytes) const {
6228   MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
6229   if (!Root.isReg())
6230     return None;
6231   MachineInstr *PtrAdd =
6232       getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI);
6233   if (!PtrAdd)
6234     return None;
6235 
6236   // Check for an immediates which cannot be encoded in the [base + imm]
6237   // addressing mode, and can't be encoded in an add/sub. If this happens, we'll
6238   // end up with code like:
6239   //
6240   // mov x0, wide
6241   // add x1 base, x0
6242   // ldr x2, [x1, x0]
6243   //
6244   // In this situation, we can use the [base, xreg] addressing mode to save an
6245   // add/sub:
6246   //
6247   // mov x0, wide
6248   // ldr x2, [base, x0]
6249   auto ValAndVReg =
6250       getIConstantVRegValWithLookThrough(PtrAdd->getOperand(2).getReg(), MRI);
6251   if (ValAndVReg) {
6252     unsigned Scale = Log2_32(SizeInBytes);
6253     int64_t ImmOff = ValAndVReg->Value.getSExtValue();
6254 
6255     // Skip immediates that can be selected in the load/store addresing
6256     // mode.
6257     if (ImmOff % SizeInBytes == 0 && ImmOff >= 0 &&
6258         ImmOff < (0x1000 << Scale))
6259       return None;
6260 
6261     // Helper lambda to decide whether or not it is preferable to emit an add.
6262     auto isPreferredADD = [](int64_t ImmOff) {
6263       // Constants in [0x0, 0xfff] can be encoded in an add.
6264       if ((ImmOff & 0xfffffffffffff000LL) == 0x0LL)
6265         return true;
6266 
6267       // Can it be encoded in an add lsl #12?
6268       if ((ImmOff & 0xffffffffff000fffLL) != 0x0LL)
6269         return false;
6270 
6271       // It can be encoded in an add lsl #12, but we may not want to. If it is
6272       // possible to select this as a single movz, then prefer that. A single
6273       // movz is faster than an add with a shift.
6274       return (ImmOff & 0xffffffffff00ffffLL) != 0x0LL &&
6275              (ImmOff & 0xffffffffffff0fffLL) != 0x0LL;
6276     };
6277 
6278     // If the immediate can be encoded in a single add/sub, then bail out.
6279     if (isPreferredADD(ImmOff) || isPreferredADD(-ImmOff))
6280       return None;
6281   }
6282 
6283   // Try to fold shifts into the addressing mode.
6284   auto AddrModeFns = selectAddrModeShiftedExtendXReg(Root, SizeInBytes);
6285   if (AddrModeFns)
6286     return AddrModeFns;
6287 
6288   // If that doesn't work, see if it's possible to fold in registers from
6289   // a GEP.
6290   return selectAddrModeRegisterOffset(Root);
6291 }
6292 
6293 /// This is used for computing addresses like this:
6294 ///
6295 /// ldr x0, [xBase, wOffset, sxtw #LegalShiftVal]
6296 ///
6297 /// Where we have a 64-bit base register, a 32-bit offset register, and an
6298 /// extend (which may or may not be signed).
6299 InstructionSelector::ComplexRendererFns
6300 AArch64InstructionSelector::selectAddrModeWRO(MachineOperand &Root,
6301                                               unsigned SizeInBytes) const {
6302   MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
6303 
6304   MachineInstr *PtrAdd =
6305       getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI);
6306   if (!PtrAdd || !isWorthFoldingIntoExtendedReg(*PtrAdd, MRI))
6307     return None;
6308 
6309   MachineOperand &LHS = PtrAdd->getOperand(1);
6310   MachineOperand &RHS = PtrAdd->getOperand(2);
6311   MachineInstr *OffsetInst = getDefIgnoringCopies(RHS.getReg(), MRI);
6312 
6313   // The first case is the same as selectAddrModeXRO, except we need an extend.
6314   // In this case, we try to find a shift and extend, and fold them into the
6315   // addressing mode.
6316   //
6317   // E.g.
6318   //
6319   // off_reg = G_Z/S/ANYEXT ext_reg
6320   // val = G_CONSTANT LegalShiftVal
6321   // shift = G_SHL off_reg val
6322   // ptr = G_PTR_ADD base_reg shift
6323   // x = G_LOAD ptr
6324   //
6325   // In this case we can get a load like this:
6326   //
6327   // ldr x0, [base_reg, ext_reg, sxtw #LegalShiftVal]
6328   auto ExtendedShl = selectExtendedSHL(Root, LHS, OffsetInst->getOperand(0),
6329                                        SizeInBytes, /*WantsExt=*/true);
6330   if (ExtendedShl)
6331     return ExtendedShl;
6332 
6333   // There was no shift. We can try and fold a G_Z/S/ANYEXT in alone though.
6334   //
6335   // e.g.
6336   // ldr something, [base_reg, ext_reg, sxtw]
6337   if (!isWorthFoldingIntoExtendedReg(*OffsetInst, MRI))
6338     return None;
6339 
6340   // Check if this is an extend. We'll get an extend type if it is.
6341   AArch64_AM::ShiftExtendType Ext =
6342       getExtendTypeForInst(*OffsetInst, MRI, /*IsLoadStore=*/true);
6343   if (Ext == AArch64_AM::InvalidShiftExtend)
6344     return None;
6345 
6346   // Need a 32-bit wide register.
6347   MachineIRBuilder MIB(*PtrAdd);
6348   Register ExtReg = moveScalarRegClass(OffsetInst->getOperand(1).getReg(),
6349                                        AArch64::GPR32RegClass, MIB);
6350   unsigned SignExtend = Ext == AArch64_AM::SXTW;
6351 
6352   // Base is LHS, offset is ExtReg.
6353   return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(LHS.getReg()); },
6354            [=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); },
6355            [=](MachineInstrBuilder &MIB) {
6356              MIB.addImm(SignExtend);
6357              MIB.addImm(0);
6358            }}};
6359 }
6360 
6361 /// Select a "register plus unscaled signed 9-bit immediate" address.  This
6362 /// should only match when there is an offset that is not valid for a scaled
6363 /// immediate addressing mode.  The "Size" argument is the size in bytes of the
6364 /// memory reference, which is needed here to know what is valid for a scaled
6365 /// immediate.
6366 InstructionSelector::ComplexRendererFns
6367 AArch64InstructionSelector::selectAddrModeUnscaled(MachineOperand &Root,
6368                                                    unsigned Size) const {
6369   MachineRegisterInfo &MRI =
6370       Root.getParent()->getParent()->getParent()->getRegInfo();
6371 
6372   if (!Root.isReg())
6373     return None;
6374 
6375   if (!isBaseWithConstantOffset(Root, MRI))
6376     return None;
6377 
6378   MachineInstr *RootDef = MRI.getVRegDef(Root.getReg());
6379 
6380   MachineOperand &OffImm = RootDef->getOperand(2);
6381   if (!OffImm.isReg())
6382     return None;
6383   MachineInstr *RHS = MRI.getVRegDef(OffImm.getReg());
6384   if (RHS->getOpcode() != TargetOpcode::G_CONSTANT)
6385     return None;
6386   int64_t RHSC;
6387   MachineOperand &RHSOp1 = RHS->getOperand(1);
6388   if (!RHSOp1.isCImm() || RHSOp1.getCImm()->getBitWidth() > 64)
6389     return None;
6390   RHSC = RHSOp1.getCImm()->getSExtValue();
6391 
6392   // If the offset is valid as a scaled immediate, don't match here.
6393   if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && RHSC < (0x1000 << Log2_32(Size)))
6394     return None;
6395   if (RHSC >= -256 && RHSC < 256) {
6396     MachineOperand &Base = RootDef->getOperand(1);
6397     return {{
6398         [=](MachineInstrBuilder &MIB) { MIB.add(Base); },
6399         [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC); },
6400     }};
6401   }
6402   return None;
6403 }
6404 
6405 InstructionSelector::ComplexRendererFns
6406 AArch64InstructionSelector::tryFoldAddLowIntoImm(MachineInstr &RootDef,
6407                                                  unsigned Size,
6408                                                  MachineRegisterInfo &MRI) const {
6409   if (RootDef.getOpcode() != AArch64::G_ADD_LOW)
6410     return None;
6411   MachineInstr &Adrp = *MRI.getVRegDef(RootDef.getOperand(1).getReg());
6412   if (Adrp.getOpcode() != AArch64::ADRP)
6413     return None;
6414 
6415   // TODO: add heuristics like isWorthFoldingADDlow() from SelectionDAG.
6416   auto Offset = Adrp.getOperand(1).getOffset();
6417   if (Offset % Size != 0)
6418     return None;
6419 
6420   auto GV = Adrp.getOperand(1).getGlobal();
6421   if (GV->isThreadLocal())
6422     return None;
6423 
6424   auto &MF = *RootDef.getParent()->getParent();
6425   if (GV->getPointerAlignment(MF.getDataLayout()) < Size)
6426     return None;
6427 
6428   unsigned OpFlags = STI.ClassifyGlobalReference(GV, MF.getTarget());
6429   MachineIRBuilder MIRBuilder(RootDef);
6430   Register AdrpReg = Adrp.getOperand(0).getReg();
6431   return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(AdrpReg); },
6432            [=](MachineInstrBuilder &MIB) {
6433              MIB.addGlobalAddress(GV, Offset,
6434                                   OpFlags | AArch64II::MO_PAGEOFF |
6435                                       AArch64II::MO_NC);
6436            }}};
6437 }
6438 
6439 /// Select a "register plus scaled unsigned 12-bit immediate" address.  The
6440 /// "Size" argument is the size in bytes of the memory reference, which
6441 /// determines the scale.
6442 InstructionSelector::ComplexRendererFns
6443 AArch64InstructionSelector::selectAddrModeIndexed(MachineOperand &Root,
6444                                                   unsigned Size) const {
6445   MachineFunction &MF = *Root.getParent()->getParent()->getParent();
6446   MachineRegisterInfo &MRI = MF.getRegInfo();
6447 
6448   if (!Root.isReg())
6449     return None;
6450 
6451   MachineInstr *RootDef = MRI.getVRegDef(Root.getReg());
6452   if (RootDef->getOpcode() == TargetOpcode::G_FRAME_INDEX) {
6453     return {{
6454         [=](MachineInstrBuilder &MIB) { MIB.add(RootDef->getOperand(1)); },
6455         [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },
6456     }};
6457   }
6458 
6459   CodeModel::Model CM = MF.getTarget().getCodeModel();
6460   // Check if we can fold in the ADD of small code model ADRP + ADD address.
6461   if (CM == CodeModel::Small) {
6462     auto OpFns = tryFoldAddLowIntoImm(*RootDef, Size, MRI);
6463     if (OpFns)
6464       return OpFns;
6465   }
6466 
6467   if (isBaseWithConstantOffset(Root, MRI)) {
6468     MachineOperand &LHS = RootDef->getOperand(1);
6469     MachineOperand &RHS = RootDef->getOperand(2);
6470     MachineInstr *LHSDef = MRI.getVRegDef(LHS.getReg());
6471     MachineInstr *RHSDef = MRI.getVRegDef(RHS.getReg());
6472 
6473     int64_t RHSC = (int64_t)RHSDef->getOperand(1).getCImm()->getZExtValue();
6474     unsigned Scale = Log2_32(Size);
6475     if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && RHSC < (0x1000 << Scale)) {
6476       if (LHSDef->getOpcode() == TargetOpcode::G_FRAME_INDEX)
6477         return {{
6478             [=](MachineInstrBuilder &MIB) { MIB.add(LHSDef->getOperand(1)); },
6479             [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC >> Scale); },
6480         }};
6481 
6482       return {{
6483           [=](MachineInstrBuilder &MIB) { MIB.add(LHS); },
6484           [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC >> Scale); },
6485       }};
6486     }
6487   }
6488 
6489   // Before falling back to our general case, check if the unscaled
6490   // instructions can handle this. If so, that's preferable.
6491   if (selectAddrModeUnscaled(Root, Size))
6492     return None;
6493 
6494   return {{
6495       [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
6496       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },
6497   }};
6498 }
6499 
6500 /// Given a shift instruction, return the correct shift type for that
6501 /// instruction.
6502 static AArch64_AM::ShiftExtendType getShiftTypeForInst(MachineInstr &MI) {
6503   switch (MI.getOpcode()) {
6504   default:
6505     return AArch64_AM::InvalidShiftExtend;
6506   case TargetOpcode::G_SHL:
6507     return AArch64_AM::LSL;
6508   case TargetOpcode::G_LSHR:
6509     return AArch64_AM::LSR;
6510   case TargetOpcode::G_ASHR:
6511     return AArch64_AM::ASR;
6512   case TargetOpcode::G_ROTR:
6513     return AArch64_AM::ROR;
6514   }
6515 }
6516 
6517 /// Select a "shifted register" operand. If the value is not shifted, set the
6518 /// shift operand to a default value of "lsl 0".
6519 InstructionSelector::ComplexRendererFns
6520 AArch64InstructionSelector::selectShiftedRegister(MachineOperand &Root,
6521                                                   bool AllowROR) const {
6522   if (!Root.isReg())
6523     return None;
6524   MachineRegisterInfo &MRI =
6525       Root.getParent()->getParent()->getParent()->getRegInfo();
6526 
6527   // Check if the operand is defined by an instruction which corresponds to
6528   // a ShiftExtendType. E.g. a G_SHL, G_LSHR, etc.
6529   MachineInstr *ShiftInst = MRI.getVRegDef(Root.getReg());
6530   AArch64_AM::ShiftExtendType ShType = getShiftTypeForInst(*ShiftInst);
6531   if (ShType == AArch64_AM::InvalidShiftExtend)
6532     return None;
6533   if (ShType == AArch64_AM::ROR && !AllowROR)
6534     return None;
6535   if (!isWorthFoldingIntoExtendedReg(*ShiftInst, MRI))
6536     return None;
6537 
6538   // Need an immediate on the RHS.
6539   MachineOperand &ShiftRHS = ShiftInst->getOperand(2);
6540   auto Immed = getImmedFromMO(ShiftRHS);
6541   if (!Immed)
6542     return None;
6543 
6544   // We have something that we can fold. Fold in the shift's LHS and RHS into
6545   // the instruction.
6546   MachineOperand &ShiftLHS = ShiftInst->getOperand(1);
6547   Register ShiftReg = ShiftLHS.getReg();
6548 
6549   unsigned NumBits = MRI.getType(ShiftReg).getSizeInBits();
6550   unsigned Val = *Immed & (NumBits - 1);
6551   unsigned ShiftVal = AArch64_AM::getShifterImm(ShType, Val);
6552 
6553   return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ShiftReg); },
6554            [=](MachineInstrBuilder &MIB) { MIB.addImm(ShiftVal); }}};
6555 }
6556 
6557 AArch64_AM::ShiftExtendType AArch64InstructionSelector::getExtendTypeForInst(
6558     MachineInstr &MI, MachineRegisterInfo &MRI, bool IsLoadStore) const {
6559   unsigned Opc = MI.getOpcode();
6560 
6561   // Handle explicit extend instructions first.
6562   if (Opc == TargetOpcode::G_SEXT || Opc == TargetOpcode::G_SEXT_INREG) {
6563     unsigned Size;
6564     if (Opc == TargetOpcode::G_SEXT)
6565       Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
6566     else
6567       Size = MI.getOperand(2).getImm();
6568     assert(Size != 64 && "Extend from 64 bits?");
6569     switch (Size) {
6570     case 8:
6571       return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::SXTB;
6572     case 16:
6573       return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::SXTH;
6574     case 32:
6575       return AArch64_AM::SXTW;
6576     default:
6577       return AArch64_AM::InvalidShiftExtend;
6578     }
6579   }
6580 
6581   if (Opc == TargetOpcode::G_ZEXT || Opc == TargetOpcode::G_ANYEXT) {
6582     unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
6583     assert(Size != 64 && "Extend from 64 bits?");
6584     switch (Size) {
6585     case 8:
6586       return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::UXTB;
6587     case 16:
6588       return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::UXTH;
6589     case 32:
6590       return AArch64_AM::UXTW;
6591     default:
6592       return AArch64_AM::InvalidShiftExtend;
6593     }
6594   }
6595 
6596   // Don't have an explicit extend. Try to handle a G_AND with a constant mask
6597   // on the RHS.
6598   if (Opc != TargetOpcode::G_AND)
6599     return AArch64_AM::InvalidShiftExtend;
6600 
6601   Optional<uint64_t> MaybeAndMask = getImmedFromMO(MI.getOperand(2));
6602   if (!MaybeAndMask)
6603     return AArch64_AM::InvalidShiftExtend;
6604   uint64_t AndMask = *MaybeAndMask;
6605   switch (AndMask) {
6606   default:
6607     return AArch64_AM::InvalidShiftExtend;
6608   case 0xFF:
6609     return !IsLoadStore ? AArch64_AM::UXTB : AArch64_AM::InvalidShiftExtend;
6610   case 0xFFFF:
6611     return !IsLoadStore ? AArch64_AM::UXTH : AArch64_AM::InvalidShiftExtend;
6612   case 0xFFFFFFFF:
6613     return AArch64_AM::UXTW;
6614   }
6615 }
6616 
6617 Register AArch64InstructionSelector::moveScalarRegClass(
6618     Register Reg, const TargetRegisterClass &RC, MachineIRBuilder &MIB) const {
6619   MachineRegisterInfo &MRI = *MIB.getMRI();
6620   auto Ty = MRI.getType(Reg);
6621   assert(!Ty.isVector() && "Expected scalars only!");
6622   if (Ty.getSizeInBits() == TRI.getRegSizeInBits(RC))
6623     return Reg;
6624 
6625   // Create a copy and immediately select it.
6626   // FIXME: We should have an emitCopy function?
6627   auto Copy = MIB.buildCopy({&RC}, {Reg});
6628   selectCopy(*Copy, TII, MRI, TRI, RBI);
6629   return Copy.getReg(0);
6630 }
6631 
6632 /// Select an "extended register" operand. This operand folds in an extend
6633 /// followed by an optional left shift.
6634 InstructionSelector::ComplexRendererFns
6635 AArch64InstructionSelector::selectArithExtendedRegister(
6636     MachineOperand &Root) const {
6637   if (!Root.isReg())
6638     return None;
6639   MachineRegisterInfo &MRI =
6640       Root.getParent()->getParent()->getParent()->getRegInfo();
6641 
6642   uint64_t ShiftVal = 0;
6643   Register ExtReg;
6644   AArch64_AM::ShiftExtendType Ext;
6645   MachineInstr *RootDef = getDefIgnoringCopies(Root.getReg(), MRI);
6646   if (!RootDef)
6647     return None;
6648 
6649   if (!isWorthFoldingIntoExtendedReg(*RootDef, MRI))
6650     return None;
6651 
6652   // Check if we can fold a shift and an extend.
6653   if (RootDef->getOpcode() == TargetOpcode::G_SHL) {
6654     // Look for a constant on the RHS of the shift.
6655     MachineOperand &RHS = RootDef->getOperand(2);
6656     Optional<uint64_t> MaybeShiftVal = getImmedFromMO(RHS);
6657     if (!MaybeShiftVal)
6658       return None;
6659     ShiftVal = *MaybeShiftVal;
6660     if (ShiftVal > 4)
6661       return None;
6662     // Look for a valid extend instruction on the LHS of the shift.
6663     MachineOperand &LHS = RootDef->getOperand(1);
6664     MachineInstr *ExtDef = getDefIgnoringCopies(LHS.getReg(), MRI);
6665     if (!ExtDef)
6666       return None;
6667     Ext = getExtendTypeForInst(*ExtDef, MRI);
6668     if (Ext == AArch64_AM::InvalidShiftExtend)
6669       return None;
6670     ExtReg = ExtDef->getOperand(1).getReg();
6671   } else {
6672     // Didn't get a shift. Try just folding an extend.
6673     Ext = getExtendTypeForInst(*RootDef, MRI);
6674     if (Ext == AArch64_AM::InvalidShiftExtend)
6675       return None;
6676     ExtReg = RootDef->getOperand(1).getReg();
6677 
6678     // If we have a 32 bit instruction which zeroes out the high half of a
6679     // register, we get an implicit zero extend for free. Check if we have one.
6680     // FIXME: We actually emit the extend right now even though we don't have
6681     // to.
6682     if (Ext == AArch64_AM::UXTW && MRI.getType(ExtReg).getSizeInBits() == 32) {
6683       MachineInstr *ExtInst = MRI.getVRegDef(ExtReg);
6684       if (isDef32(*ExtInst))
6685         return None;
6686     }
6687   }
6688 
6689   // We require a GPR32 here. Narrow the ExtReg if needed using a subregister
6690   // copy.
6691   MachineIRBuilder MIB(*RootDef);
6692   ExtReg = moveScalarRegClass(ExtReg, AArch64::GPR32RegClass, MIB);
6693 
6694   return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); },
6695            [=](MachineInstrBuilder &MIB) {
6696              MIB.addImm(getArithExtendImm(Ext, ShiftVal));
6697            }}};
6698 }
6699 
6700 void AArch64InstructionSelector::renderTruncImm(MachineInstrBuilder &MIB,
6701                                                 const MachineInstr &MI,
6702                                                 int OpIdx) const {
6703   const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
6704   assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
6705          "Expected G_CONSTANT");
6706   Optional<int64_t> CstVal =
6707       getIConstantVRegSExtVal(MI.getOperand(0).getReg(), MRI);
6708   assert(CstVal && "Expected constant value");
6709   MIB.addImm(*CstVal);
6710 }
6711 
6712 void AArch64InstructionSelector::renderLogicalImm32(
6713   MachineInstrBuilder &MIB, const MachineInstr &I, int OpIdx) const {
6714   assert(I.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
6715          "Expected G_CONSTANT");
6716   uint64_t CstVal = I.getOperand(1).getCImm()->getZExtValue();
6717   uint64_t Enc = AArch64_AM::encodeLogicalImmediate(CstVal, 32);
6718   MIB.addImm(Enc);
6719 }
6720 
6721 void AArch64InstructionSelector::renderLogicalImm64(
6722   MachineInstrBuilder &MIB, const MachineInstr &I, int OpIdx) const {
6723   assert(I.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
6724          "Expected G_CONSTANT");
6725   uint64_t CstVal = I.getOperand(1).getCImm()->getZExtValue();
6726   uint64_t Enc = AArch64_AM::encodeLogicalImmediate(CstVal, 64);
6727   MIB.addImm(Enc);
6728 }
6729 
6730 void AArch64InstructionSelector::renderFPImm16(MachineInstrBuilder &MIB,
6731                                                const MachineInstr &MI,
6732                                                int OpIdx) const {
6733   assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
6734          "Expected G_FCONSTANT");
6735   MIB.addImm(
6736       AArch64_AM::getFP16Imm(MI.getOperand(1).getFPImm()->getValueAPF()));
6737 }
6738 
6739 void AArch64InstructionSelector::renderFPImm32(MachineInstrBuilder &MIB,
6740                                                const MachineInstr &MI,
6741                                                int OpIdx) const {
6742   assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
6743          "Expected G_FCONSTANT");
6744   MIB.addImm(
6745       AArch64_AM::getFP32Imm(MI.getOperand(1).getFPImm()->getValueAPF()));
6746 }
6747 
6748 void AArch64InstructionSelector::renderFPImm64(MachineInstrBuilder &MIB,
6749                                                const MachineInstr &MI,
6750                                                int OpIdx) const {
6751   assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
6752          "Expected G_FCONSTANT");
6753   MIB.addImm(
6754       AArch64_AM::getFP64Imm(MI.getOperand(1).getFPImm()->getValueAPF()));
6755 }
6756 
6757 void AArch64InstructionSelector::renderFPImm32SIMDModImmType4(
6758     MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
6759   assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
6760          "Expected G_FCONSTANT");
6761   MIB.addImm(AArch64_AM::encodeAdvSIMDModImmType4(MI.getOperand(1)
6762                                                       .getFPImm()
6763                                                       ->getValueAPF()
6764                                                       .bitcastToAPInt()
6765                                                       .getZExtValue()));
6766 }
6767 
6768 bool AArch64InstructionSelector::isLoadStoreOfNumBytes(
6769     const MachineInstr &MI, unsigned NumBytes) const {
6770   if (!MI.mayLoadOrStore())
6771     return false;
6772   assert(MI.hasOneMemOperand() &&
6773          "Expected load/store to have only one mem op!");
6774   return (*MI.memoperands_begin())->getSize() == NumBytes;
6775 }
6776 
6777 bool AArch64InstructionSelector::isDef32(const MachineInstr &MI) const {
6778   const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
6779   if (MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() != 32)
6780     return false;
6781 
6782   // Only return true if we know the operation will zero-out the high half of
6783   // the 64-bit register. Truncates can be subregister copies, which don't
6784   // zero out the high bits. Copies and other copy-like instructions can be
6785   // fed by truncates, or could be lowered as subregister copies.
6786   switch (MI.getOpcode()) {
6787   default:
6788     return true;
6789   case TargetOpcode::COPY:
6790   case TargetOpcode::G_BITCAST:
6791   case TargetOpcode::G_TRUNC:
6792   case TargetOpcode::G_PHI:
6793     return false;
6794   }
6795 }
6796 
6797 
6798 // Perform fixups on the given PHI instruction's operands to force them all
6799 // to be the same as the destination regbank.
6800 static void fixupPHIOpBanks(MachineInstr &MI, MachineRegisterInfo &MRI,
6801                             const AArch64RegisterBankInfo &RBI) {
6802   assert(MI.getOpcode() == TargetOpcode::G_PHI && "Expected a G_PHI");
6803   Register DstReg = MI.getOperand(0).getReg();
6804   const RegisterBank *DstRB = MRI.getRegBankOrNull(DstReg);
6805   assert(DstRB && "Expected PHI dst to have regbank assigned");
6806   MachineIRBuilder MIB(MI);
6807 
6808   // Go through each operand and ensure it has the same regbank.
6809   for (MachineOperand &MO : llvm::drop_begin(MI.operands())) {
6810     if (!MO.isReg())
6811       continue;
6812     Register OpReg = MO.getReg();
6813     const RegisterBank *RB = MRI.getRegBankOrNull(OpReg);
6814     if (RB != DstRB) {
6815       // Insert a cross-bank copy.
6816       auto *OpDef = MRI.getVRegDef(OpReg);
6817       const LLT &Ty = MRI.getType(OpReg);
6818       MachineBasicBlock &OpDefBB = *OpDef->getParent();
6819 
6820       // Any instruction we insert must appear after all PHIs in the block
6821       // for the block to be valid MIR.
6822       MachineBasicBlock::iterator InsertPt = std::next(OpDef->getIterator());
6823       if (InsertPt != OpDefBB.end() && InsertPt->isPHI())
6824         InsertPt = OpDefBB.getFirstNonPHI();
6825       MIB.setInsertPt(*OpDef->getParent(), InsertPt);
6826       auto Copy = MIB.buildCopy(Ty, OpReg);
6827       MRI.setRegBank(Copy.getReg(0), *DstRB);
6828       MO.setReg(Copy.getReg(0));
6829     }
6830   }
6831 }
6832 
6833 void AArch64InstructionSelector::processPHIs(MachineFunction &MF) {
6834   // We're looking for PHIs, build a list so we don't invalidate iterators.
6835   MachineRegisterInfo &MRI = MF.getRegInfo();
6836   SmallVector<MachineInstr *, 32> Phis;
6837   for (auto &BB : MF) {
6838     for (auto &MI : BB) {
6839       if (MI.getOpcode() == TargetOpcode::G_PHI)
6840         Phis.emplace_back(&MI);
6841     }
6842   }
6843 
6844   for (auto *MI : Phis) {
6845     // We need to do some work here if the operand types are < 16 bit and they
6846     // are split across fpr/gpr banks. Since all types <32b on gpr
6847     // end up being assigned gpr32 regclasses, we can end up with PHIs here
6848     // which try to select between a gpr32 and an fpr16. Ideally RBS shouldn't
6849     // be selecting heterogenous regbanks for operands if possible, but we
6850     // still need to be able to deal with it here.
6851     //
6852     // To fix this, if we have a gpr-bank operand < 32b in size and at least
6853     // one other operand is on the fpr bank, then we add cross-bank copies
6854     // to homogenize the operand banks. For simplicity the bank that we choose
6855     // to settle on is whatever bank the def operand has. For example:
6856     //
6857     // %endbb:
6858     //   %dst:gpr(s16) = G_PHI %in1:gpr(s16), %bb1, %in2:fpr(s16), %bb2
6859     //  =>
6860     // %bb2:
6861     //   ...
6862     //   %in2_copy:gpr(s16) = COPY %in2:fpr(s16)
6863     //   ...
6864     // %endbb:
6865     //   %dst:gpr(s16) = G_PHI %in1:gpr(s16), %bb1, %in2_copy:gpr(s16), %bb2
6866     bool HasGPROp = false, HasFPROp = false;
6867     for (const MachineOperand &MO : llvm::drop_begin(MI->operands())) {
6868       if (!MO.isReg())
6869         continue;
6870       const LLT &Ty = MRI.getType(MO.getReg());
6871       if (!Ty.isValid() || !Ty.isScalar())
6872         break;
6873       if (Ty.getSizeInBits() >= 32)
6874         break;
6875       const RegisterBank *RB = MRI.getRegBankOrNull(MO.getReg());
6876       // If for some reason we don't have a regbank yet. Don't try anything.
6877       if (!RB)
6878         break;
6879 
6880       if (RB->getID() == AArch64::GPRRegBankID)
6881         HasGPROp = true;
6882       else
6883         HasFPROp = true;
6884     }
6885     // We have heterogenous regbanks, need to fixup.
6886     if (HasGPROp && HasFPROp)
6887       fixupPHIOpBanks(*MI, MRI, RBI);
6888   }
6889 }
6890 
6891 namespace llvm {
6892 InstructionSelector *
6893 createAArch64InstructionSelector(const AArch64TargetMachine &TM,
6894                                  AArch64Subtarget &Subtarget,
6895                                  AArch64RegisterBankInfo &RBI) {
6896   return new AArch64InstructionSelector(TM, Subtarget, RBI);
6897 }
6898 }
6899