//===- AArch64InstructionSelector.cpp ----------------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
/// \file
/// This file implements the targeting of the InstructionSelector class for
/// AArch64.
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//

#include "AArch64GlobalISelUtils.h"
#include "AArch64InstrInfo.h"
#include "AArch64MachineFunctionInfo.h"
#include "AArch64RegisterBankInfo.h"
#include "AArch64RegisterInfo.h"
#include "AArch64Subtarget.h"
#include "AArch64TargetMachine.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "MCTargetDesc/AArch64MCTargetDesc.h"
#include "llvm/ADT/Optional.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

#define DEBUG_TYPE "aarch64-isel"

using namespace llvm;
using namespace MIPatternMatch;
using namespace AArch64GISelUtils;

namespace llvm {
class BlockFrequencyInfo;
class ProfileSummaryInfo;
}

namespace {

#define GET_GLOBALISEL_PREDICATE_BITSET
#include "AArch64GenGlobalISel.inc"
#undef GET_GLOBALISEL_PREDICATE_BITSET

class AArch64InstructionSelector : public InstructionSelector {
public:
  AArch64InstructionSelector(const AArch64TargetMachine &TM,
                             const AArch64Subtarget &STI,
                             const AArch64RegisterBankInfo &RBI);

  bool select(MachineInstr &I) override;
  static const char *getName() { return DEBUG_TYPE; }

  void setupMF(MachineFunction &MF, GISelKnownBits *KB,
               CodeGenCoverage &CoverageInfo, ProfileSummaryInfo *PSI,
               BlockFrequencyInfo *BFI) override {
    InstructionSelector::setupMF(MF, KB, CoverageInfo, PSI, BFI);
    MIB.setMF(MF);

    // hasFnAttribute() is expensive to call on every BRCOND selection, so
    // cache it here for each run of the selector.
    ProduceNonFlagSettingCondBr =
        !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);
    MFReturnAddr = Register();

    processPHIs(MF);
  }

private:
  /// tblgen-erated 'select' implementation, used as the initial selector for
  /// the patterns that don't require complex C++.
  bool selectImpl(MachineInstr &I, CodeGenCoverage &CoverageInfo) const;

  // A lowering phase that runs before any selection attempts.
  // Returns true if the instruction was modified.
  bool preISelLower(MachineInstr &I);

  // An early selection function that runs before the selectImpl() call.
  bool earlySelect(MachineInstr &I);

  // Do some preprocessing of G_PHIs before we begin selection.
  void processPHIs(MachineFunction &MF);

  bool earlySelectSHL(MachineInstr &I, MachineRegisterInfo &MRI);

  /// Eliminate same-sized cross-bank copies into stores before selectImpl().
  bool contractCrossBankCopyIntoStore(MachineInstr &I,
                                      MachineRegisterInfo &MRI);

  bool convertPtrAddToAdd(MachineInstr &I, MachineRegisterInfo &MRI);

  bool selectVaStartAAPCS(MachineInstr &I, MachineFunction &MF,
                          MachineRegisterInfo &MRI) const;
  bool selectVaStartDarwin(MachineInstr &I, MachineFunction &MF,
                           MachineRegisterInfo &MRI) const;

  ///@{
  /// Helper functions for selectCompareBranch.
  bool selectCompareBranchFedByFCmp(MachineInstr &I, MachineInstr &FCmp,
                                    MachineIRBuilder &MIB) const;
  bool selectCompareBranchFedByICmp(MachineInstr &I, MachineInstr &ICmp,
                                    MachineIRBuilder &MIB) const;
  bool tryOptCompareBranchFedByICmp(MachineInstr &I, MachineInstr &ICmp,
                                    MachineIRBuilder &MIB) const;
  bool tryOptAndIntoCompareBranch(MachineInstr &AndInst, bool Invert,
                                  MachineBasicBlock *DstMBB,
                                  MachineIRBuilder &MIB) const;
  ///@}

  bool selectCompareBranch(MachineInstr &I, MachineFunction &MF,
                           MachineRegisterInfo &MRI);

  bool selectVectorAshrLshr(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectVectorSHL(MachineInstr &I, MachineRegisterInfo &MRI);

  // Helper to generate an equivalent of scalar_to_vector into a new register,
  // returned via 'Dst'.
  MachineInstr *emitScalarToVector(unsigned EltSize,
                                   const TargetRegisterClass *DstRC,
                                   Register Scalar,
                                   MachineIRBuilder &MIRBuilder) const;

  /// Emit a lane insert into \p DstReg, or a new vector register if None is
  /// provided.
  ///
  /// The lane inserted into is defined by \p LaneIdx. The vector source
  /// register is given by \p SrcReg. The register containing the element is
  /// given by \p EltReg.
  MachineInstr *emitLaneInsert(Optional<Register> DstReg, Register SrcReg,
                               Register EltReg, unsigned LaneIdx,
                               const RegisterBank &RB,
                               MachineIRBuilder &MIRBuilder) const;

  /// Emit a sequence of instructions representing a constant \p CV for a
  /// vector register \p Dst. (E.g. a MOV, or a load from a constant pool.)
  ///
  /// \returns the last instruction in the sequence on success, and nullptr
  /// otherwise.
  MachineInstr *emitConstantVector(Register Dst, Constant *CV,
                                   MachineIRBuilder &MIRBuilder,
                                   MachineRegisterInfo &MRI);

  bool selectInsertElt(MachineInstr &I, MachineRegisterInfo &MRI);
  bool tryOptConstantBuildVec(MachineInstr &MI, LLT DstTy,
                              MachineRegisterInfo &MRI);
  bool selectBuildVector(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectMergeValues(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectUnmergeValues(MachineInstr &I, MachineRegisterInfo &MRI);

  bool selectShuffleVector(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectExtractElt(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectConcatVectors(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectSplitVectorUnmerge(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectIntrinsicWithSideEffects(MachineInstr &I,
                                      MachineRegisterInfo &MRI);
  bool selectIntrinsic(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectVectorICmp(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectIntrinsicTrunc(MachineInstr &I, MachineRegisterInfo &MRI) const;
  bool selectIntrinsicRound(MachineInstr &I, MachineRegisterInfo &MRI) const;
  bool selectJumpTable(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectBrJT(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectTLSGlobalValue(MachineInstr &I, MachineRegisterInfo &MRI);
  bool selectReduction(MachineInstr &I, MachineRegisterInfo &MRI);

  unsigned emitConstantPoolEntry(const Constant *CPVal,
                                 MachineFunction &MF) const;
  MachineInstr *emitLoadFromConstantPool(const Constant *CPVal,
                                         MachineIRBuilder &MIRBuilder) const;

  // Emit a vector concat operation.
  MachineInstr *emitVectorConcat(Optional<Register> Dst, Register Op1,
                                 Register Op2,
                                 MachineIRBuilder &MIRBuilder) const;

  // Emit an integer compare between LHS and RHS, which checks for Predicate.
  MachineInstr *emitIntegerCompare(MachineOperand &LHS, MachineOperand &RHS,
                                   MachineOperand &Predicate,
                                   MachineIRBuilder &MIRBuilder) const;

  /// Emit a floating point comparison between \p LHS and \p RHS.
  /// \p Pred if given is the intended predicate to use.
  MachineInstr *emitFPCompare(Register LHS, Register RHS,
                              MachineIRBuilder &MIRBuilder,
                              Optional<CmpInst::Predicate> = None) const;

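  /// Emit an instruction with opcode \p Opcode, defining \p DstOps and using
  /// \p SrcOps as operands. If \p RenderFns is provided, each renderer is
  /// applied to the built instruction to add any extra operands.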
  MachineInstr *emitInstr(unsigned Opcode,
                          std::initializer_list<llvm::DstOp> DstOps,
                          std::initializer_list<llvm::SrcOp> SrcOps,
                          MachineIRBuilder &MIRBuilder,
                          const ComplexRendererFns &RenderFns = None) const;
  /// Helper function to emit an add or sub instruction.
  ///
  /// \p AddrModeAndSizeToOpcode must contain each of the opcode variants
  /// described below, in a specific order.
  ///
  /// Below is an example of the expected input to \p AddrModeAndSizeToOpcode.
  ///
  /// \code
  ///   const std::array<std::array<unsigned, 2>, 5> Table {
  ///    {{AArch64::ADDXri, AArch64::ADDWri},
  ///     {AArch64::ADDXrs, AArch64::ADDWrs},
  ///     {AArch64::ADDXrr, AArch64::ADDWrr},
  ///     {AArch64::SUBXri, AArch64::SUBWri},
  ///     {AArch64::ADDXrx, AArch64::ADDWrx}}};
  /// \endcode
  ///
  /// Each row in the table corresponds to a different addressing mode. Each
  /// column corresponds to a different register size.
  ///
  /// \attention Rows must be structured as follows:
  ///   - Row 0: The ri opcode variants
  ///   - Row 1: The rs opcode variants
  ///   - Row 2: The rr opcode variants
  ///   - Row 3: The ri opcode variants for negative immediates
  ///   - Row 4: The rx opcode variants
  ///
  /// \attention Columns must be structured as follows:
  ///   - Column 0: The 64-bit opcode variants
  ///   - Column 1: The 32-bit opcode variants
  ///
  /// \p Dst is the destination register of the binop to emit.
  /// \p LHS is the left-hand operand of the binop to emit.
  /// \p RHS is the right-hand operand of the binop to emit.
  MachineInstr *emitAddSub(
      const std::array<std::array<unsigned, 2>, 5> &AddrModeAndSizeToOpcode,
      Register Dst, MachineOperand &LHS, MachineOperand &RHS,
      MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitADD(Register DefReg, MachineOperand &LHS,
                        MachineOperand &RHS,
                        MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitADDS(Register Dst, MachineOperand &LHS, MachineOperand &RHS,
                         MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitSUBS(Register Dst, MachineOperand &LHS, MachineOperand &RHS,
                         MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitCMN(MachineOperand &LHS, MachineOperand &RHS,
                        MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitTST(MachineOperand &LHS, MachineOperand &RHS,
                        MachineIRBuilder &MIRBuilder) const;
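  /// Emit a conditional select into \p Dst: \p LHS when \p CC holds, and
  /// \p RHS otherwise. By default this emits CSEL/FCSEL, but it will try to
  /// fold negations, inversions, and increments of an operand into
  /// CSNEG/CSINV/CSINC when profitable.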
  MachineInstr *emitSelect(Register Dst, Register LHS, Register RHS,
                           AArch64CC::CondCode CC,
                           MachineIRBuilder &MIRBuilder) const;
  MachineInstr *emitExtractVectorElt(Optional<Register> DstReg,
                                     const RegisterBank &DstRB, LLT ScalarTy,
                                     Register VecReg, unsigned LaneIdx,
                                     MachineIRBuilder &MIRBuilder) const;

  /// Emit a CSet for an integer compare.
  ///
  /// \p DefReg and \p SrcReg are expected to be 32-bit scalar registers.
  MachineInstr *emitCSetForICMP(Register DefReg, unsigned Pred,
                                MachineIRBuilder &MIRBuilder,
                                Register SrcReg = AArch64::WZR) const;
  /// Emit a CSet for a FP compare.
  ///
  /// \p Dst is expected to be a 32-bit scalar register.
  MachineInstr *emitCSetForFCmp(Register Dst, CmpInst::Predicate Pred,
                                MachineIRBuilder &MIRBuilder) const;

  /// Emit the overflow op for \p Opcode.
  ///
  /// \p Opcode is expected to be an overflow op's opcode, e.g. G_UADDO,
  /// G_USUBO, etc.
  std::pair<MachineInstr *, AArch64CC::CondCode>
  emitOverflowOp(unsigned Opcode, Register Dst, MachineOperand &LHS,
                 MachineOperand &RHS, MachineIRBuilder &MIRBuilder) const;

  /// Emit a TB(N)Z instruction which tests \p Bit in \p TestReg.
  /// \p IsNegative is true if the test should be "not zero".
  /// This will also optimize the test bit instruction when possible.
  MachineInstr *emitTestBit(Register TestReg, uint64_t Bit, bool IsNegative,
                            MachineBasicBlock *DstMBB,
                            MachineIRBuilder &MIB) const;

  /// Emit a CB(N)Z instruction which branches to \p DestMBB.
  MachineInstr *emitCBZ(Register CompareReg, bool IsNegative,
                        MachineBasicBlock *DestMBB,
                        MachineIRBuilder &MIB) const;

  // Equivalent to the i32shift_a and friends from AArch64InstrInfo.td.
  // We use these manually instead of using the importer since it doesn't
  // support SDNodeXForm.
  ComplexRendererFns selectShiftA_32(const MachineOperand &Root) const;
  ComplexRendererFns selectShiftB_32(const MachineOperand &Root) const;
  ComplexRendererFns selectShiftA_64(const MachineOperand &Root) const;
  ComplexRendererFns selectShiftB_64(const MachineOperand &Root) const;

  ComplexRendererFns select12BitValueWithLeftShift(uint64_t Immed) const;
  ComplexRendererFns selectArithImmed(MachineOperand &Root) const;
  ComplexRendererFns selectNegArithImmed(MachineOperand &Root) const;

  ComplexRendererFns selectAddrModeUnscaled(MachineOperand &Root,
                                            unsigned Size) const;

  ComplexRendererFns selectAddrModeUnscaled8(MachineOperand &Root) const {
    return selectAddrModeUnscaled(Root, 1);
  }
  ComplexRendererFns selectAddrModeUnscaled16(MachineOperand &Root) const {
    return selectAddrModeUnscaled(Root, 2);
  }
  ComplexRendererFns selectAddrModeUnscaled32(MachineOperand &Root) const {
    return selectAddrModeUnscaled(Root, 4);
  }
  ComplexRendererFns selectAddrModeUnscaled64(MachineOperand &Root) const {
    return selectAddrModeUnscaled(Root, 8);
  }
  ComplexRendererFns selectAddrModeUnscaled128(MachineOperand &Root) const {
    return selectAddrModeUnscaled(Root, 16);
  }

  /// Helper to try to fold in a GISEL_ADD_LOW into an immediate, to be used
  /// from complex pattern matchers like selectAddrModeIndexed().
  ComplexRendererFns tryFoldAddLowIntoImm(MachineInstr &RootDef, unsigned Size,
                                          MachineRegisterInfo &MRI) const;

  ComplexRendererFns selectAddrModeIndexed(MachineOperand &Root,
                                           unsigned Size) const;
  template <int Width>
  ComplexRendererFns selectAddrModeIndexed(MachineOperand &Root) const {
    return selectAddrModeIndexed(Root, Width / 8);
  }

  bool isWorthFoldingIntoExtendedReg(MachineInstr &MI,
                                     const MachineRegisterInfo &MRI) const;
  ComplexRendererFns
  selectAddrModeShiftedExtendXReg(MachineOperand &Root,
                                  unsigned SizeInBytes) const;

  /// Returns a \p ComplexRendererFns which contains a base, offset, and whether
  /// or not a shift + extend should be folded into an addressing mode. Returns
  /// None when this is not profitable or possible.
  ComplexRendererFns
  selectExtendedSHL(MachineOperand &Root, MachineOperand &Base,
                    MachineOperand &Offset, unsigned SizeInBytes,
                    bool WantsExt) const;
  ComplexRendererFns selectAddrModeRegisterOffset(MachineOperand &Root) const;
  ComplexRendererFns selectAddrModeXRO(MachineOperand &Root,
                                       unsigned SizeInBytes) const;
  template <int Width>
  ComplexRendererFns selectAddrModeXRO(MachineOperand &Root) const {
    return selectAddrModeXRO(Root, Width / 8);
  }

  ComplexRendererFns selectAddrModeWRO(MachineOperand &Root,
                                       unsigned SizeInBytes) const;
  template <int Width>
  ComplexRendererFns selectAddrModeWRO(MachineOperand &Root) const {
    return selectAddrModeWRO(Root, Width / 8);
  }

  ComplexRendererFns selectShiftedRegister(MachineOperand &Root) const;

  ComplexRendererFns selectArithShiftedRegister(MachineOperand &Root) const {
    return selectShiftedRegister(Root);
  }

  ComplexRendererFns selectLogicalShiftedRegister(MachineOperand &Root) const {
    // TODO: selectShiftedRegister should allow for rotates on logical shifts.
    // For now, make them the same. The only difference between the two is that
    // logical shifts are allowed to fold in rotates. Otherwise, these are
    // functionally the same.
    return selectShiftedRegister(Root);
  }

  /// Given an extend instruction, determine the correct shift-extend type for
  /// that instruction.
  ///
  /// If the instruction is going to be used in a load or store, pass
  /// \p IsLoadStore = true.
  AArch64_AM::ShiftExtendType
  getExtendTypeForInst(MachineInstr &MI, MachineRegisterInfo &MRI,
                       bool IsLoadStore = false) const;

  /// Move \p Reg to \p RC if \p Reg is not already on \p RC.
  ///
  /// \returns Either \p Reg if no change was necessary, or the new register
  /// created by moving \p Reg.
  ///
  /// Note: This uses emitCopy right now.
  Register moveScalarRegClass(Register Reg, const TargetRegisterClass &RC,
                              MachineIRBuilder &MIB) const;

  ComplexRendererFns selectArithExtendedRegister(MachineOperand &Root) const;

  void renderTruncImm(MachineInstrBuilder &MIB, const MachineInstr &MI,
                      int OpIdx = -1) const;
  void renderLogicalImm32(MachineInstrBuilder &MIB, const MachineInstr &I,
                          int OpIdx = -1) const;
  void renderLogicalImm64(MachineInstrBuilder &MIB, const MachineInstr &I,
                          int OpIdx = -1) const;
  void renderFPImm16(MachineInstrBuilder &MIB, const MachineInstr &MI,
                     int OpIdx = -1) const;
  void renderFPImm32(MachineInstrBuilder &MIB, const MachineInstr &MI,
                     int OpIdx = -1) const;
  void renderFPImm64(MachineInstrBuilder &MIB, const MachineInstr &MI,
                     int OpIdx = -1) const;

  // Materialize a GlobalValue or BlockAddress using a movz+movk sequence.
  void materializeLargeCMVal(MachineInstr &I, const Value *V, unsigned OpFlags);

  // Optimization methods.
  bool tryOptSelect(MachineInstr &MI);
  MachineInstr *tryFoldIntegerCompare(MachineOperand &LHS, MachineOperand &RHS,
                                      MachineOperand &Predicate,
                                      MachineIRBuilder &MIRBuilder) const;

  /// Return true if \p MI is a load or store of \p NumBytes bytes.
  bool isLoadStoreOfNumBytes(const MachineInstr &MI, unsigned NumBytes) const;

  /// Returns true if \p MI is guaranteed to have the high-half of a 64-bit
  /// register zeroed out. In other words, the result of MI has been explicitly
  /// zero extended.
  bool isDef32(const MachineInstr &MI) const;

  const AArch64TargetMachine &TM;
  const AArch64Subtarget &STI;
  const AArch64InstrInfo &TII;
  const AArch64RegisterInfo &TRI;
  const AArch64RegisterBankInfo &RBI;

  bool ProduceNonFlagSettingCondBr = false;

  // Some cached values used during selection.
  // We use LR as a live-in register, and we keep track of it here as it can be
  // clobbered by calls.
  Register MFReturnAddr;

  MachineIRBuilder MIB;

#define GET_GLOBALISEL_PREDICATES_DECL
#include "AArch64GenGlobalISel.inc"
#undef GET_GLOBALISEL_PREDICATES_DECL

// We declare the temporaries used by selectImpl() in the class to minimize the
// cost of constructing placeholder values.
#define GET_GLOBALISEL_TEMPORARIES_DECL
#include "AArch64GenGlobalISel.inc"
#undef GET_GLOBALISEL_TEMPORARIES_DECL
};

} // end anonymous namespace

#define GET_GLOBALISEL_IMPL
#include "AArch64GenGlobalISel.inc"
#undef GET_GLOBALISEL_IMPL

AArch64InstructionSelector::AArch64InstructionSelector(
    const AArch64TargetMachine &TM, const AArch64Subtarget &STI,
    const AArch64RegisterBankInfo &RBI)
    : InstructionSelector(), TM(TM), STI(STI), TII(*STI.getInstrInfo()),
      TRI(*STI.getRegisterInfo()), RBI(RBI),
#define GET_GLOBALISEL_PREDICATES_INIT
#include "AArch64GenGlobalISel.inc"
#undef GET_GLOBALISEL_PREDICATES_INIT
#define GET_GLOBALISEL_TEMPORARIES_INIT
#include "AArch64GenGlobalISel.inc"
#undef GET_GLOBALISEL_TEMPORARIES_INIT
{
}

// FIXME: This should be target-independent, inferred from the types declared
// for each class in the bank.
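/// Return the register class to use for a value of type \p Ty on register
/// bank \p RB, or nullptr if the combination is unsupported. If
/// \p GetAllRegSet is true, the GPR "all" variants (e.g. GPR32all) are
/// returned instead of the plain classes.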
static const TargetRegisterClass *
getRegClassForTypeOnBank(LLT Ty, const RegisterBank &RB,
                         const RegisterBankInfo &RBI,
                         bool GetAllRegSet = false) {
  if (RB.getID() == AArch64::GPRRegBankID) {
    if (Ty.getSizeInBits() <= 32)
      return GetAllRegSet ? &AArch64::GPR32allRegClass
                          : &AArch64::GPR32RegClass;
    if (Ty.getSizeInBits() == 64)
      return GetAllRegSet ? &AArch64::GPR64allRegClass
                          : &AArch64::GPR64RegClass;
    if (Ty.getSizeInBits() == 128)
      return &AArch64::XSeqPairsClassRegClass;
    return nullptr;
  }

  if (RB.getID() == AArch64::FPRRegBankID) {
    if (Ty.getSizeInBits() <= 16)
      return &AArch64::FPR16RegClass;
    if (Ty.getSizeInBits() == 32)
      return &AArch64::FPR32RegClass;
    if (Ty.getSizeInBits() == 64)
      return &AArch64::FPR64RegClass;
    if (Ty.getSizeInBits() == 128)
      return &AArch64::FPR128RegClass;
    return nullptr;
  }

  return nullptr;
}

/// Given a register bank, and size in bits, return the smallest register class
/// that can represent that combination.
static const TargetRegisterClass *
getMinClassForRegBank(const RegisterBank &RB, unsigned SizeInBits,
                      bool GetAllRegSet = false) {
  unsigned RegBankID = RB.getID();

  if (RegBankID == AArch64::GPRRegBankID) {
    if (SizeInBits <= 32)
      return GetAllRegSet ? &AArch64::GPR32allRegClass
                          : &AArch64::GPR32RegClass;
    if (SizeInBits == 64)
      return GetAllRegSet ? &AArch64::GPR64allRegClass
                          : &AArch64::GPR64RegClass;
    if (SizeInBits == 128)
      return &AArch64::XSeqPairsClassRegClass;
  }

  if (RegBankID == AArch64::FPRRegBankID) {
    switch (SizeInBits) {
    default:
      return nullptr;
    case 8:
      return &AArch64::FPR8RegClass;
    case 16:
      return &AArch64::FPR16RegClass;
    case 32:
      return &AArch64::FPR32RegClass;
    case 64:
      return &AArch64::FPR64RegClass;
    case 128:
      return &AArch64::FPR128RegClass;
    }
  }

  return nullptr;
}

/// Returns the correct subregister to use for a given register class.
static bool getSubRegForClass(const TargetRegisterClass *RC,
                              const TargetRegisterInfo &TRI, unsigned &SubReg) {
  switch (TRI.getRegSizeInBits(*RC)) {
  case 8:
    SubReg = AArch64::bsub;
    break;
  case 16:
    SubReg = AArch64::hsub;
    break;
  case 32:
    if (RC != &AArch64::FPR32RegClass)
      SubReg = AArch64::sub_32;
    else
      SubReg = AArch64::ssub;
    break;
  case 64:
    SubReg = AArch64::dsub;
    break;
  default:
    LLVM_DEBUG(
        dbgs() << "Couldn't find appropriate subregister for register class.");
    return false;
  }

  return true;
}

/// Returns the minimum size the given register bank can hold.
static unsigned getMinSizeForRegBank(const RegisterBank &RB) {
  switch (RB.getID()) {
  case AArch64::GPRRegBankID:
    return 32;
  case AArch64::FPRRegBankID:
    return 8;
  default:
    llvm_unreachable("Tried to get minimum size for unknown register bank.");
  }
}

/// Create a REG_SEQUENCE instruction using the registers in \p Regs.
/// Helper function for functions like createDTuple and createQTuple.
///
/// \p RegClassIDs - The list of register class IDs available for some tuple of
/// a scalar class. E.g. QQRegClassID, QQQRegClassID, QQQQRegClassID. This is
/// expected to contain between 2 and 4 tuple classes.
///
/// \p SubRegs - The list of subregister classes associated with each register
/// class ID in \p RegClassIDs. E.g., QQRegClassID should use the qsub0
/// subregister class. The index of each subregister class is expected to
/// correspond with the index of each register class.
///
/// \returns Either the destination register of REG_SEQUENCE instruction that
/// was created, or the 0th element of \p Regs if \p Regs contains a single
/// element.
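///
/// For illustration only (the register names here are hypothetical), a
/// two-element Q-register tuple built through createQTuple is roughly:
///
/// \code
///   %tuple:qq = REG_SEQUENCE %q0, qsub0, %q1, qsub1
/// \endcode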
static Register createTuple(ArrayRef<Register> Regs,
                            const unsigned RegClassIDs[],
                            const unsigned SubRegs[], MachineIRBuilder &MIB) {
  unsigned NumRegs = Regs.size();
  if (NumRegs == 1)
    return Regs[0];
  assert(NumRegs >= 2 && NumRegs <= 4 &&
         "Only support between 2 and 4 registers in a tuple!");
  const TargetRegisterInfo *TRI = MIB.getMF().getSubtarget().getRegisterInfo();
  auto *DesiredClass = TRI->getRegClass(RegClassIDs[NumRegs - 2]);
  auto RegSequence =
      MIB.buildInstr(TargetOpcode::REG_SEQUENCE, {DesiredClass}, {});
  for (unsigned I = 0, E = Regs.size(); I < E; ++I) {
    RegSequence.addUse(Regs[I]);
    RegSequence.addImm(SubRegs[I]);
  }
  return RegSequence.getReg(0);
}

/// Create a tuple of D-registers using the registers in \p Regs.
static Register createDTuple(ArrayRef<Register> Regs, MachineIRBuilder &MIB) {
  static const unsigned RegClassIDs[] = {
      AArch64::DDRegClassID, AArch64::DDDRegClassID, AArch64::DDDDRegClassID};
  static const unsigned SubRegs[] = {AArch64::dsub0, AArch64::dsub1,
                                     AArch64::dsub2, AArch64::dsub3};
  return createTuple(Regs, RegClassIDs, SubRegs, MIB);
}

/// Create a tuple of Q-registers using the registers in \p Regs.
static Register createQTuple(ArrayRef<Register> Regs, MachineIRBuilder &MIB) {
  static const unsigned RegClassIDs[] = {
      AArch64::QQRegClassID, AArch64::QQQRegClassID, AArch64::QQQQRegClassID};
  static const unsigned SubRegs[] = {AArch64::qsub0, AArch64::qsub1,
                                     AArch64::qsub2, AArch64::qsub3};
  return createTuple(Regs, RegClassIDs, SubRegs, MIB);
}

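/// Try to extract a constant value from the machine operand \p Root. Handles
/// immediate and constant-immediate operands directly; for register operands,
/// looks through the register's definition for a constant. Returns None if no
/// constant can be found.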
static Optional<uint64_t> getImmedFromMO(const MachineOperand &Root) {
  auto &MI = *Root.getParent();
  auto &MBB = *MI.getParent();
  auto &MF = *MBB.getParent();
  auto &MRI = MF.getRegInfo();
  uint64_t Immed;
  if (Root.isImm())
    Immed = Root.getImm();
  else if (Root.isCImm())
    Immed = Root.getCImm()->getZExtValue();
  else if (Root.isReg()) {
    auto ValAndVReg =
        getConstantVRegValWithLookThrough(Root.getReg(), MRI, true);
    if (!ValAndVReg)
      return None;
    Immed = ValAndVReg->Value.getSExtValue();
  } else
    return None;
  return Immed;
}

/// Check whether \p I is a currently unsupported binary operation:
/// - it has an unsized type
/// - an operand is not a vreg
/// - not all operands are on the same register bank
/// These are checks that should someday live in the verifier, but right now,
/// these are mostly limitations of the AArch64 selector.
static bool unsupportedBinOp(const MachineInstr &I,
                             const AArch64RegisterBankInfo &RBI,
                             const MachineRegisterInfo &MRI,
                             const AArch64RegisterInfo &TRI) {
  LLT Ty = MRI.getType(I.getOperand(0).getReg());
  if (!Ty.isValid()) {
    LLVM_DEBUG(dbgs() << "Generic binop register should be typed\n");
    return true;
  }

  const RegisterBank *PrevOpBank = nullptr;
  for (auto &MO : I.operands()) {
    // FIXME: Support non-register operands.
    if (!MO.isReg()) {
      LLVM_DEBUG(dbgs() << "Generic inst non-reg operands are unsupported\n");
      return true;
    }

    // FIXME: Can generic operations have physical register operands? If
    // so, this will need to be taught about that, and we'll need to get the
    // bank out of the minimal class for the register.
    // Either way, this needs to be documented (and possibly verified).
    if (!Register::isVirtualRegister(MO.getReg())) {
      LLVM_DEBUG(dbgs() << "Generic inst has physical register operand\n");
      return true;
    }

    const RegisterBank *OpBank = RBI.getRegBank(MO.getReg(), MRI, TRI);
    if (!OpBank) {
      LLVM_DEBUG(dbgs() << "Generic register has no bank or class\n");
      return true;
    }

    if (PrevOpBank && OpBank != PrevOpBank) {
      LLVM_DEBUG(dbgs() << "Generic inst operands have different banks\n");
      return true;
    }
    PrevOpBank = OpBank;
  }
  return false;
}

/// Select the AArch64 opcode for the basic binary operation \p GenericOpc
/// (such as G_OR or G_SDIV), appropriate for the register bank \p RegBankID
/// and of size \p OpSize.
/// \returns \p GenericOpc if the combination is unsupported.
static unsigned selectBinaryOp(unsigned GenericOpc, unsigned RegBankID,
                               unsigned OpSize) {
  switch (RegBankID) {
  case AArch64::GPRRegBankID:
    if (OpSize == 32) {
      switch (GenericOpc) {
      case TargetOpcode::G_SHL:
        return AArch64::LSLVWr;
      case TargetOpcode::G_LSHR:
        return AArch64::LSRVWr;
      case TargetOpcode::G_ASHR:
        return AArch64::ASRVWr;
      default:
        return GenericOpc;
      }
    } else if (OpSize == 64) {
      switch (GenericOpc) {
      case TargetOpcode::G_PTR_ADD:
        return AArch64::ADDXrr;
      case TargetOpcode::G_SHL:
        return AArch64::LSLVXr;
      case TargetOpcode::G_LSHR:
        return AArch64::LSRVXr;
      case TargetOpcode::G_ASHR:
        return AArch64::ASRVXr;
      default:
        return GenericOpc;
      }
    }
    break;
  case AArch64::FPRRegBankID:
    switch (OpSize) {
    case 32:
      switch (GenericOpc) {
      case TargetOpcode::G_FADD:
        return AArch64::FADDSrr;
      case TargetOpcode::G_FSUB:
        return AArch64::FSUBSrr;
      case TargetOpcode::G_FMUL:
        return AArch64::FMULSrr;
      case TargetOpcode::G_FDIV:
        return AArch64::FDIVSrr;
      default:
        return GenericOpc;
      }
    case 64:
      switch (GenericOpc) {
      case TargetOpcode::G_FADD:
        return AArch64::FADDDrr;
      case TargetOpcode::G_FSUB:
        return AArch64::FSUBDrr;
      case TargetOpcode::G_FMUL:
        return AArch64::FMULDrr;
      case TargetOpcode::G_FDIV:
        return AArch64::FDIVDrr;
      case TargetOpcode::G_OR:
        return AArch64::ORRv8i8;
      default:
        return GenericOpc;
      }
    }
    break;
  }
  return GenericOpc;
}

/// Select the AArch64 opcode for the G_LOAD or G_STORE operation \p GenericOpc,
/// appropriate for the (value) register bank \p RegBankID and of memory access
/// size \p OpSize.  This returns the variant with the base+unsigned-immediate
/// addressing mode (e.g., LDRXui).
/// \returns \p GenericOpc if the combination is unsupported.
static unsigned selectLoadStoreUIOp(unsigned GenericOpc, unsigned RegBankID,
                                    unsigned OpSize) {
  const bool isStore = GenericOpc == TargetOpcode::G_STORE;
  switch (RegBankID) {
  case AArch64::GPRRegBankID:
    switch (OpSize) {
    case 8:
      return isStore ? AArch64::STRBBui : AArch64::LDRBBui;
    case 16:
      return isStore ? AArch64::STRHHui : AArch64::LDRHHui;
    case 32:
      return isStore ? AArch64::STRWui : AArch64::LDRWui;
    case 64:
      return isStore ? AArch64::STRXui : AArch64::LDRXui;
    }
    break;
  case AArch64::FPRRegBankID:
    switch (OpSize) {
    case 8:
      return isStore ? AArch64::STRBui : AArch64::LDRBui;
    case 16:
      return isStore ? AArch64::STRHui : AArch64::LDRHui;
    case 32:
      return isStore ? AArch64::STRSui : AArch64::LDRSui;
    case 64:
      return isStore ? AArch64::STRDui : AArch64::LDRDui;
    }
    break;
  }
  return GenericOpc;
}

#ifndef NDEBUG
/// Helper function that verifies that we have a valid copy at the end of
/// selectCopy. Verifies that the source and dest have the expected sizes and
/// then returns true.
static bool isValidCopy(const MachineInstr &I, const RegisterBank &DstBank,
                        const MachineRegisterInfo &MRI,
                        const TargetRegisterInfo &TRI,
                        const RegisterBankInfo &RBI) {
  const Register DstReg = I.getOperand(0).getReg();
  const Register SrcReg = I.getOperand(1).getReg();
  const unsigned DstSize = RBI.getSizeInBits(DstReg, MRI, TRI);
  const unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI);

  // Make sure the sizes of the source and destination line up.
  assert(
      (DstSize == SrcSize ||
       // Copies are a means to set up initial types; the number of
       // bits may not exactly match.
       (Register::isPhysicalRegister(SrcReg) && DstSize <= SrcSize) ||
       // Copies are a means to copy bits around, and as long as we are
       // on the same register class, that's fine. Otherwise, that
       // means we need some SUBREG_TO_REG or AND & co.
       (((DstSize + 31) / 32 == (SrcSize + 31) / 32) && DstSize > SrcSize)) &&
      "Copy with different width?!");
  // Check the size of the destination.
  assert((DstSize <= 64 || DstBank.getID() == AArch64::FPRRegBankID) &&
         "GPRs cannot get more than 64-bit width values");

  return true;
}
#endif

/// Helper function for selectCopy. Inserts a subregister copy from \p SrcReg
/// to \p *To.
///
/// E.g "To = COPY SrcReg:SubReg"
static bool copySubReg(MachineInstr &I, MachineRegisterInfo &MRI,
                       const RegisterBankInfo &RBI, Register SrcReg,
                       const TargetRegisterClass *To, unsigned SubReg) {
  assert(SrcReg.isValid() && "Expected a valid source register?");
  assert(To && "Destination register class cannot be null");
  assert(SubReg && "Expected a valid subregister");

  MachineIRBuilder MIB(I);
  auto SubRegCopy =
      MIB.buildInstr(TargetOpcode::COPY, {To}, {}).addReg(SrcReg, 0, SubReg);
  MachineOperand &RegOp = I.getOperand(1);
  RegOp.setReg(SubRegCopy.getReg(0));

  // It's possible that the destination register won't be constrained. Make
  // sure that happens.
  if (!Register::isPhysicalRegister(I.getOperand(0).getReg()))
    RBI.constrainGenericRegister(I.getOperand(0).getReg(), *To, MRI);

  return true;
}

/// Helper function to get the source and destination register classes for a
/// copy. Returns a std::pair containing the source register class for the
/// copy, and the destination register class for the copy. If a register class
/// cannot be determined, then it will be nullptr.
static std::pair<const TargetRegisterClass *, const TargetRegisterClass *>
getRegClassesForCopy(MachineInstr &I, const TargetInstrInfo &TII,
                     MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
                     const RegisterBankInfo &RBI) {
  Register DstReg = I.getOperand(0).getReg();
  Register SrcReg = I.getOperand(1).getReg();
  const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI);
  const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI);
  unsigned DstSize = RBI.getSizeInBits(DstReg, MRI, TRI);
  unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI);

  // Special casing for cross-bank copies of s1s. We can technically represent
  // a 1-bit value with any size of register. The minimum size for a GPR is 32
  // bits. So, we need to put the FPR on 32 bits as well.
  //
  // FIXME: I'm not sure if this case holds true outside of copies. If it does,
  // then we can pull it into the helpers that get the appropriate class for a
  // register bank. Or make a new helper that carries along some constraint
  // information.
  if (SrcRegBank != DstRegBank && (DstSize == 1 && SrcSize == 1))
    SrcSize = DstSize = 32;

  return {getMinClassForRegBank(SrcRegBank, SrcSize, true),
          getMinClassForRegBank(DstRegBank, DstSize, true)};
}

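/// Select a COPY (or a copy-like instruction such as a G_ZEXT that can be
/// reduced to a copy), inserting subregister copies or a SUBREG_TO_REG where
/// the source and destination sizes differ, and constraining the destination
/// register class.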
static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
                       MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
                       const RegisterBankInfo &RBI) {
  Register DstReg = I.getOperand(0).getReg();
  Register SrcReg = I.getOperand(1).getReg();
  const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI);
  const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI);

  // Find the correct register classes for the source and destination registers.
  const TargetRegisterClass *SrcRC;
  const TargetRegisterClass *DstRC;
  std::tie(SrcRC, DstRC) = getRegClassesForCopy(I, TII, MRI, TRI, RBI);

  if (!DstRC) {
    LLVM_DEBUG(dbgs() << "Unexpected dest size "
                      << RBI.getSizeInBits(DstReg, MRI, TRI) << '\n');
    return false;
  }

  // A couple helpers below, for making sure that the copy we produce is valid.

  // Set to true if we insert a SUBREG_TO_REG. If we do this, then we don't want
  // to verify that the src and dst are the same size, since that's handled by
  // the SUBREG_TO_REG.
  bool KnownValid = false;

  // Returns true, or asserts if something we don't expect happens. Instead of
  // returning true, we return isValidCopy() to ensure that we verify the
  // result.
  auto CheckCopy = [&]() {
    // If we have a bitcast or something, we can't have physical registers.
    assert((I.isCopy() ||
            (!Register::isPhysicalRegister(I.getOperand(0).getReg()) &&
             !Register::isPhysicalRegister(I.getOperand(1).getReg()))) &&
           "No phys reg on generic operator!");
    bool ValidCopy = true;
#ifndef NDEBUG
    ValidCopy = KnownValid || isValidCopy(I, DstRegBank, MRI, TRI, RBI);
    assert(ValidCopy && "Invalid copy.");
#endif
    (void)KnownValid;
    return ValidCopy;
  };

  // Is this a copy? If so, then we may need to insert a subregister copy.
  if (I.isCopy()) {
    // Yes. Check if there's anything to fix up.
    if (!SrcRC) {
      LLVM_DEBUG(dbgs() << "Couldn't determine source register class\n");
      return false;
    }

    unsigned SrcSize = TRI.getRegSizeInBits(*SrcRC);
    unsigned DstSize = TRI.getRegSizeInBits(*DstRC);
    unsigned SubReg;

    // If the source bank doesn't support a subregister copy small enough,
    // then we first need to copy to the destination bank.
    if (getMinSizeForRegBank(SrcRegBank) > DstSize) {
      const TargetRegisterClass *DstTempRC =
          getMinClassForRegBank(DstRegBank, SrcSize, /* GetAllRegSet */ true);
      getSubRegForClass(DstRC, TRI, SubReg);

      MachineIRBuilder MIB(I);
      auto Copy = MIB.buildCopy({DstTempRC}, {SrcReg});
      copySubReg(I, MRI, RBI, Copy.getReg(0), DstRC, SubReg);
    } else if (SrcSize > DstSize) {
      // If the source register is bigger than the destination we need to
      // perform a subregister copy.
      const TargetRegisterClass *SubRegRC =
          getMinClassForRegBank(SrcRegBank, DstSize, /* GetAllRegSet */ true);
      getSubRegForClass(SubRegRC, TRI, SubReg);
      copySubReg(I, MRI, RBI, SrcReg, DstRC, SubReg);
    } else if (DstSize > SrcSize) {
      // If the destination register is bigger than the source we need to do
      // a promotion using SUBREG_TO_REG.
      const TargetRegisterClass *PromotionRC =
          getMinClassForRegBank(SrcRegBank, DstSize, /* GetAllRegSet */ true);
      getSubRegForClass(SrcRC, TRI, SubReg);

      Register PromoteReg = MRI.createVirtualRegister(PromotionRC);
      BuildMI(*I.getParent(), I, I.getDebugLoc(),
              TII.get(AArch64::SUBREG_TO_REG), PromoteReg)
          .addImm(0)
          .addUse(SrcReg)
          .addImm(SubReg);
      MachineOperand &RegOp = I.getOperand(1);
      RegOp.setReg(PromoteReg);

      // Promise that the copy is implicitly validated by the SUBREG_TO_REG.
      KnownValid = true;
    }

    // If the destination is a physical register, then there's nothing to
    // change, so we're done.
    if (Register::isPhysicalRegister(DstReg))
      return CheckCopy();
  }

  // No need to constrain SrcReg. It will get constrained when we hit another
  // of its uses or defs. Copies do not have constraints.
  if (!RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
    LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode())
                      << " operand\n");
    return false;
  }

  // If this is a GPR ZEXT, reduce it down into a plain copy.
  // The sizes will be mismatched with the source < 32b, but that's OK.
  if (I.getOpcode() == TargetOpcode::G_ZEXT) {
    I.setDesc(TII.get(AArch64::COPY));
    assert(SrcRegBank.getID() == AArch64::GPRRegBankID);
    return selectCopy(I, TII, MRI, TRI, RBI);
  }

  I.setDesc(TII.get(AArch64::COPY));
  return CheckCopy();
}

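/// Select the AArch64 opcode for the FP/integer conversion \p GenericOpc
/// (G_SITOFP, G_UITOFP, G_FPTOSI or G_FPTOUI) between the scalar types
/// \p SrcTy and \p DstTy.
/// \returns \p GenericOpc if the combination is unsupported.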
static unsigned selectFPConvOpc(unsigned GenericOpc, LLT DstTy, LLT SrcTy) {
  if (!DstTy.isScalar() || !SrcTy.isScalar())
    return GenericOpc;

  const unsigned DstSize = DstTy.getSizeInBits();
  const unsigned SrcSize = SrcTy.getSizeInBits();

  switch (DstSize) {
  case 32:
    switch (SrcSize) {
    case 32:
      switch (GenericOpc) {
      case TargetOpcode::G_SITOFP:
        return AArch64::SCVTFUWSri;
      case TargetOpcode::G_UITOFP:
        return AArch64::UCVTFUWSri;
      case TargetOpcode::G_FPTOSI:
        return AArch64::FCVTZSUWSr;
      case TargetOpcode::G_FPTOUI:
        return AArch64::FCVTZUUWSr;
      default:
        return GenericOpc;
      }
    case 64:
      switch (GenericOpc) {
      case TargetOpcode::G_SITOFP:
        return AArch64::SCVTFUXSri;
      case TargetOpcode::G_UITOFP:
        return AArch64::UCVTFUXSri;
      case TargetOpcode::G_FPTOSI:
        return AArch64::FCVTZSUWDr;
      case TargetOpcode::G_FPTOUI:
        return AArch64::FCVTZUUWDr;
      default:
        return GenericOpc;
      }
    default:
      return GenericOpc;
    }
  case 64:
    switch (SrcSize) {
    case 32:
      switch (GenericOpc) {
      case TargetOpcode::G_SITOFP:
        return AArch64::SCVTFUWDri;
      case TargetOpcode::G_UITOFP:
        return AArch64::UCVTFUWDri;
      case TargetOpcode::G_FPTOSI:
        return AArch64::FCVTZSUXSr;
      case TargetOpcode::G_FPTOUI:
        return AArch64::FCVTZUUXSr;
      default:
        return GenericOpc;
      }
    case 64:
      switch (GenericOpc) {
      case TargetOpcode::G_SITOFP:
        return AArch64::SCVTFUXDri;
      case TargetOpcode::G_UITOFP:
        return AArch64::UCVTFUXDri;
      case TargetOpcode::G_FPTOSI:
        return AArch64::FCVTZSUXDr;
      case TargetOpcode::G_FPTOUI:
        return AArch64::FCVTZUUXDr;
      default:
        return GenericOpc;
      }
    default:
      return GenericOpc;
    }
  default:
    return GenericOpc;
  };
  return GenericOpc;
}

MachineInstr *
AArch64InstructionSelector::emitSelect(Register Dst, Register True,
                                       Register False, AArch64CC::CondCode CC,
                                       MachineIRBuilder &MIB) const {
  MachineRegisterInfo &MRI = *MIB.getMRI();
  assert(RBI.getRegBank(False, MRI, TRI)->getID() ==
             RBI.getRegBank(True, MRI, TRI)->getID() &&
         "Expected both select operands to have the same regbank?");
  LLT Ty = MRI.getType(True);
  if (Ty.isVector())
    return nullptr;
  const unsigned Size = Ty.getSizeInBits();
  assert((Size == 32 || Size == 64) &&
         "Expected 32 bit or 64 bit select only?");
  const bool Is32Bit = Size == 32;
  if (RBI.getRegBank(True, MRI, TRI)->getID() != AArch64::GPRRegBankID) {
    unsigned Opc = Is32Bit ? AArch64::FCSELSrrr : AArch64::FCSELDrrr;
    auto FCSel = MIB.buildInstr(Opc, {Dst}, {True, False}).addImm(CC);
    constrainSelectedInstRegOperands(*FCSel, TII, TRI, RBI);
    return &*FCSel;
  }

  // By default, we'll try and emit a CSEL.
  unsigned Opc = Is32Bit ? AArch64::CSELWr : AArch64::CSELXr;
  bool Optimized = false;
  auto TryFoldBinOpIntoSelect = [&Opc, Is32Bit, &CC, &MRI,
                                 &Optimized](Register &Reg, Register &OtherReg,
                                             bool Invert) {
    if (Optimized)
      return false;

    // Attempt to fold:
    //
    // %sub = G_SUB 0, %x
    // %select = G_SELECT cc, %reg, %sub
    //
    // Into:
    // %select = CSNEG %reg, %x, cc
    Register MatchReg;
    if (mi_match(Reg, MRI, m_Neg(m_Reg(MatchReg)))) {
      Opc = Is32Bit ? AArch64::CSNEGWr : AArch64::CSNEGXr;
      Reg = MatchReg;
      if (Invert) {
        CC = AArch64CC::getInvertedCondCode(CC);
        std::swap(Reg, OtherReg);
      }
      return true;
    }

    // Attempt to fold:
    //
    // %xor = G_XOR %x, -1
    // %select = G_SELECT cc, %reg, %xor
    //
    // Into:
    // %select = CSINV %reg, %x, cc
    if (mi_match(Reg, MRI, m_Not(m_Reg(MatchReg)))) {
      Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
      Reg = MatchReg;
      if (Invert) {
        CC = AArch64CC::getInvertedCondCode(CC);
        std::swap(Reg, OtherReg);
      }
      return true;
    }

    // Attempt to fold:
    //
    // %add = G_ADD %x, 1
    // %select = G_SELECT cc, %reg, %add
    //
    // Into:
    // %select = CSINC %reg, %x, cc
    if (mi_match(Reg, MRI,
                 m_any_of(m_GAdd(m_Reg(MatchReg), m_SpecificICst(1)),
                          m_GPtrAdd(m_Reg(MatchReg), m_SpecificICst(1))))) {
      Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
      Reg = MatchReg;
      if (Invert) {
        CC = AArch64CC::getInvertedCondCode(CC);
        std::swap(Reg, OtherReg);
      }
      return true;
    }

    return false;
  };

  // Helper lambda which tries to use CSINC/CSINV for the instruction when its
  // true/false values are constants.
  // FIXME: All of these patterns already exist in tablegen. We should be
  // able to import these.
  auto TryOptSelectCst = [&Opc, &True, &False, &CC, Is32Bit, &MRI,
                          &Optimized]() {
    if (Optimized)
      return false;
    auto TrueCst = getConstantVRegValWithLookThrough(True, MRI);
    auto FalseCst = getConstantVRegValWithLookThrough(False, MRI);
    if (!TrueCst && !FalseCst)
      return false;

    Register ZReg = Is32Bit ? AArch64::WZR : AArch64::XZR;
    if (TrueCst && FalseCst) {
      int64_t T = TrueCst->Value.getSExtValue();
      int64_t F = FalseCst->Value.getSExtValue();

      if (T == 0 && F == 1) {
        // G_SELECT cc, 0, 1 -> CSINC zreg, zreg, cc
        Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
        True = ZReg;
        False = ZReg;
        return true;
      }

      if (T == 0 && F == -1) {
        // G_SELECT cc, 0, -1 -> CSINV zreg, zreg, cc
        Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
        True = ZReg;
        False = ZReg;
        return true;
      }
    }

    if (TrueCst) {
      int64_t T = TrueCst->Value.getSExtValue();
      if (T == 1) {
        // G_SELECT cc, 1, f -> CSINC f, zreg, inv_cc
        Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
        True = False;
        False = ZReg;
        CC = AArch64CC::getInvertedCondCode(CC);
        return true;
      }

      if (T == -1) {
        // G_SELECT cc, -1, f -> CSINV f, zreg, inv_cc
        Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
        True = False;
        False = ZReg;
        CC = AArch64CC::getInvertedCondCode(CC);
        return true;
      }
    }

    if (FalseCst) {
      int64_t F = FalseCst->Value.getSExtValue();
      if (F == 1) {
        // G_SELECT cc, t, 1 -> CSINC t, zreg, cc
        Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
        False = ZReg;
        return true;
      }

      if (F == -1) {
        // G_SELECT cc, t, -1 -> CSINV t, zreg, cc
        Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
        False = ZReg;
        return true;
      }
    }
    return false;
  };

  Optimized |= TryFoldBinOpIntoSelect(False, True, /*Invert = */ false);
  Optimized |= TryFoldBinOpIntoSelect(True, False, /*Invert = */ true);
  Optimized |= TryOptSelectCst();
  auto SelectInst = MIB.buildInstr(Opc, {Dst}, {True, False}).addImm(CC);
  constrainSelectedInstRegOperands(*SelectInst, TII, TRI, RBI);
  return &*SelectInst;
}

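/// Convert an integer comparison predicate to the corresponding AArch64
/// condition code.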
changeICMPPredToAArch64CC(CmpInst::Predicate P)1273 static AArch64CC::CondCode changeICMPPredToAArch64CC(CmpInst::Predicate P) {
1274   switch (P) {
1275   default:
1276     llvm_unreachable("Unknown condition code!");
1277   case CmpInst::ICMP_NE:
1278     return AArch64CC::NE;
1279   case CmpInst::ICMP_EQ:
1280     return AArch64CC::EQ;
1281   case CmpInst::ICMP_SGT:
1282     return AArch64CC::GT;
1283   case CmpInst::ICMP_SGE:
1284     return AArch64CC::GE;
1285   case CmpInst::ICMP_SLT:
1286     return AArch64CC::LT;
1287   case CmpInst::ICMP_SLE:
1288     return AArch64CC::LE;
1289   case CmpInst::ICMP_UGT:
1290     return AArch64CC::HI;
1291   case CmpInst::ICMP_UGE:
1292     return AArch64CC::HS;
1293   case CmpInst::ICMP_ULT:
1294     return AArch64CC::LO;
1295   case CmpInst::ICMP_ULE:
1296     return AArch64CC::LS;
1297   }
1298 }
1299 
1300 /// Return a register which can be used as a bit to test in a TB(N)Z.
1301 static Register getTestBitReg(Register Reg, uint64_t &Bit, bool &Invert,
1302                               MachineRegisterInfo &MRI) {
1303   assert(Reg.isValid() && "Expected valid register!");
1304   bool HasZext = false;
1305   while (MachineInstr *MI = getDefIgnoringCopies(Reg, MRI)) {
1306     unsigned Opc = MI->getOpcode();
1307 
1308     if (!MI->getOperand(0).isReg() ||
1309         !MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
1310       break;
1311 
1312     // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits.
1313     //
1314     // (tbz (trunc x), b) -> (tbz x, b) is always safe, because the bit number
1315     // on the truncated x is the same as the bit number on x.
1316     if (Opc == TargetOpcode::G_ANYEXT || Opc == TargetOpcode::G_ZEXT ||
1317         Opc == TargetOpcode::G_TRUNC) {
1318       if (Opc == TargetOpcode::G_ZEXT)
1319         HasZext = true;
1320 
1321       Register NextReg = MI->getOperand(1).getReg();
1322       // Did we find something worth folding?
1323       if (!NextReg.isValid() || !MRI.hasOneNonDBGUse(NextReg))
1324         break;
1325 
1326       // NextReg is worth folding. Keep looking.
1327       Reg = NextReg;
1328       continue;
1329     }
1330 
1331     // Attempt to find a suitable operation with a constant on one side.
1332     Optional<uint64_t> C;
1333     Register TestReg;
1334     switch (Opc) {
1335     default:
1336       break;
1337     case TargetOpcode::G_AND:
1338     case TargetOpcode::G_XOR: {
1339       TestReg = MI->getOperand(1).getReg();
1340       Register ConstantReg = MI->getOperand(2).getReg();
1341       auto VRegAndVal = getConstantVRegValWithLookThrough(ConstantReg, MRI);
1342       if (!VRegAndVal) {
1343         // AND commutes, check the other side for a constant.
1344         // FIXME: Can we canonicalize the constant so that it's always on the
1345         // same side at some point earlier?
1346         std::swap(ConstantReg, TestReg);
1347         VRegAndVal = getConstantVRegValWithLookThrough(ConstantReg, MRI);
1348       }
1349       if (VRegAndVal) {
1350         if (HasZext)
1351           C = VRegAndVal->Value.getZExtValue();
1352         else
1353           C = VRegAndVal->Value.getSExtValue();
1354       }
1355       break;
1356     }
1357     case TargetOpcode::G_ASHR:
1358     case TargetOpcode::G_LSHR:
1359     case TargetOpcode::G_SHL: {
1360       TestReg = MI->getOperand(1).getReg();
1361       auto VRegAndVal =
1362           getConstantVRegValWithLookThrough(MI->getOperand(2).getReg(), MRI);
1363       if (VRegAndVal)
1364         C = VRegAndVal->Value.getSExtValue();
1365       break;
1366     }
1367     }
1368 
1369     // Didn't find a constant or viable register. Bail out of the loop.
1370     if (!C || !TestReg.isValid())
1371       break;
1372 
1373     // We found a suitable instruction with a constant. Check to see if we can
1374     // walk through the instruction.
1375     Register NextReg;
1376     unsigned TestRegSize = MRI.getType(TestReg).getSizeInBits();
1377     switch (Opc) {
1378     default:
1379       break;
1380     case TargetOpcode::G_AND:
1381       // (tbz (and x, m), b) -> (tbz x, b) when the b-th bit of m is set.
1382       if ((*C >> Bit) & 1)
1383         NextReg = TestReg;
1384       break;
1385     case TargetOpcode::G_SHL:
1386       // (tbz (shl x, c), b) -> (tbz x, b-c) when b-c is positive and fits in
1387       // the type of the register.
1388       if (*C <= Bit && (Bit - *C) < TestRegSize) {
1389         NextReg = TestReg;
1390         Bit = Bit - *C;
1391       }
1392       break;
1393     case TargetOpcode::G_ASHR:
1394       // (tbz (ashr x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits
1395       // in x
1396       NextReg = TestReg;
1397       Bit = Bit + *C;
1398       if (Bit >= TestRegSize)
1399         Bit = TestRegSize - 1;
1400       break;
1401     case TargetOpcode::G_LSHR:
1402       // (tbz (lshr x, c), b) -> (tbz x, b+c) when b + c is < # bits in x
1403       if ((Bit + *C) < TestRegSize) {
1404         NextReg = TestReg;
1405         Bit = Bit + *C;
1406       }
1407       break;
1408     case TargetOpcode::G_XOR:
1409       // We can walk through a G_XOR by inverting whether we use tbz/tbnz when
1410       // appropriate.
1411       //
1412       // e.g. If x' = xor x, c, and the b-th bit is set in c then
1413       //
1414       // tbz x', b -> tbnz x, b
1415       //
1416       // Because x' only has the b-th bit set if x does not.
1417       if ((*C >> Bit) & 1)
1418         Invert = !Invert;
1419       NextReg = TestReg;
1420       break;
1421     }
1422 
1423     // Check if we found anything worth folding.
1424     if (!NextReg.isValid())
1425       return Reg;
1426     Reg = NextReg;
1427   }
1428 
1429   return Reg;
1430 }
1431 
1432 MachineInstr *AArch64InstructionSelector::emitTestBit(
1433     Register TestReg, uint64_t Bit, bool IsNegative, MachineBasicBlock *DstMBB,
1434     MachineIRBuilder &MIB) const {
1435   assert(TestReg.isValid());
1436   assert(ProduceNonFlagSettingCondBr &&
1437          "Cannot emit TB(N)Z with speculation tracking!");
1438   MachineRegisterInfo &MRI = *MIB.getMRI();
1439 
1440   // Attempt to optimize the test bit by walking over instructions.
1441   TestReg = getTestBitReg(TestReg, Bit, IsNegative, MRI);
1442   LLT Ty = MRI.getType(TestReg);
1443   unsigned Size = Ty.getSizeInBits();
1444   assert(!Ty.isVector() && "Expected a scalar!");
1445   assert(Bit < 64 && "Bit is too large!");
1446 
1447   // When the test register is a 64-bit register, we have to narrow to make
1448   // TBNZW work.
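  // (TB(N)ZW can only encode bit numbers 0-31; testing bits 32-63 requires the
  // X-register forms.)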
1449   bool UseWReg = Bit < 32;
1450   unsigned NecessarySize = UseWReg ? 32 : 64;
1451   if (Size != NecessarySize)
1452     TestReg = moveScalarRegClass(
1453         TestReg, UseWReg ? AArch64::GPR32RegClass : AArch64::GPR64RegClass,
1454         MIB);
1455 
1456   static const unsigned OpcTable[2][2] = {{AArch64::TBZX, AArch64::TBNZX},
1457                                           {AArch64::TBZW, AArch64::TBNZW}};
1458   unsigned Opc = OpcTable[UseWReg][IsNegative];
1459   auto TestBitMI =
1460       MIB.buildInstr(Opc).addReg(TestReg).addImm(Bit).addMBB(DstMBB);
1461   constrainSelectedInstRegOperands(*TestBitMI, TII, TRI, RBI);
1462   return &*TestBitMI;
1463 }
1464 
1465 bool AArch64InstructionSelector::tryOptAndIntoCompareBranch(
1466     MachineInstr &AndInst, bool Invert, MachineBasicBlock *DstMBB,
1467     MachineIRBuilder &MIB) const {
1468   assert(AndInst.getOpcode() == TargetOpcode::G_AND && "Expected G_AND only?");
1469   // Given something like this:
1470   //
1471   //  %x = ...Something...
1472   //  %one = G_CONSTANT i64 1
1473   //  %zero = G_CONSTANT i64 0
1474   //  %and = G_AND %x, %one
1475   //  %cmp = G_ICMP intpred(ne), %and, %zero
1476   //  %cmp_trunc = G_TRUNC %cmp
1477   //  G_BRCOND %cmp_trunc, %bb.3
1478   //
1479   // We want to try and fold the AND into the G_BRCOND and produce either a
1480   // TBNZ (when we have intpred(ne)) or a TBZ (when we have intpred(eq)).
1481   //
1482   // In this case, we'd get
1483   //
1484   // TBNZ %x %bb.3
1485   //
1486 
1487   // Check if the AND has a constant on its RHS which we can use as a mask.
1488   // If it's a power of 2, then it's the same as checking a specific bit.
1489   // (e.g., ANDing with 8 == ANDing with 000...100 == testing if bit 3 is set)
1490   auto MaybeBit = getConstantVRegValWithLookThrough(
1491       AndInst.getOperand(2).getReg(), *MIB.getMRI());
1492   if (!MaybeBit)
1493     return false;
1494 
1495   int32_t Bit = MaybeBit->Value.exactLogBase2();
1496   if (Bit < 0)
1497     return false;
1498 
1499   Register TestReg = AndInst.getOperand(1).getReg();
1500 
1501   // Emit a TB(N)Z.
1502   emitTestBit(TestReg, Bit, Invert, DstMBB, MIB);
1503   return true;
1504 }
1505 
1506 MachineInstr *AArch64InstructionSelector::emitCBZ(Register CompareReg,
1507                                                   bool IsNegative,
1508                                                   MachineBasicBlock *DestMBB,
1509                                                   MachineIRBuilder &MIB) const {
1510   assert(ProduceNonFlagSettingCondBr && "CBZ does not set flags!");
1511   MachineRegisterInfo &MRI = *MIB.getMRI();
1512   assert(RBI.getRegBank(CompareReg, MRI, TRI)->getID() ==
1513              AArch64::GPRRegBankID &&
1514          "Expected GPRs only?");
1515   auto Ty = MRI.getType(CompareReg);
1516   unsigned Width = Ty.getSizeInBits();
1517   assert(!Ty.isVector() && "Expected scalar only?");
1518   assert(Width <= 64 && "Expected width to be at most 64?");
1519   static const unsigned OpcTable[2][2] = {{AArch64::CBZW, AArch64::CBZX},
1520                                           {AArch64::CBNZW, AArch64::CBNZX}};
1521   unsigned Opc = OpcTable[IsNegative][Width == 64];
1522   auto BranchMI = MIB.buildInstr(Opc, {}, {CompareReg}).addMBB(DestMBB);
1523   constrainSelectedInstRegOperands(*BranchMI, TII, TRI, RBI);
1524   return &*BranchMI;
1525 }
1526 
1527 bool AArch64InstructionSelector::selectCompareBranchFedByFCmp(
1528     MachineInstr &I, MachineInstr &FCmp, MachineIRBuilder &MIB) const {
1529   assert(FCmp.getOpcode() == TargetOpcode::G_FCMP);
1530   assert(I.getOpcode() == TargetOpcode::G_BRCOND);
1531   // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
1532   // totally clean.  Some of them require two branches to implement.
1533   auto Pred = (CmpInst::Predicate)FCmp.getOperand(1).getPredicate();
1534   emitFPCompare(FCmp.getOperand(2).getReg(), FCmp.getOperand(3).getReg(), MIB,
1535                 Pred);
1536   AArch64CC::CondCode CC1, CC2;
1537   changeFCMPPredToAArch64CC(static_cast<CmpInst::Predicate>(Pred), CC1, CC2);
1538   MachineBasicBlock *DestMBB = I.getOperand(1).getMBB();
1539   MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC1).addMBB(DestMBB);
1540   if (CC2 != AArch64CC::AL)
1541     MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC2).addMBB(DestMBB);
1542   I.eraseFromParent();
1543   return true;
1544 }
1545 
1546 bool AArch64InstructionSelector::tryOptCompareBranchFedByICmp(
1547     MachineInstr &I, MachineInstr &ICmp, MachineIRBuilder &MIB) const {
1548   assert(ICmp.getOpcode() == TargetOpcode::G_ICMP);
1549   assert(I.getOpcode() == TargetOpcode::G_BRCOND);
1550   // Attempt to optimize the G_BRCOND + G_ICMP into a TB(N)Z/CB(N)Z.
1551   //
1552   // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z
1553   // instructions will not be produced, as they are conditional branch
1554   // instructions that do not set flags.
1555   if (!ProduceNonFlagSettingCondBr)
1556     return false;
1557 
1558   MachineRegisterInfo &MRI = *MIB.getMRI();
1559   MachineBasicBlock *DestMBB = I.getOperand(1).getMBB();
1560   auto Pred =
1561       static_cast<CmpInst::Predicate>(ICmp.getOperand(1).getPredicate());
1562   Register LHS = ICmp.getOperand(2).getReg();
1563   Register RHS = ICmp.getOperand(3).getReg();
1564 
1565   // We're allowed to emit a TB(N)Z/CB(N)Z. Try to do that.
1566   auto VRegAndVal = getConstantVRegValWithLookThrough(RHS, MRI);
1567   MachineInstr *AndInst = getOpcodeDef(TargetOpcode::G_AND, LHS, MRI);
1568 
1569   // When we can emit a TB(N)Z, prefer that.
1570   //
1571   // Handle non-commutative condition codes first.
1572   // Note that we don't want to do this when we have a G_AND because it can
1573   // become a tst. The tst will make the test bit in the TB(N)Z redundant.
1574   if (VRegAndVal && !AndInst) {
1575     int64_t C = VRegAndVal->Value.getSExtValue();
1576 
1577     // When we have a greater-than comparison, we can just test if the msb is
1578     // zero.
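    // ((x s> -1) holds exactly when (x s>= 0), i.e. when the sign bit of x is
    // clear, so a TBZ on the msb suffices.)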
1579     if (C == -1 && Pred == CmpInst::ICMP_SGT) {
1580       uint64_t Bit = MRI.getType(LHS).getSizeInBits() - 1;
1581       emitTestBit(LHS, Bit, /*IsNegative = */ false, DestMBB, MIB);
1582       I.eraseFromParent();
1583       return true;
1584     }
1585 
1586     // When we have a less than comparison, we can just test if the msb is not
1587     // zero.
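    // ((x s< 0) holds exactly when the sign bit of x is set, so a TBNZ on the
    // msb suffices.)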
1588     if (C == 0 && Pred == CmpInst::ICMP_SLT) {
1589       uint64_t Bit = MRI.getType(LHS).getSizeInBits() - 1;
1590       emitTestBit(LHS, Bit, /*IsNegative = */ true, DestMBB, MIB);
1591       I.eraseFromParent();
1592       return true;
1593     }
1594   }
1595 
1596   // Attempt to handle commutative condition codes. Right now, that's only
1597   // eq/ne.
1598   if (ICmpInst::isEquality(Pred)) {
1599     if (!VRegAndVal) {
1600       std::swap(RHS, LHS);
1601       VRegAndVal = getConstantVRegValWithLookThrough(RHS, MRI);
1602       AndInst = getOpcodeDef(TargetOpcode::G_AND, LHS, MRI);
1603     }
1604 
1605     if (VRegAndVal && VRegAndVal->Value == 0) {
1606       // If there's a G_AND feeding into this branch, try to fold it away by
1607       // emitting a TB(N)Z instead.
1608       //
1609       // Note: If we have LT, then it *is* possible to fold, but it wouldn't be
1610       // beneficial. When we have an AND and LT, we need a TST/ANDS, so folding
1611       // would be redundant.
1612       if (AndInst &&
1613           tryOptAndIntoCompareBranch(
1614               *AndInst, /*Invert = */ Pred == CmpInst::ICMP_NE, DestMBB, MIB)) {
1615         I.eraseFromParent();
1616         return true;
1617       }
1618 
1619       // Otherwise, try to emit a CB(N)Z instead.
1620       auto LHSTy = MRI.getType(LHS);
1621       if (!LHSTy.isVector() && LHSTy.getSizeInBits() <= 64) {
1622         emitCBZ(LHS, /*IsNegative = */ Pred == CmpInst::ICMP_NE, DestMBB, MIB);
1623         I.eraseFromParent();
1624         return true;
1625       }
1626     }
1627   }
1628 
1629   return false;
1630 }
1631 
1632 bool AArch64InstructionSelector::selectCompareBranchFedByICmp(
1633     MachineInstr &I, MachineInstr &ICmp, MachineIRBuilder &MIB) const {
1634   assert(ICmp.getOpcode() == TargetOpcode::G_ICMP);
1635   assert(I.getOpcode() == TargetOpcode::G_BRCOND);
1636   if (tryOptCompareBranchFedByICmp(I, ICmp, MIB))
1637     return true;
1638 
1639   // Couldn't optimize. Emit a compare + a Bcc.
1640   MachineBasicBlock *DestMBB = I.getOperand(1).getMBB();
1641   auto PredOp = ICmp.getOperand(1);
1642   emitIntegerCompare(ICmp.getOperand(2), ICmp.getOperand(3), PredOp, MIB);
1643   const AArch64CC::CondCode CC = changeICMPPredToAArch64CC(
1644       static_cast<CmpInst::Predicate>(PredOp.getPredicate()));
1645   MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC).addMBB(DestMBB);
1646   I.eraseFromParent();
1647   return true;
1648 }
1649 
1650 bool AArch64InstructionSelector::selectCompareBranch(
1651     MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) {
1652   Register CondReg = I.getOperand(0).getReg();
1653   MachineInstr *CCMI = MRI.getVRegDef(CondReg);
1654   if (CCMI->getOpcode() == TargetOpcode::G_TRUNC) {
1655     CondReg = CCMI->getOperand(1).getReg();
1656     CCMI = MRI.getVRegDef(CondReg);
1657   }
1658 
1659   // Try to select the G_BRCOND using whatever is feeding the condition if
1660   // possible.
1661   unsigned CCMIOpc = CCMI->getOpcode();
1662   if (CCMIOpc == TargetOpcode::G_FCMP)
1663     return selectCompareBranchFedByFCmp(I, *CCMI, MIB);
1664   if (CCMIOpc == TargetOpcode::G_ICMP)
1665     return selectCompareBranchFedByICmp(I, *CCMI, MIB);
1666 
1667   // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z
1668   // instructions will not be produced, as they are conditional branch
1669   // instructions that do not set flags.
1670   if (ProduceNonFlagSettingCondBr) {
1671     emitTestBit(CondReg, /*Bit = */ 0, /*IsNegative = */ true,
1672                 I.getOperand(1).getMBB(), MIB);
1673     I.eraseFromParent();
1674     return true;
1675   }
1676 
1677   // Can't emit TB(N)Z/CB(N)Z. Emit a tst + bcc instead.
1678   auto TstMI =
1679       MIB.buildInstr(AArch64::ANDSWri, {LLT::scalar(32)}, {CondReg}).addImm(1);
1680   constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI);
1681   auto Bcc = MIB.buildInstr(AArch64::Bcc)
1682                  .addImm(AArch64CC::EQ)
1683                  .addMBB(I.getOperand(1).getMBB());
1684   I.eraseFromParent();
1685   return constrainSelectedInstRegOperands(*Bcc, TII, TRI, RBI);
1686 }
1687 
1688 /// Returns the element immediate value of a vector shift operand if found.
1689 /// This needs to detect a splat-like operation, e.g. a G_BUILD_VECTOR.
1690 static Optional<int64_t> getVectorShiftImm(Register Reg,
1691                                            MachineRegisterInfo &MRI) {
1692   assert(MRI.getType(Reg).isVector() && "Expected a *vector* shift operand");
1693   MachineInstr *OpMI = MRI.getVRegDef(Reg);
1694   assert(OpMI && "Expected to find a vreg def for vector shift operand");
1695   return getAArch64VectorSplatScalar(*OpMI, MRI);
1696 }
1697 
1698 /// Matches and returns the shift immediate value for a SHL instruction given
1699 /// a shift operand.
1700 static Optional<int64_t> getVectorSHLImm(LLT SrcTy, Register Reg, MachineRegisterInfo &MRI) {
1701   Optional<int64_t> ShiftImm = getVectorShiftImm(Reg, MRI);
1702   if (!ShiftImm)
1703     return None;
1704   // Check the immediate is in range for a SHL.
1705   int64_t Imm = *ShiftImm;
1706   if (Imm < 0)
1707     return None;
1708   switch (SrcTy.getElementType().getSizeInBits()) {
1709   default:
1710     LLVM_DEBUG(dbgs() << "Unhandled element type for vector shift");
1711     return None;
1712   case 8:
1713     if (Imm > 7)
1714       return None;
1715     break;
1716   case 16:
1717     if (Imm > 15)
1718       return None;
1719     break;
1720   case 32:
1721     if (Imm > 31)
1722       return None;
1723     break;
1724   case 64:
1725     if (Imm > 63)
1726       return None;
1727     break;
1728   }
1729   return Imm;
1730 }
1731 
1732 bool AArch64InstructionSelector::selectVectorSHL(MachineInstr &I,
1733                                                  MachineRegisterInfo &MRI) {
1734   assert(I.getOpcode() == TargetOpcode::G_SHL);
1735   Register DstReg = I.getOperand(0).getReg();
1736   const LLT Ty = MRI.getType(DstReg);
1737   Register Src1Reg = I.getOperand(1).getReg();
1738   Register Src2Reg = I.getOperand(2).getReg();
1739 
1740   if (!Ty.isVector())
1741     return false;
1742 
1743   // Check if we have a vector of constants on RHS that we can select as the
1744   // immediate form.
1745   Optional<int64_t> ImmVal = getVectorSHLImm(Ty, Src2Reg, MRI);
1746 
1747   unsigned Opc = 0;
1748   if (Ty == LLT::fixed_vector(2, 64)) {
1749     Opc = ImmVal ? AArch64::SHLv2i64_shift : AArch64::USHLv2i64;
1750   } else if (Ty == LLT::fixed_vector(4, 32)) {
1751     Opc = ImmVal ? AArch64::SHLv4i32_shift : AArch64::USHLv4i32;
1752   } else if (Ty == LLT::fixed_vector(2, 32)) {
1753     Opc = ImmVal ? AArch64::SHLv2i32_shift : AArch64::USHLv2i32;
1754   } else if (Ty == LLT::fixed_vector(4, 16)) {
1755     Opc = ImmVal ? AArch64::SHLv4i16_shift : AArch64::USHLv4i16;
1756   } else if (Ty == LLT::fixed_vector(8, 16)) {
1757     Opc = ImmVal ? AArch64::SHLv8i16_shift : AArch64::USHLv8i16;
1758   } else if (Ty == LLT::fixed_vector(16, 8)) {
1759     Opc = ImmVal ? AArch64::SHLv16i8_shift : AArch64::USHLv16i8;
1760   } else if (Ty == LLT::fixed_vector(8, 8)) {
1761     Opc = ImmVal ? AArch64::SHLv8i8_shift : AArch64::USHLv8i8;
1762   } else {
1763     LLVM_DEBUG(dbgs() << "Unhandled G_SHL type");
1764     return false;
1765   }
1766 
1767   auto Shl = MIB.buildInstr(Opc, {DstReg}, {Src1Reg});
1768   if (ImmVal)
1769     Shl.addImm(*ImmVal);
1770   else
1771     Shl.addUse(Src2Reg);
1772   constrainSelectedInstRegOperands(*Shl, TII, TRI, RBI);
1773   I.eraseFromParent();
1774   return true;
1775 }
1776 
1777 bool AArch64InstructionSelector::selectVectorAshrLshr(
1778     MachineInstr &I, MachineRegisterInfo &MRI) {
1779   assert(I.getOpcode() == TargetOpcode::G_ASHR ||
1780          I.getOpcode() == TargetOpcode::G_LSHR);
1781   Register DstReg = I.getOperand(0).getReg();
1782   const LLT Ty = MRI.getType(DstReg);
1783   Register Src1Reg = I.getOperand(1).getReg();
1784   Register Src2Reg = I.getOperand(2).getReg();
1785 
1786   if (!Ty.isVector())
1787     return false;
1788 
1789   bool IsASHR = I.getOpcode() == TargetOpcode::G_ASHR;
1790 
1791   // We expect the immediate case to be lowered in the PostLegalCombiner to
1792   // AArch64ISD::VASHR or AArch64ISD::VLSHR equivalents.
1793 
1794   // There is no vector shift-right-by-register instruction; instead, the
1795   // shift-left-by-register instructions (SSHL/USHL) take a signed shift
1796   // amount, where a negative amount specifies a right shift.
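  //
  // So a vector right shift is emitted roughly as:
  //   %negated_amount = NEG %shift_amount
  //   %dst = SSHL/USHL %src, %negated_amount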
1797 
1798   unsigned Opc = 0;
1799   unsigned NegOpc = 0;
1800   const TargetRegisterClass *RC =
1801       getRegClassForTypeOnBank(Ty, RBI.getRegBank(AArch64::FPRRegBankID), RBI);
1802   if (Ty == LLT::fixed_vector(2, 64)) {
1803     Opc = IsASHR ? AArch64::SSHLv2i64 : AArch64::USHLv2i64;
1804     NegOpc = AArch64::NEGv2i64;
1805   } else if (Ty == LLT::fixed_vector(4, 32)) {
1806     Opc = IsASHR ? AArch64::SSHLv4i32 : AArch64::USHLv4i32;
1807     NegOpc = AArch64::NEGv4i32;
1808   } else if (Ty == LLT::fixed_vector(2, 32)) {
1809     Opc = IsASHR ? AArch64::SSHLv2i32 : AArch64::USHLv2i32;
1810     NegOpc = AArch64::NEGv2i32;
1811   } else if (Ty == LLT::fixed_vector(4, 16)) {
1812     Opc = IsASHR ? AArch64::SSHLv4i16 : AArch64::USHLv4i16;
1813     NegOpc = AArch64::NEGv4i16;
1814   } else if (Ty == LLT::fixed_vector(8, 16)) {
1815     Opc = IsASHR ? AArch64::SSHLv8i16 : AArch64::USHLv8i16;
1816     NegOpc = AArch64::NEGv8i16;
1817   } else if (Ty == LLT::fixed_vector(16, 8)) {
1818     Opc = IsASHR ? AArch64::SSHLv16i8 : AArch64::USHLv16i8;
1819     NegOpc = AArch64::NEGv16i8;
1820   } else if (Ty == LLT::fixed_vector(8, 8)) {
1821     Opc = IsASHR ? AArch64::SSHLv8i8 : AArch64::USHLv8i8;
1822     NegOpc = AArch64::NEGv8i8;
1823   } else {
1824     LLVM_DEBUG(dbgs() << "Unhandled G_ASHR type");
1825     return false;
1826   }
1827 
1828   auto Neg = MIB.buildInstr(NegOpc, {RC}, {Src2Reg});
1829   constrainSelectedInstRegOperands(*Neg, TII, TRI, RBI);
1830   auto SShl = MIB.buildInstr(Opc, {DstReg}, {Src1Reg, Neg});
1831   constrainSelectedInstRegOperands(*SShl, TII, TRI, RBI);
1832   I.eraseFromParent();
1833   return true;
1834 }
1835 
1836 bool AArch64InstructionSelector::selectVaStartAAPCS(
1837     MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const {
1838   return false;
1839 }
1840 
1841 bool AArch64InstructionSelector::selectVaStartDarwin(
1842     MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const {
1843   AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
1844   Register ListReg = I.getOperand(0).getReg();
1845 
1846   Register ArgsAddrReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
1847 
1848   auto MIB =
1849       BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::ADDXri))
1850           .addDef(ArgsAddrReg)
1851           .addFrameIndex(FuncInfo->getVarArgsStackIndex())
1852           .addImm(0)
1853           .addImm(0);
1854 
1855   constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1856 
1857   MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::STRXui))
1858             .addUse(ArgsAddrReg)
1859             .addUse(ListReg)
1860             .addImm(0)
1861             .addMemOperand(*I.memoperands_begin());
1862 
1863   constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1864   I.eraseFromParent();
1865   return true;
1866 }
1867 
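/// Materialize a 64-bit symbolic address with a MOVZ plus three MOVKs, 16 bits
/// at a time. A sketch of the emitted instructions (target flags abbreviated):
///   MOVZXi tmp0, sym(g0, nc), 0
///   MOVKXi tmp1, tmp0, sym(g1, nc), 16
///   MOVKXi tmp2, tmp1, sym(g2, nc), 32
///   MOVKXi dst,  tmp2, sym(g3), 48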
1868 void AArch64InstructionSelector::materializeLargeCMVal(
1869     MachineInstr &I, const Value *V, unsigned OpFlags) {
1870   MachineBasicBlock &MBB = *I.getParent();
1871   MachineFunction &MF = *MBB.getParent();
1872   MachineRegisterInfo &MRI = MF.getRegInfo();
1873 
1874   auto MovZ = MIB.buildInstr(AArch64::MOVZXi, {&AArch64::GPR64RegClass}, {});
1875   MovZ->addOperand(MF, I.getOperand(1));
1876   MovZ->getOperand(1).setTargetFlags(OpFlags | AArch64II::MO_G0 |
1877                                      AArch64II::MO_NC);
1878   MovZ->addOperand(MF, MachineOperand::CreateImm(0));
1879   constrainSelectedInstRegOperands(*MovZ, TII, TRI, RBI);
1880 
1881   auto BuildMovK = [&](Register SrcReg, unsigned char Flags, unsigned Offset,
1882                        Register ForceDstReg) {
1883     Register DstReg = ForceDstReg
1884                           ? ForceDstReg
1885                           : MRI.createVirtualRegister(&AArch64::GPR64RegClass);
1886     auto MovI = MIB.buildInstr(AArch64::MOVKXi).addDef(DstReg).addUse(SrcReg);
1887     if (auto *GV = dyn_cast<GlobalValue>(V)) {
1888       MovI->addOperand(MF, MachineOperand::CreateGA(
1889                                GV, MovZ->getOperand(1).getOffset(), Flags));
1890     } else {
1891       MovI->addOperand(
1892           MF, MachineOperand::CreateBA(cast<BlockAddress>(V),
1893                                        MovZ->getOperand(1).getOffset(), Flags));
1894     }
1895     MovI->addOperand(MF, MachineOperand::CreateImm(Offset));
1896     constrainSelectedInstRegOperands(*MovI, TII, TRI, RBI);
1897     return DstReg;
1898   };
1899   Register DstReg = BuildMovK(MovZ.getReg(0),
1900                               AArch64II::MO_G1 | AArch64II::MO_NC, 16, 0);
1901   DstReg = BuildMovK(DstReg, AArch64II::MO_G2 | AArch64II::MO_NC, 32, 0);
1902   BuildMovK(DstReg, AArch64II::MO_G3, 48, I.getOperand(0).getReg());
1903 }
1904 
1905 bool AArch64InstructionSelector::preISelLower(MachineInstr &I) {
1906   MachineBasicBlock &MBB = *I.getParent();
1907   MachineFunction &MF = *MBB.getParent();
1908   MachineRegisterInfo &MRI = MF.getRegInfo();
1909 
1910   switch (I.getOpcode()) {
1911   case TargetOpcode::G_SHL:
1912   case TargetOpcode::G_ASHR:
1913   case TargetOpcode::G_LSHR: {
1914     // These shifts are legalized to have 64 bit shift amounts because we want
1915     // to take advantage of the existing imported selection patterns that assume
1916     // the immediates are s64s. However, if the shifted type is 32 bits and for
1917     // some reason we receive input GMIR that has an s64 shift amount that's not
1918     // a G_CONSTANT, insert a truncate so that we can still select the s32
1919     // register-register variant.
1920     Register SrcReg = I.getOperand(1).getReg();
1921     Register ShiftReg = I.getOperand(2).getReg();
1922     const LLT ShiftTy = MRI.getType(ShiftReg);
1923     const LLT SrcTy = MRI.getType(SrcReg);
1924     if (SrcTy.isVector())
1925       return false;
1926     assert(!ShiftTy.isVector() && "unexpected vector shift ty");
1927     if (SrcTy.getSizeInBits() != 32 || ShiftTy.getSizeInBits() != 64)
1928       return false;
1929     auto *AmtMI = MRI.getVRegDef(ShiftReg);
1930     assert(AmtMI && "could not find a vreg definition for shift amount");
1931     if (AmtMI->getOpcode() != TargetOpcode::G_CONSTANT) {
1932       // Insert a subregister copy to implement a 64->32 trunc
1933       auto Trunc = MIB.buildInstr(TargetOpcode::COPY, {SrcTy}, {})
1934                        .addReg(ShiftReg, 0, AArch64::sub_32);
1935       MRI.setRegBank(Trunc.getReg(0), RBI.getRegBank(AArch64::GPRRegBankID));
1936       I.getOperand(2).setReg(Trunc.getReg(0));
1937     }
1938     return true;
1939   }
1940   case TargetOpcode::G_STORE: {
1941     bool Changed = contractCrossBankCopyIntoStore(I, MRI);
1942     MachineOperand &SrcOp = I.getOperand(0);
1943     if (MRI.getType(SrcOp.getReg()).isPointer()) {
1944       // Allow matching with imported patterns for stores of pointers. Unlike
1945       // G_LOAD/G_PTR_ADD, we may not have selected all users. So, emit a copy
1946       // and constrain.
1947       auto Copy = MIB.buildCopy(LLT::scalar(64), SrcOp);
1948       Register NewSrc = Copy.getReg(0);
1949       SrcOp.setReg(NewSrc);
1950       RBI.constrainGenericRegister(NewSrc, AArch64::GPR64RegClass, MRI);
1951       Changed = true;
1952     }
1953     return Changed;
1954   }
1955   case TargetOpcode::G_PTR_ADD:
1956     return convertPtrAddToAdd(I, MRI);
1957   case TargetOpcode::G_LOAD: {
1958     // For scalar loads of pointers, we try to convert the dest type from p0
1959     // to s64 so that our imported patterns can match. Like with the G_PTR_ADD
1960     // conversion, this should be ok because all users should have been
1961     // selected already, so the type doesn't matter for them.
1962     Register DstReg = I.getOperand(0).getReg();
1963     const LLT DstTy = MRI.getType(DstReg);
1964     if (!DstTy.isPointer())
1965       return false;
1966     MRI.setType(DstReg, LLT::scalar(64));
1967     return true;
1968   }
1969   case AArch64::G_DUP: {
1970     // Convert the type from p0 to s64 to help selection.
1971     LLT DstTy = MRI.getType(I.getOperand(0).getReg());
1972     if (!DstTy.getElementType().isPointer())
1973       return false;
1974     auto NewSrc = MIB.buildCopy(LLT::scalar(64), I.getOperand(1).getReg());
1975     MRI.setType(I.getOperand(0).getReg(),
1976                 DstTy.changeElementType(LLT::scalar(64)));
1977     MRI.setRegClass(NewSrc.getReg(0), &AArch64::GPR64RegClass);
1978     I.getOperand(1).setReg(NewSrc.getReg(0));
1979     return true;
1980   }
1981   case TargetOpcode::G_UITOFP:
1982   case TargetOpcode::G_SITOFP: {
1983     // If both source and destination regbanks are FPR, then convert the opcode
1984     // to G_SITOF so that the importer can select it to an fpr variant.
1985     // Otherwise, it ends up matching an fpr/gpr variant and adding a cross-bank
1986     // copy.
1987     Register SrcReg = I.getOperand(1).getReg();
1988     LLT SrcTy = MRI.getType(SrcReg);
1989     LLT DstTy = MRI.getType(I.getOperand(0).getReg());
1990     if (SrcTy.isVector() || SrcTy.getSizeInBits() != DstTy.getSizeInBits())
1991       return false;
1992 
1993     if (RBI.getRegBank(SrcReg, MRI, TRI)->getID() == AArch64::FPRRegBankID) {
1994       if (I.getOpcode() == TargetOpcode::G_SITOFP)
1995         I.setDesc(TII.get(AArch64::G_SITOF));
1996       else
1997         I.setDesc(TII.get(AArch64::G_UITOF));
1998       return true;
1999     }
2000     return false;
2001   }
2002   default:
2003     return false;
2004   }
2005 }
2006 
2007 /// This lowering tries to look for G_PTR_ADD instructions and then converts
2008 /// them to a standard G_ADD with a COPY on the source.
2009 ///
2010 /// The motivation behind this is to expose the add semantics to the imported
2011 /// tablegen patterns. We shouldn't need to check for uses being loads/stores,
2012 /// because the selector works bottom up, uses before defs. By the time we
2013 /// end up trying to select a G_PTR_ADD, we should have already attempted to
2014 /// fold this into addressing modes and were therefore unsuccessful.
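///
/// For illustration (generic MIR sketch):
///   %dst:_(p0) = G_PTR_ADD %base(p0), %off(s64)
/// becomes
///   %intbase:_(s64) = G_PTRTOINT %base(p0)
///   %dst:_(s64) = G_ADD %intbase, %off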
2015 bool AArch64InstructionSelector::convertPtrAddToAdd(
2016     MachineInstr &I, MachineRegisterInfo &MRI) {
2017   assert(I.getOpcode() == TargetOpcode::G_PTR_ADD && "Expected G_PTR_ADD");
2018   Register DstReg = I.getOperand(0).getReg();
2019   Register AddOp1Reg = I.getOperand(1).getReg();
2020   const LLT PtrTy = MRI.getType(DstReg);
2021   if (PtrTy.getAddressSpace() != 0)
2022     return false;
2023 
2024   const LLT CastPtrTy =
2025       PtrTy.isVector() ? LLT::fixed_vector(2, 64) : LLT::scalar(64);
2026   auto PtrToInt = MIB.buildPtrToInt(CastPtrTy, AddOp1Reg);
2027   // Set regbanks on the registers.
2028   if (PtrTy.isVector())
2029     MRI.setRegBank(PtrToInt.getReg(0), RBI.getRegBank(AArch64::FPRRegBankID));
2030   else
2031     MRI.setRegBank(PtrToInt.getReg(0), RBI.getRegBank(AArch64::GPRRegBankID));
2032 
2033   // Now turn the %dst(p0) = G_PTR_ADD %base, off into:
2034   // %dst(intty) = G_ADD %intbase, off
2035   I.setDesc(TII.get(TargetOpcode::G_ADD));
2036   MRI.setType(DstReg, CastPtrTy);
2037   I.getOperand(1).setReg(PtrToInt.getReg(0));
2038   if (!select(*PtrToInt)) {
2039     LLVM_DEBUG(dbgs() << "Failed to select G_PTRTOINT in convertPtrAddToAdd");
2040     return false;
2041   }
2042 
2043   // Also take the opportunity here to try to do some optimization.
2044   // Try to convert this into a G_SUB if the offset is a 0-x negate idiom.
2045   Register NegatedReg;
2046   if (!mi_match(I.getOperand(2).getReg(), MRI, m_Neg(m_Reg(NegatedReg))))
2047     return true;
2048   I.getOperand(2).setReg(NegatedReg);
2049   I.setDesc(TII.get(TargetOpcode::G_SUB));
2050   return true;
2051 }
2052 
2053 bool AArch64InstructionSelector::earlySelectSHL(MachineInstr &I,
2054                                                 MachineRegisterInfo &MRI) {
2055   // We try to match the immediate variant of LSL, which is actually an alias
2056   // for a special case of UBFM. Otherwise, we fall back to the imported
2057   // selector which will match the register variant.
2058   assert(I.getOpcode() == TargetOpcode::G_SHL && "unexpected op");
2059   const auto &MO = I.getOperand(2);
2060   auto VRegAndVal = getConstantVRegVal(MO.getReg(), MRI);
2061   if (!VRegAndVal)
2062     return false;
2063 
2064   const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
2065   if (DstTy.isVector())
2066     return false;
2067   bool Is64Bit = DstTy.getSizeInBits() == 64;
2068   auto Imm1Fn = Is64Bit ? selectShiftA_64(MO) : selectShiftA_32(MO);
2069   auto Imm2Fn = Is64Bit ? selectShiftB_64(MO) : selectShiftB_32(MO);
2070 
2071   if (!Imm1Fn || !Imm2Fn)
2072     return false;
2073 
2074   auto NewI =
2075       MIB.buildInstr(Is64Bit ? AArch64::UBFMXri : AArch64::UBFMWri,
2076                      {I.getOperand(0).getReg()}, {I.getOperand(1).getReg()});
2077 
2078   for (auto &RenderFn : *Imm1Fn)
2079     RenderFn(NewI);
2080   for (auto &RenderFn : *Imm2Fn)
2081     RenderFn(NewI);
2082 
2083   I.eraseFromParent();
2084   return constrainSelectedInstRegOperands(*NewI, TII, TRI, RBI);
2085 }
2086 
2087 bool AArch64InstructionSelector::contractCrossBankCopyIntoStore(
2088     MachineInstr &I, MachineRegisterInfo &MRI) {
2089   assert(I.getOpcode() == TargetOpcode::G_STORE && "Expected G_STORE");
2090   // If we're storing a scalar, it doesn't matter what register bank that
2091   // scalar is on. All that matters is the size.
2092   //
2093   // So, if we see something like this (with a 32-bit scalar as an example):
2094   //
2095   // %x:gpr(s32) = ... something ...
2096   // %y:fpr(s32) = COPY %x:gpr(s32)
2097   // G_STORE %y:fpr(s32)
2098   //
2099   // We can fix this up into something like this:
2100   //
2101   // G_STORE %x:gpr(s32)
2102   //
2103   // And then continue the selection process normally.
2104   Register DefDstReg = getSrcRegIgnoringCopies(I.getOperand(0).getReg(), MRI);
2105   if (!DefDstReg.isValid())
2106     return false;
2107   LLT DefDstTy = MRI.getType(DefDstReg);
2108   Register StoreSrcReg = I.getOperand(0).getReg();
2109   LLT StoreSrcTy = MRI.getType(StoreSrcReg);
2110 
2111   // If we get something strange like a physical register, then we shouldn't
2112   // go any further.
2113   if (!DefDstTy.isValid())
2114     return false;
2115 
2116   // Are the source and dst types the same size?
2117   if (DefDstTy.getSizeInBits() != StoreSrcTy.getSizeInBits())
2118     return false;
2119 
2120   if (RBI.getRegBank(StoreSrcReg, MRI, TRI) ==
2121       RBI.getRegBank(DefDstReg, MRI, TRI))
2122     return false;
2123 
2124   // We have a cross-bank copy, which is entering a store. Let's fold it.
2125   I.getOperand(0).setReg(DefDstReg);
2126   return true;
2127 }
2128 
2129 bool AArch64InstructionSelector::earlySelect(MachineInstr &I) {
2130   assert(I.getParent() && "Instruction should be in a basic block!");
2131   assert(I.getParent()->getParent() && "Instruction should be in a function!");
2132 
2133   MachineBasicBlock &MBB = *I.getParent();
2134   MachineFunction &MF = *MBB.getParent();
2135   MachineRegisterInfo &MRI = MF.getRegInfo();
2136 
2137   switch (I.getOpcode()) {
2138   case AArch64::G_DUP: {
2139     // Before selecting a DUP instruction, check if it is better selected as a
2140     // MOV or load from a constant pool.
2141     Register Src = I.getOperand(1).getReg();
2142     auto ValAndVReg = getConstantVRegValWithLookThrough(Src, MRI);
2143     if (!ValAndVReg)
2144       return false;
2145     LLVMContext &Ctx = MF.getFunction().getContext();
2146     Register Dst = I.getOperand(0).getReg();
2147     auto *CV = ConstantDataVector::getSplat(
2148         MRI.getType(Dst).getNumElements(),
2149         ConstantInt::get(Type::getIntNTy(Ctx, MRI.getType(Src).getSizeInBits()),
2150                          ValAndVReg->Value));
2151     if (!emitConstantVector(Dst, CV, MIB, MRI))
2152       return false;
2153     I.eraseFromParent();
2154     return true;
2155   }
2156   case TargetOpcode::G_BR: {
2157     // If the branch jumps to the fallthrough block, don't bother emitting it.
2158     // Only do this for -O0 for a good code size improvement, because when
2159     // optimizations are enabled we want to leave this choice to
2160     // MachineBlockPlacement.
2161     bool EnableOpt = MF.getTarget().getOptLevel() != CodeGenOpt::None;
2162     if (EnableOpt || !MBB.isLayoutSuccessor(I.getOperand(0).getMBB()))
2163       return false;
2164     I.eraseFromParent();
2165     return true;
2166   }
2167   case TargetOpcode::G_SHL:
2168     return earlySelectSHL(I, MRI);
2169   case TargetOpcode::G_CONSTANT: {
2170     bool IsZero = false;
2171     if (I.getOperand(1).isCImm())
2172       IsZero = I.getOperand(1).getCImm()->getZExtValue() == 0;
2173     else if (I.getOperand(1).isImm())
2174       IsZero = I.getOperand(1).getImm() == 0;
2175 
2176     if (!IsZero)
2177       return false;
2178 
2179     Register DefReg = I.getOperand(0).getReg();
2180     LLT Ty = MRI.getType(DefReg);
2181     if (Ty.getSizeInBits() == 64) {
2182       I.getOperand(1).ChangeToRegister(AArch64::XZR, false);
2183       RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass, MRI);
2184     } else if (Ty.getSizeInBits() == 32) {
2185       I.getOperand(1).ChangeToRegister(AArch64::WZR, false);
2186       RBI.constrainGenericRegister(DefReg, AArch64::GPR32RegClass, MRI);
2187     } else
2188       return false;
2189 
2190     I.setDesc(TII.get(TargetOpcode::COPY));
2191     return true;
2192   }
2193 
2194   case TargetOpcode::G_ADD: {
2195     // Check if this is being fed by a G_ICMP on either side.
2196     //
2197     // (cmp pred, x, y) + z
2198     //
2199     // In the above case, when the cmp is true, we increment z by 1. So, we can
2200     // fold the add into the cset for the cmp by using cinc.
2201     //
2202     // FIXME: This would probably be a lot nicer in PostLegalizerLowering.
2203     Register X = I.getOperand(1).getReg();
2204 
2205     // Only handle scalars. Scalar G_ICMP is only legal for s32, so bail out
2206     // early if we see it.
2207     LLT Ty = MRI.getType(X);
2208     if (Ty.isVector() || Ty.getSizeInBits() != 32)
2209       return false;
2210 
2211     Register CmpReg = I.getOperand(2).getReg();
2212     MachineInstr *Cmp = getOpcodeDef(TargetOpcode::G_ICMP, CmpReg, MRI);
2213     if (!Cmp) {
2214       std::swap(X, CmpReg);
2215       Cmp = getOpcodeDef(TargetOpcode::G_ICMP, CmpReg, MRI);
2216       if (!Cmp)
2217         return false;
2218     }
2219     auto Pred =
2220         static_cast<CmpInst::Predicate>(Cmp->getOperand(1).getPredicate());
2221     emitIntegerCompare(Cmp->getOperand(2), Cmp->getOperand(3),
2222                        Cmp->getOperand(1), MIB);
2223     emitCSetForICMP(I.getOperand(0).getReg(), Pred, MIB, X);
2224     I.eraseFromParent();
2225     return true;
2226   }
2227   case TargetOpcode::G_OR: {
2228     // Look for operations that take the lower `Width=Size-ShiftImm` bits of
2229     // `ShiftSrc` and insert them into the upper `Width` bits of `MaskSrc` via
2230     // shifting and masking that we can replace with a BFI (encoded as a BFM).
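    //
    // For illustration, a 32-bit sketch with ShiftImm = 16:
    //   %dst = G_OR (G_SHL %shiftsrc, 16), (G_AND %masksrc, 0xffff)
    // becomes
    //   %dst = BFMWri %masksrc, %shiftsrc, 16, 15   (i.e. BFI at bit 16, width 16)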
2231     Register Dst = I.getOperand(0).getReg();
2232     LLT Ty = MRI.getType(Dst);
2233 
2234     if (!Ty.isScalar())
2235       return false;
2236 
2237     unsigned Size = Ty.getSizeInBits();
2238     if (Size != 32 && Size != 64)
2239       return false;
2240 
2241     Register ShiftSrc;
2242     int64_t ShiftImm;
2243     Register MaskSrc;
2244     int64_t MaskImm;
2245     if (!mi_match(
2246             Dst, MRI,
2247             m_GOr(m_OneNonDBGUse(m_GShl(m_Reg(ShiftSrc), m_ICst(ShiftImm))),
2248                   m_OneNonDBGUse(m_GAnd(m_Reg(MaskSrc), m_ICst(MaskImm))))))
2249       return false;
2250 
2251     if (ShiftImm > Size || ((1ULL << ShiftImm) - 1ULL) != uint64_t(MaskImm))
2252       return false;
2253 
2254     int64_t Immr = Size - ShiftImm;
2255     int64_t Imms = Size - ShiftImm - 1;
2256     unsigned Opc = Size == 32 ? AArch64::BFMWri : AArch64::BFMXri;
2257     emitInstr(Opc, {Dst}, {MaskSrc, ShiftSrc, Immr, Imms}, MIB);
2258     I.eraseFromParent();
2259     return true;
2260   }
2261   default:
2262     return false;
2263   }
2264 }
2265 
2266 bool AArch64InstructionSelector::select(MachineInstr &I) {
2267   assert(I.getParent() && "Instruction should be in a basic block!");
2268   assert(I.getParent()->getParent() && "Instruction should be in a function!");
2269 
2270   MachineBasicBlock &MBB = *I.getParent();
2271   MachineFunction &MF = *MBB.getParent();
2272   MachineRegisterInfo &MRI = MF.getRegInfo();
2273 
2274   const AArch64Subtarget *Subtarget =
2275       &static_cast<const AArch64Subtarget &>(MF.getSubtarget());
2276   if (Subtarget->requiresStrictAlign()) {
2277     // We don't support this feature yet.
2278     LLVM_DEBUG(dbgs() << "AArch64 GISel does not support strict-align yet\n");
2279     return false;
2280   }
2281 
2282   MIB.setInstrAndDebugLoc(I);
2283 
2284   unsigned Opcode = I.getOpcode();
2285   // G_PHI requires same handling as PHI
2286   if (!I.isPreISelOpcode() || Opcode == TargetOpcode::G_PHI) {
2287     // Certain non-generic instructions also need some special handling.
2288 
2289     if (Opcode ==  TargetOpcode::LOAD_STACK_GUARD)
2290       return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2291 
2292     if (Opcode == TargetOpcode::PHI || Opcode == TargetOpcode::G_PHI) {
2293       const Register DefReg = I.getOperand(0).getReg();
2294       const LLT DefTy = MRI.getType(DefReg);
2295 
2296       const RegClassOrRegBank &RegClassOrBank =
2297         MRI.getRegClassOrRegBank(DefReg);
2298 
2299       const TargetRegisterClass *DefRC
2300         = RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
2301       if (!DefRC) {
2302         if (!DefTy.isValid()) {
2303           LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
2304           return false;
2305         }
2306         const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
2307         DefRC = getRegClassForTypeOnBank(DefTy, RB, RBI);
2308         if (!DefRC) {
2309           LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
2310           return false;
2311         }
2312       }
2313 
2314       I.setDesc(TII.get(TargetOpcode::PHI));
2315 
2316       return RBI.constrainGenericRegister(DefReg, *DefRC, MRI);
2317     }
2318 
2319     if (I.isCopy())
2320       return selectCopy(I, TII, MRI, TRI, RBI);
2321 
2322     return true;
2323   }
2324 
2325 
2326   if (I.getNumOperands() != I.getNumExplicitOperands()) {
2327     LLVM_DEBUG(
2328         dbgs() << "Generic instruction has unexpected implicit operands\n");
2329     return false;
2330   }
2331 
2332   // Try to do some lowering before we start instruction selecting. These
2333   // lowerings are purely transformations on the input G_MIR and so selection
2334   // must continue after any modification of the instruction.
2335   if (preISelLower(I)) {
2336     Opcode = I.getOpcode(); // The opcode may have been modified, refresh it.
2337   }
2338 
2339   // There may be patterns where the importer can't deal with them optimally,
2340   // but does select it to a suboptimal sequence so our custom C++ selection
2341   // code later never has a chance to work on it. Therefore, we have an early
2342   // selection attempt here to give priority to certain selection routines
2343   // over the imported ones.
2344   if (earlySelect(I))
2345     return true;
2346 
2347   if (selectImpl(I, *CoverageInfo))
2348     return true;
2349 
2350   LLT Ty =
2351       I.getOperand(0).isReg() ? MRI.getType(I.getOperand(0).getReg()) : LLT{};
2352 
2353   switch (Opcode) {
2354   case TargetOpcode::G_SBFX:
2355   case TargetOpcode::G_UBFX: {
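    // G_SBFX/G_UBFX dst, src, lsb, width maps onto SBFM/UBFM with immr = lsb
    // and imms = lsb + width - 1, i.e. it extracts bits
    // [lsb + width - 1 : lsb] of src.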
2356     static const unsigned OpcTable[2][2] = {
2357         {AArch64::UBFMWri, AArch64::UBFMXri},
2358         {AArch64::SBFMWri, AArch64::SBFMXri}};
2359     bool IsSigned = Opcode == TargetOpcode::G_SBFX;
2360     unsigned Size = Ty.getSizeInBits();
2361     unsigned Opc = OpcTable[IsSigned][Size == 64];
2362     auto Cst1 =
2363         getConstantVRegValWithLookThrough(I.getOperand(2).getReg(), MRI);
2364     assert(Cst1 && "Should have gotten a constant for src 1?");
2365     auto Cst2 =
2366         getConstantVRegValWithLookThrough(I.getOperand(3).getReg(), MRI);
2367     assert(Cst2 && "Should have gotten a constant for src 2?");
2368     auto LSB = Cst1->Value.getZExtValue();
2369     auto Width = Cst2->Value.getZExtValue();
2370     auto BitfieldInst =
2371         MIB.buildInstr(Opc, {I.getOperand(0)}, {I.getOperand(1)})
2372             .addImm(LSB)
2373             .addImm(LSB + Width - 1);
2374     I.eraseFromParent();
2375     return constrainSelectedInstRegOperands(*BitfieldInst, TII, TRI, RBI);
2376   }
2377   case TargetOpcode::G_BRCOND:
2378     return selectCompareBranch(I, MF, MRI);
2379 
2380   case TargetOpcode::G_BRINDIRECT: {
2381     I.setDesc(TII.get(AArch64::BR));
2382     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2383   }
2384 
2385   case TargetOpcode::G_BRJT:
2386     return selectBrJT(I, MRI);
2387 
2388   case AArch64::G_ADD_LOW: {
2389     // This op may have been separated from its ADRP companion by the localizer
2390     // or some other code motion pass. Given that many CPUs will try to
2391     // macro fuse these operations anyway, select this into a MOVaddr pseudo
2392     // which will later be expanded into an ADRP+ADD pair after scheduling.
2393     MachineInstr *BaseMI = MRI.getVRegDef(I.getOperand(1).getReg());
2394     if (BaseMI->getOpcode() != AArch64::ADRP) {
2395       I.setDesc(TII.get(AArch64::ADDXri));
2396       I.addOperand(MachineOperand::CreateImm(0));
2397       return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2398     }
2399     assert(TM.getCodeModel() == CodeModel::Small &&
2400            "Expected small code model");
2401     auto Op1 = BaseMI->getOperand(1);
2402     auto Op2 = I.getOperand(2);
2403     auto MovAddr = MIB.buildInstr(AArch64::MOVaddr, {I.getOperand(0)}, {})
2404                        .addGlobalAddress(Op1.getGlobal(), Op1.getOffset(),
2405                                          Op1.getTargetFlags())
2406                        .addGlobalAddress(Op2.getGlobal(), Op2.getOffset(),
2407                                          Op2.getTargetFlags());
2408     I.eraseFromParent();
2409     return constrainSelectedInstRegOperands(*MovAddr, TII, TRI, RBI);
2410   }
2411 
2412   case TargetOpcode::G_BSWAP: {
2413     // Handle vector types for G_BSWAP directly.
2414     Register DstReg = I.getOperand(0).getReg();
2415     LLT DstTy = MRI.getType(DstReg);
2416 
2417     // We should only get vector types here; everything else is handled by the
2418     // importer right now.
2419     if (!DstTy.isVector() || DstTy.getSizeInBits() > 128) {
2420       LLVM_DEBUG(dbgs() << "Dst type for G_BSWAP currently unsupported.\n");
2421       return false;
2422     }
2423 
2424     // Only handle 4 and 2 element vectors for now.
2425     // TODO: 16-bit elements.
2426     unsigned NumElts = DstTy.getNumElements();
2427     if (NumElts != 4 && NumElts != 2) {
2428       LLVM_DEBUG(dbgs() << "Unsupported number of elements for G_BSWAP.\n");
2429       return false;
2430     }
2431 
2432     // Choose the correct opcode for the supported types. Right now, that's
2433     // v2s32, v4s32, and v2s64.
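    // (Byte-swapping each element is a byte reverse within each element, so
    // REV32 covers the 32-bit-element cases and REV64 the 64-bit-element case,
    // with the vector operand viewed as bytes.)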
2434     unsigned Opc = 0;
2435     unsigned EltSize = DstTy.getElementType().getSizeInBits();
2436     if (EltSize == 32)
2437       Opc = (DstTy.getNumElements() == 2) ? AArch64::REV32v8i8
2438                                           : AArch64::REV32v16i8;
2439     else if (EltSize == 64)
2440       Opc = AArch64::REV64v16i8;
2441 
2442     // We should always get something by the time we get here...
2443     assert(Opc != 0 && "Didn't get an opcode for G_BSWAP?");
2444 
2445     I.setDesc(TII.get(Opc));
2446     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2447   }
2448 
2449   case TargetOpcode::G_FCONSTANT:
2450   case TargetOpcode::G_CONSTANT: {
2451     const bool isFP = Opcode == TargetOpcode::G_FCONSTANT;
2452 
2453     const LLT s8 = LLT::scalar(8);
2454     const LLT s16 = LLT::scalar(16);
2455     const LLT s32 = LLT::scalar(32);
2456     const LLT s64 = LLT::scalar(64);
2457     const LLT s128 = LLT::scalar(128);
2458     const LLT p0 = LLT::pointer(0, 64);
2459 
2460     const Register DefReg = I.getOperand(0).getReg();
2461     const LLT DefTy = MRI.getType(DefReg);
2462     const unsigned DefSize = DefTy.getSizeInBits();
2463     const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
2464 
2465     // FIXME: Redundant check, but even less readable when factored out.
2466     if (isFP) {
2467       if (Ty != s32 && Ty != s64 && Ty != s128) {
2468         LLVM_DEBUG(dbgs() << "Unable to materialize FP " << Ty
2469                           << " constant, expected: " << s32 << " or " << s64
2470                           << " or " << s128 << '\n');
2471         return false;
2472       }
2473 
2474       if (RB.getID() != AArch64::FPRRegBankID) {
2475         LLVM_DEBUG(dbgs() << "Unable to materialize FP " << Ty
2476                           << " constant on bank: " << RB
2477                           << ", expected: FPR\n");
2478         return false;
2479       }
2480 
2481       // The case when we have 0.0 is covered by tablegen. Reject it here so we
2482       // can be sure tablegen works correctly and isn't rescued by this code.
2483       // 0.0 is not covered by tablegen for FP128. So we will handle this
2484       // scenario in the code here.
2485       if (DefSize != 128 && I.getOperand(1).getFPImm()->isExactlyValue(0.0))
2486         return false;
2487     } else {
2488       // s32 and s64 are covered by tablegen.
2489       if (Ty != p0 && Ty != s8 && Ty != s16) {
2490         LLVM_DEBUG(dbgs() << "Unable to materialize integer " << Ty
2491                           << " constant, expected: " << s32 << ", " << s64
2492                           << ", or " << p0 << '\n');
2493         return false;
2494       }
2495 
2496       if (RB.getID() != AArch64::GPRRegBankID) {
2497         LLVM_DEBUG(dbgs() << "Unable to materialize integer " << Ty
2498                           << " constant on bank: " << RB
2499                           << ", expected: GPR\n");
2500         return false;
2501       }
2502     }
2503 
2504     // We allow G_CONSTANT of types < 32b.
2505     const unsigned MovOpc =
2506         DefSize == 64 ? AArch64::MOVi64imm : AArch64::MOVi32imm;
2507 
2508     if (isFP) {
2509       // Either emit a FMOV, or emit a copy to emit a normal mov.
2510       const TargetRegisterClass &GPRRC =
2511           DefSize == 32 ? AArch64::GPR32RegClass : AArch64::GPR64RegClass;
2512       const TargetRegisterClass &FPRRC =
2513           DefSize == 32 ? AArch64::FPR32RegClass
2514                         : (DefSize == 64 ? AArch64::FPR64RegClass
2515                                          : AArch64::FPR128RegClass);
2516 
2517       // For 64b values, emit a constant pool load instead.
2518       // For s32, use a cp load if we have optsize/minsize.
2519       if (DefSize == 64 || DefSize == 128 ||
2520           (DefSize == 32 && shouldOptForSize(&MF))) {
2521         auto *FPImm = I.getOperand(1).getFPImm();
2522         auto *LoadMI = emitLoadFromConstantPool(FPImm, MIB);
2523         if (!LoadMI) {
2524           LLVM_DEBUG(dbgs() << "Failed to load double constant pool entry\n");
2525           return false;
2526         }
2527         MIB.buildCopy({DefReg}, {LoadMI->getOperand(0).getReg()});
2528         I.eraseFromParent();
2529         return RBI.constrainGenericRegister(DefReg, FPRRC, MRI);
2530       }
2531 
2532       // Nope. Emit a copy and use a normal mov instead.
2533       const Register DefGPRReg = MRI.createVirtualRegister(&GPRRC);
2534       MachineOperand &RegOp = I.getOperand(0);
2535       RegOp.setReg(DefGPRReg);
2536       MIB.setInsertPt(MIB.getMBB(), std::next(I.getIterator()));
2537       MIB.buildCopy({DefReg}, {DefGPRReg});
2538 
2539       if (!RBI.constrainGenericRegister(DefReg, FPRRC, MRI)) {
2540         LLVM_DEBUG(dbgs() << "Failed to constrain G_FCONSTANT def operand\n");
2541         return false;
2542       }
2543 
2544       MachineOperand &ImmOp = I.getOperand(1);
2545       // FIXME: Is going through int64_t always correct?
2546       ImmOp.ChangeToImmediate(
2547           ImmOp.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
2548     } else if (I.getOperand(1).isCImm()) {
2549       uint64_t Val = I.getOperand(1).getCImm()->getZExtValue();
2550       I.getOperand(1).ChangeToImmediate(Val);
2551     } else if (I.getOperand(1).isImm()) {
2552       uint64_t Val = I.getOperand(1).getImm();
2553       I.getOperand(1).ChangeToImmediate(Val);
2554     }
2555 
2556     I.setDesc(TII.get(MovOpc));
2557     constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2558     return true;
2559   }
2560   case TargetOpcode::G_EXTRACT: {
2561     Register DstReg = I.getOperand(0).getReg();
2562     Register SrcReg = I.getOperand(1).getReg();
2563     LLT SrcTy = MRI.getType(SrcReg);
2564     LLT DstTy = MRI.getType(DstReg);
2565     (void)DstTy;
2566     unsigned SrcSize = SrcTy.getSizeInBits();
2567 
2568     if (SrcTy.getSizeInBits() > 64) {
2569       // This should be an extract of an s128, which is like a vector extract.
2570       if (SrcTy.getSizeInBits() != 128)
2571         return false;
2572       // Only support extracting 64 bits from an s128 at the moment.
2573       if (DstTy.getSizeInBits() != 64)
2574         return false;
2575 
2576       unsigned Offset = I.getOperand(2).getImm();
2577       if (Offset % 64 != 0)
2578         return false;
2579 
2580       // Check we have the right regbank always.
2581       const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI);
2582       const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
2583       assert(SrcRB.getID() == DstRB.getID() && "Wrong extract regbank!");
2584 
2585       if (SrcRB.getID() == AArch64::GPRRegBankID) {
2586         MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {})
2587             .addUse(SrcReg, 0, Offset == 0 ? AArch64::sube64 : AArch64::subo64);
2588         I.eraseFromParent();
2589         return true;
2590       }
2591 
2592       // Emit the same code as a vector extract.
2593       // Offset must be a multiple of 64.
2594       unsigned LaneIdx = Offset / 64;
2595       MachineInstr *Extract = emitExtractVectorElt(
2596           DstReg, DstRB, LLT::scalar(64), SrcReg, LaneIdx, MIB);
2597       if (!Extract)
2598         return false;
2599       I.eraseFromParent();
2600       return true;
2601     }
2602 
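         // A scalar G_EXTRACT of DstSize bits at bit Offset maps onto UBFM
         // (i.e. UBFX) with immr = Offset and imms = Offset + DstSize - 1.
         // For example, extracting an s16 at offset 8 from an s32 becomes
         // UBFMWri dst, src, #8, #23.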
2603     I.setDesc(TII.get(SrcSize == 64 ? AArch64::UBFMXri : AArch64::UBFMWri));
2604     MachineInstrBuilder(MF, I).addImm(I.getOperand(2).getImm() +
2605                                       Ty.getSizeInBits() - 1);
2606 
2607     if (SrcSize < 64) {
2608       assert(SrcSize == 32 && DstTy.getSizeInBits() == 16 &&
2609              "unexpected G_EXTRACT types");
2610       return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2611     }
2612 
2613     DstReg = MRI.createGenericVirtualRegister(LLT::scalar(64));
2614     MIB.setInsertPt(MIB.getMBB(), std::next(I.getIterator()));
2615     MIB.buildInstr(TargetOpcode::COPY, {I.getOperand(0).getReg()}, {})
2616         .addReg(DstReg, 0, AArch64::sub_32);
2617     RBI.constrainGenericRegister(I.getOperand(0).getReg(),
2618                                  AArch64::GPR32RegClass, MRI);
2619     I.getOperand(0).setReg(DstReg);
2620 
2621     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2622   }
2623 
2624   case TargetOpcode::G_INSERT: {
2625     LLT SrcTy = MRI.getType(I.getOperand(2).getReg());
2626     LLT DstTy = MRI.getType(I.getOperand(0).getReg());
2627     unsigned DstSize = DstTy.getSizeInBits();
2628     // Larger inserts are vectors; same-size ones should be something else by
2629     // now (split up or turned into COPYs).
2630     if (Ty.getSizeInBits() > 64 || SrcTy.getSizeInBits() > 32)
2631       return false;
2632 
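         // A G_INSERT of Width bits at bit LSB maps onto BFM (i.e. BFI) with
         // immr = (DstSize - LSB) % DstSize and imms = Width - 1. For example,
         // inserting an s16 at bit 8 of an s32 becomes BFMWri dst, src, #24, #15.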
2633     I.setDesc(TII.get(DstSize == 64 ? AArch64::BFMXri : AArch64::BFMWri));
2634     unsigned LSB = I.getOperand(3).getImm();
2635     unsigned Width = MRI.getType(I.getOperand(2).getReg()).getSizeInBits();
2636     I.getOperand(3).setImm((DstSize - LSB) % DstSize);
2637     MachineInstrBuilder(MF, I).addImm(Width - 1);
2638 
2639     if (DstSize < 64) {
2640       assert(DstSize == 32 && SrcTy.getSizeInBits() == 16 &&
2641              "unexpected G_INSERT types");
2642       return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2643     }
2644 
2645     Register SrcReg = MRI.createGenericVirtualRegister(LLT::scalar(64));
2646     BuildMI(MBB, I.getIterator(), I.getDebugLoc(),
2647             TII.get(AArch64::SUBREG_TO_REG))
2648         .addDef(SrcReg)
2649         .addImm(0)
2650         .addUse(I.getOperand(2).getReg())
2651         .addImm(AArch64::sub_32);
2652     RBI.constrainGenericRegister(I.getOperand(2).getReg(),
2653                                  AArch64::GPR32RegClass, MRI);
2654     I.getOperand(2).setReg(SrcReg);
2655 
2656     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2657   }
2658   case TargetOpcode::G_FRAME_INDEX: {
2659     // allocas and G_FRAME_INDEX are only supported in addrspace(0).
2660     if (Ty != LLT::pointer(0, 64)) {
2661       LLVM_DEBUG(dbgs() << "G_FRAME_INDEX pointer has type: " << Ty
2662                         << ", expected: " << LLT::pointer(0, 64) << '\n');
2663       return false;
2664     }
2665     I.setDesc(TII.get(AArch64::ADDXri));
2666 
2667     // MOs for a #0 shifted immediate.
2668     I.addOperand(MachineOperand::CreateImm(0));
2669     I.addOperand(MachineOperand::CreateImm(0));
2670 
2671     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2672   }
2673 
2674   case TargetOpcode::G_GLOBAL_VALUE: {
2675     auto GV = I.getOperand(1).getGlobal();
2676     if (GV->isThreadLocal())
2677       return selectTLSGlobalValue(I, MRI);
2678 
2679     unsigned OpFlags = STI.ClassifyGlobalReference(GV, TM);
2680     if (OpFlags & AArch64II::MO_GOT) {
2681       I.setDesc(TII.get(AArch64::LOADgot));
2682       I.getOperand(1).setTargetFlags(OpFlags);
2683     } else if (TM.getCodeModel() == CodeModel::Large) {
2684       // Materialize the global using movz/movk instructions.
2685       materializeLargeCMVal(I, GV, OpFlags);
2686       I.eraseFromParent();
2687       return true;
2688     } else if (TM.getCodeModel() == CodeModel::Tiny) {
2689       I.setDesc(TII.get(AArch64::ADR));
2690       I.getOperand(1).setTargetFlags(OpFlags);
2691     } else {
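           // Small code model: MOVaddr is a pseudo that later expands to
           // ADRP + ADD, using PAGE / PAGEOFF+NC flags on the two halves.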
2692       I.setDesc(TII.get(AArch64::MOVaddr));
2693       I.getOperand(1).setTargetFlags(OpFlags | AArch64II::MO_PAGE);
2694       MachineInstrBuilder MIB(MF, I);
2695       MIB.addGlobalAddress(GV, I.getOperand(1).getOffset(),
2696                            OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
2697     }
2698     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2699   }
2700 
2701   case TargetOpcode::G_ZEXTLOAD:
2702   case TargetOpcode::G_LOAD:
2703   case TargetOpcode::G_STORE: {
2704     bool IsZExtLoad = I.getOpcode() == TargetOpcode::G_ZEXTLOAD;
2705     LLT PtrTy = MRI.getType(I.getOperand(1).getReg());
2706 
2707     if (PtrTy != LLT::pointer(0, 64)) {
2708       LLVM_DEBUG(dbgs() << "Load/Store pointer has type: " << PtrTy
2709                         << ", expected: " << LLT::pointer(0, 64) << '\n');
2710       return false;
2711     }
2712 
2713     auto &MemOp = **I.memoperands_begin();
2714     uint64_t MemSizeInBytes = MemOp.getSize();
2715     unsigned MemSizeInBits = MemSizeInBytes * 8;
2716     AtomicOrdering Order = MemOp.getSuccessOrdering();
2717 
2718     // Need special instructions for atomics that affect ordering.
2719     if (Order != AtomicOrdering::NotAtomic &&
2720         Order != AtomicOrdering::Unordered &&
2721         Order != AtomicOrdering::Monotonic) {
2722       assert(I.getOpcode() != TargetOpcode::G_ZEXTLOAD);
2723       if (MemSizeInBytes > 64)
2724         return false;
2725 
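           // The opcode tables are indexed by log2 of the access size in bytes:
           // 1 byte -> LDARB/STLRB, 2 -> LDARH/STLRH, 4 -> LDARW/STLRW,
           // 8 -> LDARX/STLRX.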
2726       if (I.getOpcode() == TargetOpcode::G_LOAD) {
2727         static unsigned Opcodes[] = {AArch64::LDARB, AArch64::LDARH,
2728                                      AArch64::LDARW, AArch64::LDARX};
2729         I.setDesc(TII.get(Opcodes[Log2_32(MemSizeInBytes)]));
2730       } else {
2731         static unsigned Opcodes[] = {AArch64::STLRB, AArch64::STLRH,
2732                                      AArch64::STLRW, AArch64::STLRX};
2733         I.setDesc(TII.get(Opcodes[Log2_32(MemSizeInBytes)]));
2734       }
2735       constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2736       return true;
2737     }
2738 
2739 #ifndef NDEBUG
2740     const Register PtrReg = I.getOperand(1).getReg();
2741     const RegisterBank &PtrRB = *RBI.getRegBank(PtrReg, MRI, TRI);
2742     // Sanity-check the pointer register.
2743     assert(PtrRB.getID() == AArch64::GPRRegBankID &&
2744            "Load/Store pointer operand isn't a GPR");
2745     assert(MRI.getType(PtrReg).isPointer() &&
2746            "Load/Store pointer operand isn't a pointer");
2747 #endif
2748 
2749     const Register ValReg = I.getOperand(0).getReg();
2750     const RegisterBank &RB = *RBI.getRegBank(ValReg, MRI, TRI);
2751 
2752     // Helper lambda for partially selecting I. Either returns the original
2753     // instruction with an updated opcode, or a new instruction.
2754     auto SelectLoadStoreAddressingMode = [&]() -> MachineInstr * {
2755       bool IsStore = I.getOpcode() == TargetOpcode::G_STORE;
2756       const unsigned NewOpc =
2757           selectLoadStoreUIOp(I.getOpcode(), RB.getID(), MemSizeInBits);
2758       if (NewOpc == I.getOpcode())
2759         return nullptr;
2760       // Check if we can fold anything into the addressing mode.
2761       auto AddrModeFns =
2762           selectAddrModeIndexed(I.getOperand(1), MemSizeInBytes);
2763       if (!AddrModeFns) {
2764         // Can't fold anything. Use the original instruction.
2765         I.setDesc(TII.get(NewOpc));
2766         I.addOperand(MachineOperand::CreateImm(0));
2767         return &I;
2768       }
2769 
2770       // Folded something. Create a new instruction and return it.
2771       auto NewInst = MIB.buildInstr(NewOpc, {}, {}, I.getFlags());
2772       IsStore ? NewInst.addUse(ValReg) : NewInst.addDef(ValReg);
2773       NewInst.cloneMemRefs(I);
2774       for (auto &Fn : *AddrModeFns)
2775         Fn(NewInst);
2776       I.eraseFromParent();
2777       return &*NewInst;
2778     };
2779 
2780     MachineInstr *LoadStore = SelectLoadStoreAddressingMode();
2781     if (!LoadStore)
2782       return false;
2783 
2784     // If we're storing a 0, use WZR/XZR.
2785     if (Opcode == TargetOpcode::G_STORE) {
2786       auto CVal = getConstantVRegValWithLookThrough(
2787           LoadStore->getOperand(0).getReg(), MRI, /*LookThroughInstrs = */ true,
2788           /*HandleFConstants = */ false);
2789       if (CVal && CVal->Value == 0) {
2790         switch (LoadStore->getOpcode()) {
2791         case AArch64::STRWui:
2792         case AArch64::STRHHui:
2793         case AArch64::STRBBui:
2794           LoadStore->getOperand(0).setReg(AArch64::WZR);
2795           break;
2796         case AArch64::STRXui:
2797           LoadStore->getOperand(0).setReg(AArch64::XZR);
2798           break;
2799         }
2800       }
2801     }
2802 
2803     if (IsZExtLoad) {
2804       // The zextload from a smaller type to i32 should be handled by the
2805       // importer.
2806       if (MRI.getType(LoadStore->getOperand(0).getReg()).getSizeInBits() != 64)
2807         return false;
2808       // If we have a ZEXTLOAD then change the load's type to be a narrower reg
2809       // and zero_extend with SUBREG_TO_REG.
2810       Register LdReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
2811       Register DstReg = LoadStore->getOperand(0).getReg();
2812       LoadStore->getOperand(0).setReg(LdReg);
2813 
2814       MIB.setInsertPt(MIB.getMBB(), std::next(LoadStore->getIterator()));
2815       MIB.buildInstr(AArch64::SUBREG_TO_REG, {DstReg}, {})
2816           .addImm(0)
2817           .addUse(LdReg)
2818           .addImm(AArch64::sub_32);
2819       constrainSelectedInstRegOperands(*LoadStore, TII, TRI, RBI);
2820       return RBI.constrainGenericRegister(DstReg, AArch64::GPR64allRegClass,
2821                                           MRI);
2822     }
2823     return constrainSelectedInstRegOperands(*LoadStore, TII, TRI, RBI);
2824   }
2825 
2826   case TargetOpcode::G_SMULH:
2827   case TargetOpcode::G_UMULH: {
2828     // Reject the various things we don't support yet.
2829     if (unsupportedBinOp(I, RBI, MRI, TRI))
2830       return false;
2831 
2832     const Register DefReg = I.getOperand(0).getReg();
2833     const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
2834 
2835     if (RB.getID() != AArch64::GPRRegBankID) {
2836       LLVM_DEBUG(dbgs() << "G_[SU]MULH on bank: " << RB << ", expected: GPR\n");
2837       return false;
2838     }
2839 
2840     if (Ty != LLT::scalar(64)) {
2841       LLVM_DEBUG(dbgs() << "G_[SU]MULH has type: " << Ty
2842                         << ", expected: " << LLT::scalar(64) << '\n');
2843       return false;
2844     }
2845 
2846     unsigned NewOpc = I.getOpcode() == TargetOpcode::G_SMULH ? AArch64::SMULHrr
2847                                                              : AArch64::UMULHrr;
2848     I.setDesc(TII.get(NewOpc));
2849 
2850     // Now that we selected an opcode, we need to constrain the register
2851     // operands to use appropriate classes.
2852     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2853   }
2854   case TargetOpcode::G_LSHR:
2855   case TargetOpcode::G_ASHR:
2856     if (MRI.getType(I.getOperand(0).getReg()).isVector())
2857       return selectVectorAshrLshr(I, MRI);
2858     LLVM_FALLTHROUGH;
2859   case TargetOpcode::G_SHL:
2860     if (Opcode == TargetOpcode::G_SHL &&
2861         MRI.getType(I.getOperand(0).getReg()).isVector())
2862       return selectVectorSHL(I, MRI);
2863     LLVM_FALLTHROUGH;
2864   case TargetOpcode::G_FADD:
2865   case TargetOpcode::G_FSUB:
2866   case TargetOpcode::G_FMUL:
2867   case TargetOpcode::G_FDIV:
2868   case TargetOpcode::G_OR: {
2869     // Reject the various things we don't support yet.
2870     if (unsupportedBinOp(I, RBI, MRI, TRI))
2871       return false;
2872 
2873     const unsigned OpSize = Ty.getSizeInBits();
2874 
2875     const Register DefReg = I.getOperand(0).getReg();
2876     const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
2877 
2878     const unsigned NewOpc = selectBinaryOp(I.getOpcode(), RB.getID(), OpSize);
2879     if (NewOpc == I.getOpcode())
2880       return false;
2881 
2882     I.setDesc(TII.get(NewOpc));
2883     // FIXME: Should the type be always reset in setDesc?
2884 
2885     // Now that we selected an opcode, we need to constrain the register
2886     // operands to use appropriate classes.
2887     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2888   }
2889 
2890   case TargetOpcode::G_PTR_ADD: {
2891     emitADD(I.getOperand(0).getReg(), I.getOperand(1), I.getOperand(2), MIB);
2892     I.eraseFromParent();
2893     return true;
2894   }
2895   case TargetOpcode::G_SADDO:
2896   case TargetOpcode::G_UADDO:
2897   case TargetOpcode::G_SSUBO:
2898   case TargetOpcode::G_USUBO: {
2899     // Emit the operation and get the correct condition code.
2900     auto OpAndCC = emitOverflowOp(Opcode, I.getOperand(0).getReg(),
2901                                   I.getOperand(2), I.getOperand(3), MIB);
2902 
2903     // Now, put the overflow result in the register given by the second def of
2904     // the overflow op (operand 1). CSINC increments the result when the
2905     // predicate is false, so to get the increment when the overflow condition
2906     // holds, we pass the inverted condition code.
2907     Register ZReg = AArch64::WZR;
2908     auto CsetMI = MIB.buildInstr(AArch64::CSINCWr, {I.getOperand(1).getReg()},
2909                                  {ZReg, ZReg})
2910                       .addImm(getInvertedCondCode(OpAndCC.second));
2911     constrainSelectedInstRegOperands(*CsetMI, TII, TRI, RBI);
2912     I.eraseFromParent();
2913     return true;
2914   }
2915 
2916   case TargetOpcode::G_PTRMASK: {
2917     Register MaskReg = I.getOperand(2).getReg();
2918     Optional<int64_t> MaskVal = getConstantVRegSExtVal(MaskReg, MRI);
2919     // TODO: Implement arbitrary cases
2920     if (!MaskVal || !isShiftedMask_64(*MaskVal))
2921       return false;
2922 
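         // The mask is a shifted run of ones, so the G_PTRMASK can be selected
         // as a single AND with a logical immediate.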
2923     uint64_t Mask = *MaskVal;
2924     I.setDesc(TII.get(AArch64::ANDXri));
2925     I.getOperand(2).ChangeToImmediate(
2926         AArch64_AM::encodeLogicalImmediate(Mask, 64));
2927 
2928     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2929   }
2930   case TargetOpcode::G_PTRTOINT:
2931   case TargetOpcode::G_TRUNC: {
2932     const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
2933     const LLT SrcTy = MRI.getType(I.getOperand(1).getReg());
2934 
2935     const Register DstReg = I.getOperand(0).getReg();
2936     const Register SrcReg = I.getOperand(1).getReg();
2937 
2938     const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
2939     const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI);
2940 
2941     if (DstRB.getID() != SrcRB.getID()) {
2942       LLVM_DEBUG(
2943           dbgs() << "G_TRUNC/G_PTRTOINT input/output on different banks\n");
2944       return false;
2945     }
2946 
2947     if (DstRB.getID() == AArch64::GPRRegBankID) {
2948       const TargetRegisterClass *DstRC =
2949           getRegClassForTypeOnBank(DstTy, DstRB, RBI);
2950       if (!DstRC)
2951         return false;
2952 
2953       const TargetRegisterClass *SrcRC =
2954           getRegClassForTypeOnBank(SrcTy, SrcRB, RBI);
2955       if (!SrcRC)
2956         return false;
2957 
2958       if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) ||
2959           !RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
2960         LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC/G_PTRTOINT\n");
2961         return false;
2962       }
2963 
2964       if (DstRC == SrcRC) {
2965         // Nothing to be done
2966       } else if (Opcode == TargetOpcode::G_TRUNC && DstTy == LLT::scalar(32) &&
2967                  SrcTy == LLT::scalar(64)) {
2968         llvm_unreachable("TableGen can import this case");
2969         return false;
2970       } else if (DstRC == &AArch64::GPR32RegClass &&
2971                  SrcRC == &AArch64::GPR64RegClass) {
2972         I.getOperand(1).setSubReg(AArch64::sub_32);
2973       } else {
2974         LLVM_DEBUG(
2975             dbgs() << "Unhandled mismatched classes in G_TRUNC/G_PTRTOINT\n");
2976         return false;
2977       }
2978 
2979       I.setDesc(TII.get(TargetOpcode::COPY));
2980       return true;
2981     } else if (DstRB.getID() == AArch64::FPRRegBankID) {
2982       if (DstTy == LLT::fixed_vector(4, 16) &&
2983           SrcTy == LLT::fixed_vector(4, 32)) {
2984         I.setDesc(TII.get(AArch64::XTNv4i16));
2985         constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2986         return true;
2987       }
2988 
2989       if (!SrcTy.isVector() && SrcTy.getSizeInBits() == 128) {
2990         MachineInstr *Extract = emitExtractVectorElt(
2991             DstReg, DstRB, LLT::scalar(DstTy.getSizeInBits()), SrcReg, 0, MIB);
2992         if (!Extract)
2993           return false;
2994         I.eraseFromParent();
2995         return true;
2996       }
2997 
2998       // We might have a vector G_PTRTOINT, in which case just emit a COPY.
2999       if (Opcode == TargetOpcode::G_PTRTOINT) {
3000         assert(DstTy.isVector() && "Expected an FPR ptrtoint to be a vector");
3001         I.setDesc(TII.get(TargetOpcode::COPY));
3002         return true;
3003       }
3004     }
3005 
3006     return false;
3007   }
3008 
3009   case TargetOpcode::G_ANYEXT: {
3010     const Register DstReg = I.getOperand(0).getReg();
3011     const Register SrcReg = I.getOperand(1).getReg();
3012 
3013     const RegisterBank &RBDst = *RBI.getRegBank(DstReg, MRI, TRI);
3014     if (RBDst.getID() != AArch64::GPRRegBankID) {
3015       LLVM_DEBUG(dbgs() << "G_ANYEXT on bank: " << RBDst
3016                         << ", expected: GPR\n");
3017       return false;
3018     }
3019 
3020     const RegisterBank &RBSrc = *RBI.getRegBank(SrcReg, MRI, TRI);
3021     if (RBSrc.getID() != AArch64::GPRRegBankID) {
3022       LLVM_DEBUG(dbgs() << "G_ANYEXT on bank: " << RBSrc
3023                         << ", expected: GPR\n");
3024       return false;
3025     }
3026 
3027     const unsigned DstSize = MRI.getType(DstReg).getSizeInBits();
3028 
3029     if (DstSize == 0) {
3030       LLVM_DEBUG(dbgs() << "G_ANYEXT operand has no size, not a gvreg?\n");
3031       return false;
3032     }
3033 
3034     if (DstSize != 64 && DstSize > 32) {
3035       LLVM_DEBUG(dbgs() << "G_ANYEXT to size: " << DstSize
3036                         << ", expected: 32 or 64\n");
3037       return false;
3038     }
3039     // At this point G_ANYEXT is just like a plain COPY, but we need
3040     // to explicitly form the 64-bit value if any.
3041     if (DstSize > 32) {
3042       Register ExtSrc = MRI.createVirtualRegister(&AArch64::GPR64allRegClass);
3043       BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::SUBREG_TO_REG))
3044           .addDef(ExtSrc)
3045           .addImm(0)
3046           .addUse(SrcReg)
3047           .addImm(AArch64::sub_32);
3048       I.getOperand(1).setReg(ExtSrc);
3049     }
3050     return selectCopy(I, TII, MRI, TRI, RBI);
3051   }
3052 
3053   case TargetOpcode::G_ZEXT:
3054   case TargetOpcode::G_SEXT_INREG:
3055   case TargetOpcode::G_SEXT: {
3056     unsigned Opcode = I.getOpcode();
3057     const bool IsSigned = Opcode != TargetOpcode::G_ZEXT;
3058     const Register DefReg = I.getOperand(0).getReg();
3059     Register SrcReg = I.getOperand(1).getReg();
3060     const LLT DstTy = MRI.getType(DefReg);
3061     const LLT SrcTy = MRI.getType(SrcReg);
3062     unsigned DstSize = DstTy.getSizeInBits();
3063     unsigned SrcSize = SrcTy.getSizeInBits();
3064 
3065     // SEXT_INREG has the same src reg size as dst, the size of the value to be
3066     // extended is encoded in the imm.
3067     if (Opcode == TargetOpcode::G_SEXT_INREG)
3068       SrcSize = I.getOperand(2).getImm();
3069 
3070     if (DstTy.isVector())
3071       return false; // Should be handled by imported patterns.
3072 
3073     assert((*RBI.getRegBank(DefReg, MRI, TRI)).getID() ==
3074                AArch64::GPRRegBankID &&
3075            "Unexpected ext regbank");
3076 
3077     MachineInstr *ExtI;
3078 
3079     // First, check if we're extending the result of a load with a dest type
3080     // smaller than 32 bits; in that case this zext is redundant. GPR32 is the
3081     // smallest GPR register class on AArch64, and all loads which are smaller
3082     // automatically zero-extend the upper bits. E.g.
3083     // %v(s8) = G_LOAD %p, :: (load 1)
3084     // %v2(s32) = G_ZEXT %v(s8)
3085     if (!IsSigned) {
3086       auto *LoadMI = getOpcodeDef(TargetOpcode::G_LOAD, SrcReg, MRI);
3087       bool IsGPR =
3088           RBI.getRegBank(SrcReg, MRI, TRI)->getID() == AArch64::GPRRegBankID;
3089       if (LoadMI && IsGPR) {
3090         const MachineMemOperand *MemOp = *LoadMI->memoperands_begin();
3091         unsigned BytesLoaded = MemOp->getSize();
3092         if (BytesLoaded < 4 && SrcTy.getSizeInBytes() == BytesLoaded)
3093           return selectCopy(I, TII, MRI, TRI, RBI);
3094       }
3095 
3096       // For the 32-bit -> 64-bit case, we can emit a mov (ORRWrs)
3097       // + SUBREG_TO_REG.
3098       //
3099       // If we are zero extending from 32 bits to 64 bits, it's possible that
3100       // the instruction implicitly does the zero extend for us. In that case,
3101       // we only need the SUBREG_TO_REG.
3102       if (IsGPR && SrcSize == 32 && DstSize == 64) {
3103         // Unlike with the G_LOAD case, we don't want to look through copies
3104         // here. (See isDef32.)
3105         MachineInstr *Def = MRI.getVRegDef(SrcReg);
3106         Register SubregToRegSrc = SrcReg;
3107 
3108         // Does the instruction implicitly zero extend?
3109         if (!Def || !isDef32(*Def)) {
3110           // No. Zero out using an OR.
3111           Register OrDst = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
3112           const Register ZReg = AArch64::WZR;
3113           MIB.buildInstr(AArch64::ORRWrs, {OrDst}, {ZReg, SrcReg}).addImm(0);
3114           SubregToRegSrc = OrDst;
3115         }
3116 
3117         MIB.buildInstr(AArch64::SUBREG_TO_REG, {DefReg}, {})
3118             .addImm(0)
3119             .addUse(SubregToRegSrc)
3120             .addImm(AArch64::sub_32);
3121 
3122         if (!RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass,
3123                                           MRI)) {
3124           LLVM_DEBUG(dbgs() << "Failed to constrain G_ZEXT destination\n");
3125           return false;
3126         }
3127 
3128         if (!RBI.constrainGenericRegister(SrcReg, AArch64::GPR32RegClass,
3129                                           MRI)) {
3130           LLVM_DEBUG(dbgs() << "Failed to constrain G_ZEXT source\n");
3131           return false;
3132         }
3133 
3134         I.eraseFromParent();
3135         return true;
3136       }
3137     }
3138 
3139     if (DstSize == 64) {
3140       if (Opcode != TargetOpcode::G_SEXT_INREG) {
3141         // FIXME: Can we avoid manually doing this?
3142         if (!RBI.constrainGenericRegister(SrcReg, AArch64::GPR32RegClass,
3143                                           MRI)) {
3144           LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(Opcode)
3145                             << " operand\n");
3146           return false;
3147         }
3148         SrcReg = MIB.buildInstr(AArch64::SUBREG_TO_REG,
3149                                 {&AArch64::GPR64RegClass}, {})
3150                      .addImm(0)
3151                      .addUse(SrcReg)
3152                      .addImm(AArch64::sub_32)
3153                      .getReg(0);
3154       }
3155 
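           // SBFM/UBFM with immr = 0 and imms = SrcSize - 1 sign/zero-extends
           // the low SrcSize bits of the source (i.e. an SBFX/UBFX of bits
           // [SrcSize-1:0]).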
3156       ExtI = MIB.buildInstr(IsSigned ? AArch64::SBFMXri : AArch64::UBFMXri,
3157                              {DefReg}, {SrcReg})
3158                   .addImm(0)
3159                   .addImm(SrcSize - 1);
3160     } else if (DstSize <= 32) {
3161       ExtI = MIB.buildInstr(IsSigned ? AArch64::SBFMWri : AArch64::UBFMWri,
3162                              {DefReg}, {SrcReg})
3163                   .addImm(0)
3164                   .addImm(SrcSize - 1);
3165     } else {
3166       return false;
3167     }
3168 
3169     constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
3170     I.eraseFromParent();
3171     return true;
3172   }
3173 
3174   case TargetOpcode::G_SITOFP:
3175   case TargetOpcode::G_UITOFP:
3176   case TargetOpcode::G_FPTOSI:
3177   case TargetOpcode::G_FPTOUI: {
3178     const LLT DstTy = MRI.getType(I.getOperand(0).getReg()),
3179               SrcTy = MRI.getType(I.getOperand(1).getReg());
3180     const unsigned NewOpc = selectFPConvOpc(Opcode, DstTy, SrcTy);
3181     if (NewOpc == Opcode)
3182       return false;
3183 
3184     I.setDesc(TII.get(NewOpc));
3185     constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3186 
3187     return true;
3188   }
3189 
3190   case TargetOpcode::G_FREEZE:
3191     return selectCopy(I, TII, MRI, TRI, RBI);
3192 
3193   case TargetOpcode::G_INTTOPTR:
3194     // The importer is currently unable to import pointer types since they
3195     // didn't exist in SelectionDAG.
3196     return selectCopy(I, TII, MRI, TRI, RBI);
3197 
3198   case TargetOpcode::G_BITCAST:
3199     // Imported SelectionDAG rules can handle every bitcast except those that
3200     // bitcast from a type to the same type. Ideally, these shouldn't occur
3201     // but we might not run an optimizer that deletes them. The other exception
3202     // is bitcasts involving pointer types, as SelectionDAG has no knowledge
3203     // of them.
3204     return selectCopy(I, TII, MRI, TRI, RBI);
3205 
3206   case TargetOpcode::G_SELECT: {
3207     if (MRI.getType(I.getOperand(1).getReg()) != LLT::scalar(1)) {
3208       LLVM_DEBUG(dbgs() << "G_SELECT cond has type: " << Ty
3209                         << ", expected: " << LLT::scalar(1) << '\n');
3210       return false;
3211     }
3212 
3213     const Register CondReg = I.getOperand(1).getReg();
3214     const Register TReg = I.getOperand(2).getReg();
3215     const Register FReg = I.getOperand(3).getReg();
3216 
3217     if (tryOptSelect(I))
3218       return true;
3219 
3220     // Make sure to use an unused vreg instead of wzr, so that the peephole
3221     // optimizations will be able to optimize these.
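         // The ANDS tests bit 0 of the condition and sets NZCV; emitSelect then
         // picks TReg on NE, i.e. when the condition bit is set.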
3222     Register DeadVReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
3223     auto TstMI = MIB.buildInstr(AArch64::ANDSWri, {DeadVReg}, {CondReg})
3224                      .addImm(AArch64_AM::encodeLogicalImmediate(1, 32));
3225     constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI);
3226     if (!emitSelect(I.getOperand(0).getReg(), TReg, FReg, AArch64CC::NE, MIB))
3227       return false;
3228     I.eraseFromParent();
3229     return true;
3230   }
3231   case TargetOpcode::G_ICMP: {
3232     if (Ty.isVector())
3233       return selectVectorICmp(I, MRI);
3234 
3235     if (Ty != LLT::scalar(32)) {
3236       LLVM_DEBUG(dbgs() << "G_ICMP result has type: " << Ty
3237                         << ", expected: " << LLT::scalar(32) << '\n');
3238       return false;
3239     }
3240 
3241     auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(1).getPredicate());
3242     emitIntegerCompare(I.getOperand(2), I.getOperand(3), I.getOperand(1),
3243                        MIB);
3244     emitCSetForICMP(I.getOperand(0).getReg(), Pred, MIB);
3245     I.eraseFromParent();
3246     return true;
3247   }
3248 
3249   case TargetOpcode::G_FCMP: {
3250     CmpInst::Predicate Pred =
3251         static_cast<CmpInst::Predicate>(I.getOperand(1).getPredicate());
3252     if (!emitFPCompare(I.getOperand(2).getReg(), I.getOperand(3).getReg(), MIB,
3253                        Pred) ||
3254         !emitCSetForFCmp(I.getOperand(0).getReg(), Pred, MIB))
3255       return false;
3256     I.eraseFromParent();
3257     return true;
3258   }
3259   case TargetOpcode::G_VASTART:
3260     return STI.isTargetDarwin() ? selectVaStartDarwin(I, MF, MRI)
3261                                 : selectVaStartAAPCS(I, MF, MRI);
3262   case TargetOpcode::G_INTRINSIC:
3263     return selectIntrinsic(I, MRI);
3264   case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
3265     return selectIntrinsicWithSideEffects(I, MRI);
3266   case TargetOpcode::G_IMPLICIT_DEF: {
3267     I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
3268     const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
3269     const Register DstReg = I.getOperand(0).getReg();
3270     const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
3271     const TargetRegisterClass *DstRC =
3272         getRegClassForTypeOnBank(DstTy, DstRB, RBI);
3273     RBI.constrainGenericRegister(DstReg, *DstRC, MRI);
3274     return true;
3275   }
3276   case TargetOpcode::G_BLOCK_ADDR: {
3277     if (TM.getCodeModel() == CodeModel::Large) {
3278       materializeLargeCMVal(I, I.getOperand(1).getBlockAddress(), 0);
3279       I.eraseFromParent();
3280       return true;
3281     } else {
3282       I.setDesc(TII.get(AArch64::MOVaddrBA));
3283       auto MovMI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::MOVaddrBA),
3284                            I.getOperand(0).getReg())
3285                        .addBlockAddress(I.getOperand(1).getBlockAddress(),
3286                                         /* Offset */ 0, AArch64II::MO_PAGE)
3287                        .addBlockAddress(
3288                            I.getOperand(1).getBlockAddress(), /* Offset */ 0,
3289                            AArch64II::MO_NC | AArch64II::MO_PAGEOFF);
3290       I.eraseFromParent();
3291       return constrainSelectedInstRegOperands(*MovMI, TII, TRI, RBI);
3292     }
3293   }
3294   case AArch64::G_DUP: {
3295     // When the scalar of G_DUP is an s8/s16 GPR, it can't be selected by the
3296     // imported patterns, so do it manually here. Avoiding the generation of
3297     // s16 GPRs is difficult because at regbankselect we may end up pessimizing
3298     // the FPR case if we decide to add an anyextend to fix this. Manual
3299     // selection is the most robust solution for now.
3300     if (RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI)->getID() !=
3301         AArch64::GPRRegBankID)
3302       return false; // We expect the fpr regbank case to be imported.
3303     LLT VecTy = MRI.getType(I.getOperand(0).getReg());
3304     if (VecTy == LLT::fixed_vector(8, 8))
3305       I.setDesc(TII.get(AArch64::DUPv8i8gpr));
3306     else if (VecTy == LLT::fixed_vector(16, 8))
3307       I.setDesc(TII.get(AArch64::DUPv16i8gpr));
3308     else if (VecTy == LLT::fixed_vector(4, 16))
3309       I.setDesc(TII.get(AArch64::DUPv4i16gpr));
3310     else if (VecTy == LLT::fixed_vector(8, 16))
3311       I.setDesc(TII.get(AArch64::DUPv8i16gpr));
3312     else
3313       return false;
3314     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3315   }
3316   case TargetOpcode::G_INTRINSIC_TRUNC:
3317     return selectIntrinsicTrunc(I, MRI);
3318   case TargetOpcode::G_INTRINSIC_ROUND:
3319     return selectIntrinsicRound(I, MRI);
3320   case TargetOpcode::G_BUILD_VECTOR:
3321     return selectBuildVector(I, MRI);
3322   case TargetOpcode::G_MERGE_VALUES:
3323     return selectMergeValues(I, MRI);
3324   case TargetOpcode::G_UNMERGE_VALUES:
3325     return selectUnmergeValues(I, MRI);
3326   case TargetOpcode::G_SHUFFLE_VECTOR:
3327     return selectShuffleVector(I, MRI);
3328   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
3329     return selectExtractElt(I, MRI);
3330   case TargetOpcode::G_INSERT_VECTOR_ELT:
3331     return selectInsertElt(I, MRI);
3332   case TargetOpcode::G_CONCAT_VECTORS:
3333     return selectConcatVectors(I, MRI);
3334   case TargetOpcode::G_JUMP_TABLE:
3335     return selectJumpTable(I, MRI);
3336   case TargetOpcode::G_VECREDUCE_FADD:
3337   case TargetOpcode::G_VECREDUCE_ADD:
3338     return selectReduction(I, MRI);
3339   }
3340 
3341   return false;
3342 }
3343 
3344 bool AArch64InstructionSelector::selectReduction(MachineInstr &I,
3345                                                  MachineRegisterInfo &MRI) {
3346   Register VecReg = I.getOperand(1).getReg();
3347   LLT VecTy = MRI.getType(VecReg);
3348   if (I.getOpcode() == TargetOpcode::G_VECREDUCE_ADD) {
3349     // For <2 x i32>, ADDPv2i32 generates an FPR64 value, so we need to emit
3350     // a subregister copy afterwards.
3351     if (VecTy == LLT::fixed_vector(2, 32)) {
3352       Register DstReg = I.getOperand(0).getReg();
3353       auto AddP = MIB.buildInstr(AArch64::ADDPv2i32, {&AArch64::FPR64RegClass},
3354                                  {VecReg, VecReg});
3355       auto Copy = MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {})
3356                       .addReg(AddP.getReg(0), 0, AArch64::ssub)
3357                       .getReg(0);
3358       RBI.constrainGenericRegister(Copy, AArch64::FPR32RegClass, MRI);
3359       I.eraseFromParent();
3360       return constrainSelectedInstRegOperands(*AddP, TII, TRI, RBI);
3361     }
3362 
3363     unsigned Opc = 0;
3364     if (VecTy == LLT::fixed_vector(16, 8))
3365       Opc = AArch64::ADDVv16i8v;
3366     else if (VecTy == LLT::fixed_vector(8, 16))
3367       Opc = AArch64::ADDVv8i16v;
3368     else if (VecTy == LLT::fixed_vector(4, 32))
3369       Opc = AArch64::ADDVv4i32v;
3370     else if (VecTy == LLT::fixed_vector(2, 64))
3371       Opc = AArch64::ADDPv2i64p;
3372     else {
3373       LLVM_DEBUG(dbgs() << "Unhandled type for add reduction\n");
3374       return false;
3375     }
3376     I.setDesc(TII.get(Opc));
3377     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3378   }
3379 
3380   if (I.getOpcode() == TargetOpcode::G_VECREDUCE_FADD) {
3381     unsigned Opc = 0;
3382     if (VecTy == LLT::fixed_vector(2, 32))
3383       Opc = AArch64::FADDPv2i32p;
3384     else if (VecTy == LLT::fixed_vector(2, 64))
3385       Opc = AArch64::FADDPv2i64p;
3386     else {
3387       LLVM_DEBUG(dbgs() << "Unhandled type for fadd reduction\n");
3388       return false;
3389     }
3390     I.setDesc(TII.get(Opc));
3391     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3392   }
3393   return false;
3394 }
3395 
3396 bool AArch64InstructionSelector::selectBrJT(MachineInstr &I,
3397                                             MachineRegisterInfo &MRI) {
3398   assert(I.getOpcode() == TargetOpcode::G_BRJT && "Expected G_BRJT");
3399   Register JTAddr = I.getOperand(0).getReg();
3400   unsigned JTI = I.getOperand(1).getIndex();
3401   Register Index = I.getOperand(2).getReg();
3402 
3403   Register TargetReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
3404   Register ScratchReg = MRI.createVirtualRegister(&AArch64::GPR64spRegClass);
3405 
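       // Jump table entries are 4 bytes. JumpTableDest32 is a pseudo that
       // computes the destination from the table address and the index (the
       // entries encode offsets relative to the table), and BR performs the
       // indirect branch.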
3406   MF->getInfo<AArch64FunctionInfo>()->setJumpTableEntryInfo(JTI, 4, nullptr);
3407   auto JumpTableInst = MIB.buildInstr(AArch64::JumpTableDest32,
3408                                       {TargetReg, ScratchReg}, {JTAddr, Index})
3409                            .addJumpTableIndex(JTI);
3410   // Build the indirect branch.
3411   MIB.buildInstr(AArch64::BR, {}, {TargetReg});
3412   I.eraseFromParent();
3413   return constrainSelectedInstRegOperands(*JumpTableInst, TII, TRI, RBI);
3414 }
3415 
3416 bool AArch64InstructionSelector::selectJumpTable(MachineInstr &I,
3417                                                  MachineRegisterInfo &MRI) {
3418   assert(I.getOpcode() == TargetOpcode::G_JUMP_TABLE && "Expected jump table");
3419   assert(I.getOperand(1).isJTI() && "Jump table op should have a JTI!");
3420 
3421   Register DstReg = I.getOperand(0).getReg();
3422   unsigned JTI = I.getOperand(1).getIndex();
3423   // We generate a MOVaddrJT which will get expanded to an ADRP + ADD later.
3424   auto MovMI =
3425     MIB.buildInstr(AArch64::MOVaddrJT, {DstReg}, {})
3426           .addJumpTableIndex(JTI, AArch64II::MO_PAGE)
3427           .addJumpTableIndex(JTI, AArch64II::MO_NC | AArch64II::MO_PAGEOFF);
3428   I.eraseFromParent();
3429   return constrainSelectedInstRegOperands(*MovMI, TII, TRI, RBI);
3430 }
3431 
3432 bool AArch64InstructionSelector::selectTLSGlobalValue(
3433     MachineInstr &I, MachineRegisterInfo &MRI) {
3434   if (!STI.isTargetMachO())
3435     return false;
3436   MachineFunction &MF = *I.getParent()->getParent();
3437   MF.getFrameInfo().setAdjustsStack(true);
3438 
3439   const auto &GlobalOp = I.getOperand(1);
3440   assert(GlobalOp.getOffset() == 0 &&
3441          "Shouldn't have an offset on TLS globals!");
3442   const GlobalValue &GV = *GlobalOp.getGlobal();
3443 
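       // MachO TLS access: load the variable's TLV descriptor address from the
       // GOT, load the resolver function pointer from the first word of the
       // descriptor, then call it with the descriptor address in X0; the
       // variable's address comes back in X0.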
3444   auto LoadGOT =
3445       MIB.buildInstr(AArch64::LOADgot, {&AArch64::GPR64commonRegClass}, {})
3446           .addGlobalAddress(&GV, 0, AArch64II::MO_TLS);
3447 
3448   auto Load = MIB.buildInstr(AArch64::LDRXui, {&AArch64::GPR64commonRegClass},
3449                              {LoadGOT.getReg(0)})
3450                   .addImm(0);
3451 
3452   MIB.buildCopy(Register(AArch64::X0), LoadGOT.getReg(0));
3453   // TLS calls preserve all registers except those that absolutely must be
3454   // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
3455   // silly).
3456   MIB.buildInstr(getBLRCallOpcode(MF), {}, {Load})
3457       .addUse(AArch64::X0, RegState::Implicit)
3458       .addDef(AArch64::X0, RegState::Implicit)
3459       .addRegMask(TRI.getTLSCallPreservedMask());
3460 
3461   MIB.buildCopy(I.getOperand(0).getReg(), Register(AArch64::X0));
3462   RBI.constrainGenericRegister(I.getOperand(0).getReg(), AArch64::GPR64RegClass,
3463                                MRI);
3464   I.eraseFromParent();
3465   return true;
3466 }
3467 
3468 bool AArch64InstructionSelector::selectIntrinsicTrunc(
3469     MachineInstr &I, MachineRegisterInfo &MRI) const {
3470   const LLT SrcTy = MRI.getType(I.getOperand(0).getReg());
3471 
3472   // Select the correct opcode.
3473   unsigned Opc = 0;
3474   if (!SrcTy.isVector()) {
3475     switch (SrcTy.getSizeInBits()) {
3476     default:
3477     case 16:
3478       Opc = AArch64::FRINTZHr;
3479       break;
3480     case 32:
3481       Opc = AArch64::FRINTZSr;
3482       break;
3483     case 64:
3484       Opc = AArch64::FRINTZDr;
3485       break;
3486     }
3487   } else {
3488     unsigned NumElts = SrcTy.getNumElements();
3489     switch (SrcTy.getElementType().getSizeInBits()) {
3490     default:
3491       break;
3492     case 16:
3493       if (NumElts == 4)
3494         Opc = AArch64::FRINTZv4f16;
3495       else if (NumElts == 8)
3496         Opc = AArch64::FRINTZv8f16;
3497       break;
3498     case 32:
3499       if (NumElts == 2)
3500         Opc = AArch64::FRINTZv2f32;
3501       else if (NumElts == 4)
3502         Opc = AArch64::FRINTZv4f32;
3503       break;
3504     case 64:
3505       if (NumElts == 2)
3506         Opc = AArch64::FRINTZv2f64;
3507       break;
3508     }
3509   }
3510 
3511   if (!Opc) {
3512     // Didn't get an opcode above, bail.
3513     LLVM_DEBUG(dbgs() << "Unsupported type for G_INTRINSIC_TRUNC!\n");
3514     return false;
3515   }
3516 
3517   // Legalization would have set us up perfectly for this; we just need to
3518   // set the opcode and move on.
3519   I.setDesc(TII.get(Opc));
3520   return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3521 }
3522 
3523 bool AArch64InstructionSelector::selectIntrinsicRound(
3524     MachineInstr &I, MachineRegisterInfo &MRI) const {
3525   const LLT SrcTy = MRI.getType(I.getOperand(0).getReg());
3526 
3527   // Select the correct opcode.
3528   unsigned Opc = 0;
3529   if (!SrcTy.isVector()) {
3530     switch (SrcTy.getSizeInBits()) {
3531     default:
3532     case 16:
3533       Opc = AArch64::FRINTAHr;
3534       break;
3535     case 32:
3536       Opc = AArch64::FRINTASr;
3537       break;
3538     case 64:
3539       Opc = AArch64::FRINTADr;
3540       break;
3541     }
3542   } else {
3543     unsigned NumElts = SrcTy.getNumElements();
3544     switch (SrcTy.getElementType().getSizeInBits()) {
3545     default:
3546       break;
3547     case 16:
3548       if (NumElts == 4)
3549         Opc = AArch64::FRINTAv4f16;
3550       else if (NumElts == 8)
3551         Opc = AArch64::FRINTAv8f16;
3552       break;
3553     case 32:
3554       if (NumElts == 2)
3555         Opc = AArch64::FRINTAv2f32;
3556       else if (NumElts == 4)
3557         Opc = AArch64::FRINTAv4f32;
3558       break;
3559     case 64:
3560       if (NumElts == 2)
3561         Opc = AArch64::FRINTAv2f64;
3562       break;
3563     }
3564   }
3565 
3566   if (!Opc) {
3567     // Didn't get an opcode above, bail.
3568     LLVM_DEBUG(dbgs() << "Unsupported type for G_INTRINSIC_ROUND!\n");
3569     return false;
3570   }
3571 
3572   // Legalization would have set us up perfectly for this; we just need to
3573   // set the opcode and move on.
3574   I.setDesc(TII.get(Opc));
3575   return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3576 }
3577 
3578 bool AArch64InstructionSelector::selectVectorICmp(
3579     MachineInstr &I, MachineRegisterInfo &MRI) {
3580   Register DstReg = I.getOperand(0).getReg();
3581   LLT DstTy = MRI.getType(DstReg);
3582   Register SrcReg = I.getOperand(2).getReg();
3583   Register Src2Reg = I.getOperand(3).getReg();
3584   LLT SrcTy = MRI.getType(SrcReg);
3585 
3586   unsigned SrcEltSize = SrcTy.getElementType().getSizeInBits();
3587   unsigned NumElts = DstTy.getNumElements();
3588 
3589   // First index is element size, 0 == 8b, 1 == 16b, 2 == 32b, 3 == 64b
3590   // Second index is num elts, 0 == v2, 1 == v4, 2 == v8, 3 == v16
3591   // Third index is cc opcode:
3592   // 0 == eq
3593   // 1 == ugt
3594   // 2 == uge
3595   // 3 == ult
3596   // 4 == ule
3597   // 5 == sgt
3598   // 6 == sge
3599   // 7 == slt
3600   // 8 == sle
3601   // ne is done by negating 'eq' result.
3602 
3603   // This table below assumes that for some comparisons the operands will be
3604   // commuted.
3605   // ult op == commute + ugt op
3606   // ule op == commute + uge op
3607   // slt op == commute + sgt op
3608   // sle op == commute + sge op
3609   unsigned PredIdx = 0;
3610   bool SwapOperands = false;
3611   CmpInst::Predicate Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();
3612   switch (Pred) {
3613   case CmpInst::ICMP_NE:
3614   case CmpInst::ICMP_EQ:
3615     PredIdx = 0;
3616     break;
3617   case CmpInst::ICMP_UGT:
3618     PredIdx = 1;
3619     break;
3620   case CmpInst::ICMP_UGE:
3621     PredIdx = 2;
3622     break;
3623   case CmpInst::ICMP_ULT:
3624     PredIdx = 3;
3625     SwapOperands = true;
3626     break;
3627   case CmpInst::ICMP_ULE:
3628     PredIdx = 4;
3629     SwapOperands = true;
3630     break;
3631   case CmpInst::ICMP_SGT:
3632     PredIdx = 5;
3633     break;
3634   case CmpInst::ICMP_SGE:
3635     PredIdx = 6;
3636     break;
3637   case CmpInst::ICMP_SLT:
3638     PredIdx = 7;
3639     SwapOperands = true;
3640     break;
3641   case CmpInst::ICMP_SLE:
3642     PredIdx = 8;
3643     SwapOperands = true;
3644     break;
3645   default:
3646     llvm_unreachable("Unhandled icmp predicate");
3647     return false;
3648   }
3649 
3650   // This table obviously should be tablegen'd when we have our GISel native
3651   // tablegen selector.
3652 
3653   static const unsigned OpcTable[4][4][9] = {
3654       {
3655           {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3656            0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3657            0 /* invalid */},
3658           {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3659            0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3660            0 /* invalid */},
3661           {AArch64::CMEQv8i8, AArch64::CMHIv8i8, AArch64::CMHSv8i8,
3662            AArch64::CMHIv8i8, AArch64::CMHSv8i8, AArch64::CMGTv8i8,
3663            AArch64::CMGEv8i8, AArch64::CMGTv8i8, AArch64::CMGEv8i8},
3664           {AArch64::CMEQv16i8, AArch64::CMHIv16i8, AArch64::CMHSv16i8,
3665            AArch64::CMHIv16i8, AArch64::CMHSv16i8, AArch64::CMGTv16i8,
3666            AArch64::CMGEv16i8, AArch64::CMGTv16i8, AArch64::CMGEv16i8}
3667       },
3668       {
3669           {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3670            0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3671            0 /* invalid */},
3672           {AArch64::CMEQv4i16, AArch64::CMHIv4i16, AArch64::CMHSv4i16,
3673            AArch64::CMHIv4i16, AArch64::CMHSv4i16, AArch64::CMGTv4i16,
3674            AArch64::CMGEv4i16, AArch64::CMGTv4i16, AArch64::CMGEv4i16},
3675           {AArch64::CMEQv8i16, AArch64::CMHIv8i16, AArch64::CMHSv8i16,
3676            AArch64::CMHIv8i16, AArch64::CMHSv8i16, AArch64::CMGTv8i16,
3677            AArch64::CMGEv8i16, AArch64::CMGTv8i16, AArch64::CMGEv8i16},
3678           {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3679            0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3680            0 /* invalid */}
3681       },
3682       {
3683           {AArch64::CMEQv2i32, AArch64::CMHIv2i32, AArch64::CMHSv2i32,
3684            AArch64::CMHIv2i32, AArch64::CMHSv2i32, AArch64::CMGTv2i32,
3685            AArch64::CMGEv2i32, AArch64::CMGTv2i32, AArch64::CMGEv2i32},
3686           {AArch64::CMEQv4i32, AArch64::CMHIv4i32, AArch64::CMHSv4i32,
3687            AArch64::CMHIv4i32, AArch64::CMHSv4i32, AArch64::CMGTv4i32,
3688            AArch64::CMGEv4i32, AArch64::CMGTv4i32, AArch64::CMGEv4i32},
3689           {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3690            0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3691            0 /* invalid */},
3692           {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3693            0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3694            0 /* invalid */}
3695       },
3696       {
3697           {AArch64::CMEQv2i64, AArch64::CMHIv2i64, AArch64::CMHSv2i64,
3698            AArch64::CMHIv2i64, AArch64::CMHSv2i64, AArch64::CMGTv2i64,
3699            AArch64::CMGEv2i64, AArch64::CMGTv2i64, AArch64::CMGEv2i64},
3700           {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3701            0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3702            0 /* invalid */},
3703           {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3704            0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3705            0 /* invalid */},
3706           {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3707            0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3708            0 /* invalid */}
3709       },
3710   };
3711   unsigned EltIdx = Log2_32(SrcEltSize / 8);
3712   unsigned NumEltsIdx = Log2_32(NumElts / 2);
3713   unsigned Opc = OpcTable[EltIdx][NumEltsIdx][PredIdx];
3714   if (!Opc) {
3715     LLVM_DEBUG(dbgs() << "Could not map G_ICMP to cmp opcode\n");
3716     return false;
3717   }
3718 
3719   const RegisterBank &VecRB = *RBI.getRegBank(SrcReg, MRI, TRI);
3720   const TargetRegisterClass *SrcRC =
3721       getRegClassForTypeOnBank(SrcTy, VecRB, RBI, true);
3722   if (!SrcRC) {
3723     LLVM_DEBUG(dbgs() << "Could not determine source register class.\n");
3724     return false;
3725   }
3726 
3727   unsigned NotOpc = Pred == ICmpInst::ICMP_NE ? AArch64::NOTv8i8 : 0;
3728   if (SrcTy.getSizeInBits() == 128)
3729     NotOpc = NotOpc ? AArch64::NOTv16i8 : 0;
3730 
3731   if (SwapOperands)
3732     std::swap(SrcReg, Src2Reg);
3733 
3734   auto Cmp = MIB.buildInstr(Opc, {SrcRC}, {SrcReg, Src2Reg});
3735   constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI);
3736 
3737   // Invert if we had a 'ne' cc.
3738   if (NotOpc) {
3739     Cmp = MIB.buildInstr(NotOpc, {DstReg}, {Cmp});
3740     constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI);
3741   } else {
3742     MIB.buildCopy(DstReg, Cmp.getReg(0));
3743   }
3744   RBI.constrainGenericRegister(DstReg, *SrcRC, MRI);
3745   I.eraseFromParent();
3746   return true;
3747 }
3748 
3749 MachineInstr *AArch64InstructionSelector::emitScalarToVector(
3750     unsigned EltSize, const TargetRegisterClass *DstRC, Register Scalar,
3751     MachineIRBuilder &MIRBuilder) const {
3752   auto Undef = MIRBuilder.buildInstr(TargetOpcode::IMPLICIT_DEF, {DstRC}, {});
3753 
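       // Insert the scalar into lane 0 of an IMPLICIT_DEF vector by doing an
       // INSERT_SUBREG on the subregister matching the element size
       // (hsub/ssub/dsub).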
3754   auto BuildFn = [&](unsigned SubregIndex) {
3755     auto Ins =
3756         MIRBuilder
3757             .buildInstr(TargetOpcode::INSERT_SUBREG, {DstRC}, {Undef, Scalar})
3758             .addImm(SubregIndex);
3759     constrainSelectedInstRegOperands(*Undef, TII, TRI, RBI);
3760     constrainSelectedInstRegOperands(*Ins, TII, TRI, RBI);
3761     return &*Ins;
3762   };
3763 
3764   switch (EltSize) {
3765   case 16:
3766     return BuildFn(AArch64::hsub);
3767   case 32:
3768     return BuildFn(AArch64::ssub);
3769   case 64:
3770     return BuildFn(AArch64::dsub);
3771   default:
3772     return nullptr;
3773   }
3774 }
3775 
3776 bool AArch64InstructionSelector::selectMergeValues(
3777     MachineInstr &I, MachineRegisterInfo &MRI) {
3778   assert(I.getOpcode() == TargetOpcode::G_MERGE_VALUES && "unexpected opcode");
3779   const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
3780   const LLT SrcTy = MRI.getType(I.getOperand(1).getReg());
3781   assert(!DstTy.isVector() && !SrcTy.isVector() && "invalid merge operation");
3782   const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI);
3783 
3784   if (I.getNumOperands() != 3)
3785     return false;
3786 
3787   // Merging 2 s64s into an s128.
3788   if (DstTy == LLT::scalar(128)) {
3789     if (SrcTy.getSizeInBits() != 64)
3790       return false;
3791     Register DstReg = I.getOperand(0).getReg();
3792     Register Src1Reg = I.getOperand(1).getReg();
3793     Register Src2Reg = I.getOperand(2).getReg();
3794     auto Tmp = MIB.buildInstr(TargetOpcode::IMPLICIT_DEF, {DstTy}, {});
3795     MachineInstr *InsMI =
3796         emitLaneInsert(None, Tmp.getReg(0), Src1Reg, /* LaneIdx */ 0, RB, MIB);
3797     if (!InsMI)
3798       return false;
3799     MachineInstr *Ins2MI = emitLaneInsert(DstReg, InsMI->getOperand(0).getReg(),
3800                                           Src2Reg, /* LaneIdx */ 1, RB, MIB);
3801     if (!Ins2MI)
3802       return false;
3803     constrainSelectedInstRegOperands(*InsMI, TII, TRI, RBI);
3804     constrainSelectedInstRegOperands(*Ins2MI, TII, TRI, RBI);
3805     I.eraseFromParent();
3806     return true;
3807   }
3808 
3809   if (RB.getID() != AArch64::GPRRegBankID)
3810     return false;
3811 
3812   if (DstTy.getSizeInBits() != 64 || SrcTy.getSizeInBits() != 32)
3813     return false;
3814 
3815   auto *DstRC = &AArch64::GPR64RegClass;
3816   Register SubToRegDef = MRI.createVirtualRegister(DstRC);
3817   MachineInstr &SubRegMI = *BuildMI(*I.getParent(), I, I.getDebugLoc(),
3818                                     TII.get(TargetOpcode::SUBREG_TO_REG))
3819                                 .addDef(SubToRegDef)
3820                                 .addImm(0)
3821                                 .addUse(I.getOperand(1).getReg())
3822                                 .addImm(AArch64::sub_32);
3823   Register SubToRegDef2 = MRI.createVirtualRegister(DstRC);
3824   // Need to anyext the second scalar before we can use BFM.
3825   MachineInstr &SubRegMI2 = *BuildMI(*I.getParent(), I, I.getDebugLoc(),
3826                                     TII.get(TargetOpcode::SUBREG_TO_REG))
3827                                 .addDef(SubToRegDef2)
3828                                 .addImm(0)
3829                                 .addUse(I.getOperand(2).getReg())
3830                                 .addImm(AArch64::sub_32);
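       // BFMXri with immr = 32 and imms = 31 acts as BFI dst, src2, #32, #32:
       // keep the low 32 bits from the first (tied) source and insert the low
       // 32 bits of the second source into bits [63:32].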
3831   MachineInstr &BFM =
3832       *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::BFMXri))
3833            .addDef(I.getOperand(0).getReg())
3834            .addUse(SubToRegDef)
3835            .addUse(SubToRegDef2)
3836            .addImm(32)
3837            .addImm(31);
3838   constrainSelectedInstRegOperands(SubRegMI, TII, TRI, RBI);
3839   constrainSelectedInstRegOperands(SubRegMI2, TII, TRI, RBI);
3840   constrainSelectedInstRegOperands(BFM, TII, TRI, RBI);
3841   I.eraseFromParent();
3842   return true;
3843 }
3844 
3845 static bool getLaneCopyOpcode(unsigned &CopyOpc, unsigned &ExtractSubReg,
3846                               const unsigned EltSize) {
3847   // Choose a lane copy opcode and subregister based off of the size of the
3848   // vector's elements.
3849   switch (EltSize) {
3850   case 16:
3851     CopyOpc = AArch64::CPYi16;
3852     ExtractSubReg = AArch64::hsub;
3853     break;
3854   case 32:
3855     CopyOpc = AArch64::CPYi32;
3856     ExtractSubReg = AArch64::ssub;
3857     break;
3858   case 64:
3859     CopyOpc = AArch64::CPYi64;
3860     ExtractSubReg = AArch64::dsub;
3861     break;
3862   default:
3863     // Unknown size, bail out.
3864     LLVM_DEBUG(dbgs() << "Elt size '" << EltSize << "' unsupported.\n");
3865     return false;
3866   }
3867   return true;
3868 }
3869 
3870 MachineInstr *AArch64InstructionSelector::emitExtractVectorElt(
3871     Optional<Register> DstReg, const RegisterBank &DstRB, LLT ScalarTy,
3872     Register VecReg, unsigned LaneIdx, MachineIRBuilder &MIRBuilder) const {
3873   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
3874   unsigned CopyOpc = 0;
3875   unsigned ExtractSubReg = 0;
3876   if (!getLaneCopyOpcode(CopyOpc, ExtractSubReg, ScalarTy.getSizeInBits())) {
3877     LLVM_DEBUG(
3878         dbgs() << "Couldn't determine lane copy opcode for instruction.\n");
3879     return nullptr;
3880   }
3881 
3882   const TargetRegisterClass *DstRC =
3883       getRegClassForTypeOnBank(ScalarTy, DstRB, RBI, true);
3884   if (!DstRC) {
3885     LLVM_DEBUG(dbgs() << "Could not determine destination register class.\n");
3886     return nullptr;
3887   }
3888 
3889   const RegisterBank &VecRB = *RBI.getRegBank(VecReg, MRI, TRI);
3890   const LLT &VecTy = MRI.getType(VecReg);
3891   const TargetRegisterClass *VecRC =
3892       getRegClassForTypeOnBank(VecTy, VecRB, RBI, true);
3893   if (!VecRC) {
3894     LLVM_DEBUG(dbgs() << "Could not determine source register class.\n");
3895     return nullptr;
3896   }
3897 
3898   // The register that we're going to copy into.
3899   Register InsertReg = VecReg;
3900   if (!DstReg)
3901     DstReg = MRI.createVirtualRegister(DstRC);
3902   // If the lane index is 0, we just use a subregister COPY.
3903   if (LaneIdx == 0) {
3904     auto Copy = MIRBuilder.buildInstr(TargetOpcode::COPY, {*DstReg}, {})
3905                     .addReg(VecReg, 0, ExtractSubReg);
3906     RBI.constrainGenericRegister(*DstReg, *DstRC, MRI);
3907     return &*Copy;
3908   }
3909 
3910   // Lane copies require 128-bit wide registers. If we're dealing with an
3911   // unpacked vector, then we need to move up to that width. Insert an implicit
3912   // def and a subregister insert to get us there.
3913   if (VecTy.getSizeInBits() != 128) {
3914     MachineInstr *ScalarToVector = emitScalarToVector(
3915         VecTy.getSizeInBits(), &AArch64::FPR128RegClass, VecReg, MIRBuilder);
3916     if (!ScalarToVector)
3917       return nullptr;
3918     InsertReg = ScalarToVector->getOperand(0).getReg();
3919   }
3920 
3921   MachineInstr *LaneCopyMI =
3922       MIRBuilder.buildInstr(CopyOpc, {*DstReg}, {InsertReg}).addImm(LaneIdx);
3923   constrainSelectedInstRegOperands(*LaneCopyMI, TII, TRI, RBI);
3924 
3925   // Make sure that we actually constrain the initial copy.
3926   RBI.constrainGenericRegister(*DstReg, *DstRC, MRI);
3927   return LaneCopyMI;
3928 }
3929 
3930 bool AArch64InstructionSelector::selectExtractElt(
3931     MachineInstr &I, MachineRegisterInfo &MRI) {
3932   assert(I.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT &&
3933          "unexpected opcode!");
3934   Register DstReg = I.getOperand(0).getReg();
3935   const LLT NarrowTy = MRI.getType(DstReg);
3936   const Register SrcReg = I.getOperand(1).getReg();
3937   const LLT WideTy = MRI.getType(SrcReg);
3938   (void)WideTy;
3939   assert(WideTy.getSizeInBits() >= NarrowTy.getSizeInBits() &&
3940          "source register size too small!");
3941   assert(!NarrowTy.isVector() && "cannot extract vector into vector!");
3942 
3943   // Need the lane index to determine the correct copy opcode.
3944   MachineOperand &LaneIdxOp = I.getOperand(2);
3945   assert(LaneIdxOp.isReg() && "Lane index operand was not a register?");
3946 
3947   if (RBI.getRegBank(DstReg, MRI, TRI)->getID() != AArch64::FPRRegBankID) {
3948     LLVM_DEBUG(dbgs() << "Cannot extract into GPR.\n");
3949     return false;
3950   }
3951 
3952   // Find the index to extract from.
3953   auto VRegAndVal = getConstantVRegValWithLookThrough(LaneIdxOp.getReg(), MRI);
3954   if (!VRegAndVal)
3955     return false;
3956   unsigned LaneIdx = VRegAndVal->Value.getSExtValue();
3957 
3958 
3959   const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
3960   MachineInstr *Extract = emitExtractVectorElt(DstReg, DstRB, NarrowTy, SrcReg,
3961                                                LaneIdx, MIB);
3962   if (!Extract)
3963     return false;
3964 
3965   I.eraseFromParent();
3966   return true;
3967 }
3968 
3969 bool AArch64InstructionSelector::selectSplitVectorUnmerge(
3970     MachineInstr &I, MachineRegisterInfo &MRI) {
3971   unsigned NumElts = I.getNumOperands() - 1;
3972   Register SrcReg = I.getOperand(NumElts).getReg();
3973   const LLT NarrowTy = MRI.getType(I.getOperand(0).getReg());
3974   const LLT SrcTy = MRI.getType(SrcReg);
3975 
3976   assert(NarrowTy.isVector() && "Expected an unmerge into vectors");
3977   if (SrcTy.getSizeInBits() > 128) {
3978     LLVM_DEBUG(dbgs() << "Unexpected vector type for vec split unmerge");
3979     return false;
3980   }
3981 
3982   // We implement a split vector operation by treating the sub-vectors as
3983   // scalars and extracting them.
3984   const RegisterBank &DstRB =
3985       *RBI.getRegBank(I.getOperand(0).getReg(), MRI, TRI);
3986   for (unsigned OpIdx = 0; OpIdx < NumElts; ++OpIdx) {
3987     Register Dst = I.getOperand(OpIdx).getReg();
3988     MachineInstr *Extract =
3989         emitExtractVectorElt(Dst, DstRB, NarrowTy, SrcReg, OpIdx, MIB);
3990     if (!Extract)
3991       return false;
3992   }
3993   I.eraseFromParent();
3994   return true;
3995 }
3996 
3997 bool AArch64InstructionSelector::selectUnmergeValues(MachineInstr &I,
3998                                                      MachineRegisterInfo &MRI) {
3999   assert(I.getOpcode() == TargetOpcode::G_UNMERGE_VALUES &&
4000          "unexpected opcode");
4001 
4002   // TODO: Handle unmerging into GPRs and from scalars to scalars.
4003   if (RBI.getRegBank(I.getOperand(0).getReg(), MRI, TRI)->getID() !=
4004           AArch64::FPRRegBankID ||
4005       RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI)->getID() !=
4006           AArch64::FPRRegBankID) {
4007     LLVM_DEBUG(dbgs() << "Unmerging vector-to-gpr and scalar-to-scalar "
4008                          "currently unsupported.\n");
4009     return false;
4010   }
4011 
4012   // The last operand is the vector source register, and every other operand is
4013   // a register to unpack into.
4014   unsigned NumElts = I.getNumOperands() - 1;
4015   Register SrcReg = I.getOperand(NumElts).getReg();
4016   const LLT NarrowTy = MRI.getType(I.getOperand(0).getReg());
4017   const LLT WideTy = MRI.getType(SrcReg);
4018   (void)WideTy;
4019   assert((WideTy.isVector() || WideTy.getSizeInBits() == 128) &&
4020          "can only unmerge from vector or s128 types!");
4021   assert(WideTy.getSizeInBits() > NarrowTy.getSizeInBits() &&
4022          "source register size too small!");
4023 
4024   if (!NarrowTy.isScalar())
4025     return selectSplitVectorUnmerge(I, MRI);
4026 
4027   // Choose a lane copy opcode and subregister based off of the size of the
4028   // vector's elements.
4029   unsigned CopyOpc = 0;
4030   unsigned ExtractSubReg = 0;
4031   if (!getLaneCopyOpcode(CopyOpc, ExtractSubReg, NarrowTy.getSizeInBits()))
4032     return false;
4033 
4034   // Set up for the lane copies.
4035   MachineBasicBlock &MBB = *I.getParent();
4036 
4037   // Stores the registers we'll be copying from.
4038   SmallVector<Register, 4> InsertRegs;
4039 
4040   // We'll use the first register twice, so we only need NumElts-1 registers.
4041   unsigned NumInsertRegs = NumElts - 1;
4042 
4043   // If our elements fit into exactly 128 bits, then we can copy from the source
4044   // directly. Otherwise, we need to do a bit of setup with some subregister
4045   // inserts.
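       // For example, unmerging four s32s from a <4 x s32> source (4 * 32 ==
       // 128) can copy lanes straight out of SrcReg, whereas unmerging four
       // s16s from a 64-bit <4 x s16> source first needs it widened to an
       // FPR128 register.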
4046   if (NarrowTy.getSizeInBits() * NumElts == 128) {
4047     InsertRegs = SmallVector<Register, 4>(NumInsertRegs, SrcReg);
4048   } else {
4049     // No. We have to perform subregister inserts. For each insert, create an
4050     // implicit def and a subregister insert, and save the register we create.
4051     const TargetRegisterClass *RC =
4052         getMinClassForRegBank(*RBI.getRegBank(SrcReg, MRI, TRI),
4053                               WideTy.getScalarSizeInBits() * NumElts);
4054     unsigned SubReg = 0;
4055     bool Found = getSubRegForClass(RC, TRI, SubReg);
4056     (void)Found;
4057     assert(Found && "expected to find last operand's subreg idx");
4058     for (unsigned Idx = 0; Idx < NumInsertRegs; ++Idx) {
4059       Register ImpDefReg = MRI.createVirtualRegister(&AArch64::FPR128RegClass);
4060       MachineInstr &ImpDefMI =
4061           *BuildMI(MBB, I, I.getDebugLoc(), TII.get(TargetOpcode::IMPLICIT_DEF),
4062                    ImpDefReg);
4063 
4064       // Now, create the subregister insert from SrcReg.
4065       Register InsertReg = MRI.createVirtualRegister(&AArch64::FPR128RegClass);
4066       MachineInstr &InsMI =
4067           *BuildMI(MBB, I, I.getDebugLoc(),
4068                    TII.get(TargetOpcode::INSERT_SUBREG), InsertReg)
4069                .addUse(ImpDefReg)
4070                .addUse(SrcReg)
4071                .addImm(SubReg);
4072 
4073       constrainSelectedInstRegOperands(ImpDefMI, TII, TRI, RBI);
4074       constrainSelectedInstRegOperands(InsMI, TII, TRI, RBI);
4075 
4076       // Save the register so that we can copy from it after.
4077       InsertRegs.push_back(InsertReg);
4078     }
4079   }
4080 
4081   // Now that we've created any necessary subregister inserts, we can
4082   // create the copies.
4083   //
4084   // Perform the first copy separately as a subregister copy.
4085   Register CopyTo = I.getOperand(0).getReg();
4086   auto FirstCopy = MIB.buildInstr(TargetOpcode::COPY, {CopyTo}, {})
4087                        .addReg(InsertRegs[0], 0, ExtractSubReg);
4088   constrainSelectedInstRegOperands(*FirstCopy, TII, TRI, RBI);
4089 
4090   // Now, perform the remaining copies as vector lane copies.
4091   unsigned LaneIdx = 1;
4092   for (Register InsReg : InsertRegs) {
4093     Register CopyTo = I.getOperand(LaneIdx).getReg();
4094     MachineInstr &CopyInst =
4095         *BuildMI(MBB, I, I.getDebugLoc(), TII.get(CopyOpc), CopyTo)
4096              .addUse(InsReg)
4097              .addImm(LaneIdx);
4098     constrainSelectedInstRegOperands(CopyInst, TII, TRI, RBI);
4099     ++LaneIdx;
4100   }
4101 
4102   // Separately constrain the first copy's destination. Because of the
4103   // limitation in constrainOperandRegClass, we can't guarantee that this will
4104   // actually be constrained. So, do it ourselves using the second operand.
4105   const TargetRegisterClass *RC =
4106       MRI.getRegClassOrNull(I.getOperand(1).getReg());
4107   if (!RC) {
4108     LLVM_DEBUG(dbgs() << "Couldn't constrain copy destination.\n");
4109     return false;
4110   }
4111 
4112   RBI.constrainGenericRegister(CopyTo, *RC, MRI);
4113   I.eraseFromParent();
4114   return true;
4115 }
4116 
4117 bool AArch64InstructionSelector::selectConcatVectors(
4118     MachineInstr &I, MachineRegisterInfo &MRI)  {
4119   assert(I.getOpcode() == TargetOpcode::G_CONCAT_VECTORS &&
4120          "Unexpected opcode");
4121   Register Dst = I.getOperand(0).getReg();
4122   Register Op1 = I.getOperand(1).getReg();
4123   Register Op2 = I.getOperand(2).getReg();
4124   MachineInstr *ConcatMI = emitVectorConcat(Dst, Op1, Op2, MIB);
4125   if (!ConcatMI)
4126     return false;
4127   I.eraseFromParent();
4128   return true;
4129 }
4130 
4131 unsigned
4132 AArch64InstructionSelector::emitConstantPoolEntry(const Constant *CPVal,
4133                                                   MachineFunction &MF) const {
4134   Type *CPTy = CPVal->getType();
4135   Align Alignment = MF.getDataLayout().getPrefTypeAlign(CPTy);
4136 
4137   MachineConstantPool *MCP = MF.getConstantPool();
4138   return MCP->getConstantPoolIndex(CPVal, Alignment);
4139 }
4140 
4141 MachineInstr *AArch64InstructionSelector::emitLoadFromConstantPool(
4142     const Constant *CPVal, MachineIRBuilder &MIRBuilder) const {
4143   auto &MF = MIRBuilder.getMF();
4144   unsigned CPIdx = emitConstantPoolEntry(CPVal, MF);
4145 
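       // Address the pool entry with the usual page + page-offset pair, i.e.
       // roughly "adrp xN, <label>" then "ldr <reg>, [xN, :lo12:<label>]"
       // (register and label names elided).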
4146   auto Adrp =
4147       MIRBuilder.buildInstr(AArch64::ADRP, {&AArch64::GPR64RegClass}, {})
4148           .addConstantPoolIndex(CPIdx, 0, AArch64II::MO_PAGE);
4149 
4150   MachineInstr *LoadMI = nullptr;
4151   MachinePointerInfo PtrInfo = MachinePointerInfo::getConstantPool(MF);
4152   unsigned Size = MIRBuilder.getDataLayout().getTypeStoreSize(CPVal->getType());
4153   switch (Size) {
4154   case 16:
4155     LoadMI =
4156         &*MIRBuilder
4157               .buildInstr(AArch64::LDRQui, {&AArch64::FPR128RegClass}, {Adrp})
4158               .addConstantPoolIndex(CPIdx, 0,
4159                                     AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
4160     break;
4161   case 8:
4162     LoadMI =
4163         &*MIRBuilder
4164               .buildInstr(AArch64::LDRDui, {&AArch64::FPR64RegClass}, {Adrp})
4165               .addConstantPoolIndex(CPIdx, 0,
4166                                     AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
4167     break;
4168   case 4:
4169     LoadMI =
4170         &*MIRBuilder
4171               .buildInstr(AArch64::LDRSui, {&AArch64::FPR32RegClass}, {Adrp})
4172               .addConstantPoolIndex(CPIdx, 0,
4173                                     AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
4174     break;
4175   default:
4176     LLVM_DEBUG(dbgs() << "Could not load from constant pool of type "
4177                       << *CPVal->getType());
4178     return nullptr;
4179   }
4180   LoadMI->addMemOperand(MF, MF.getMachineMemOperand(PtrInfo,
4181                                                     MachineMemOperand::MOLoad,
4182                                                     Size, Align(Size)));
4183   constrainSelectedInstRegOperands(*Adrp, TII, TRI, RBI);
4184   constrainSelectedInstRegOperands(*LoadMI, TII, TRI, RBI);
4185   return LoadMI;
4186 }
4187 
4188 /// Return an <Opcode, SubregIndex> pair to do a vector elt insert of a given
4189 /// size and RB.
4190 static std::pair<unsigned, unsigned>
4191 getInsertVecEltOpInfo(const RegisterBank &RB, unsigned EltSize) {
4192   unsigned Opc, SubregIdx;
4193   if (RB.getID() == AArch64::GPRRegBankID) {
4194     if (EltSize == 16) {
4195       Opc = AArch64::INSvi16gpr;
4196       SubregIdx = AArch64::ssub;
4197     } else if (EltSize == 32) {
4198       Opc = AArch64::INSvi32gpr;
4199       SubregIdx = AArch64::ssub;
4200     } else if (EltSize == 64) {
4201       Opc = AArch64::INSvi64gpr;
4202       SubregIdx = AArch64::dsub;
4203     } else {
4204       llvm_unreachable("invalid elt size!");
4205     }
4206   } else {
4207     if (EltSize == 8) {
4208       Opc = AArch64::INSvi8lane;
4209       SubregIdx = AArch64::bsub;
4210     } else if (EltSize == 16) {
4211       Opc = AArch64::INSvi16lane;
4212       SubregIdx = AArch64::hsub;
4213     } else if (EltSize == 32) {
4214       Opc = AArch64::INSvi32lane;
4215       SubregIdx = AArch64::ssub;
4216     } else if (EltSize == 64) {
4217       Opc = AArch64::INSvi64lane;
4218       SubregIdx = AArch64::dsub;
4219     } else {
4220       llvm_unreachable("invalid elt size!");
4221     }
4222   }
4223   return std::make_pair(Opc, SubregIdx);
4224 }
4225 
4226 MachineInstr *AArch64InstructionSelector::emitInstr(
4227     unsigned Opcode, std::initializer_list<llvm::DstOp> DstOps,
4228     std::initializer_list<llvm::SrcOp> SrcOps, MachineIRBuilder &MIRBuilder,
4229     const ComplexRendererFns &RenderFns) const {
4230   assert(Opcode && "Expected an opcode?");
4231   assert(!isPreISelGenericOpcode(Opcode) &&
4232          "Function should only be used to produce selected instructions!");
4233   auto MI = MIRBuilder.buildInstr(Opcode, DstOps, SrcOps);
4234   if (RenderFns)
4235     for (auto &Fn : *RenderFns)
4236       Fn(MI);
4237   constrainSelectedInstRegOperands(*MI, TII, TRI, RBI);
4238   return &*MI;
4239 }
4240 
4241 MachineInstr *AArch64InstructionSelector::emitAddSub(
4242     const std::array<std::array<unsigned, 2>, 5> &AddrModeAndSizeToOpcode,
4243     Register Dst, MachineOperand &LHS, MachineOperand &RHS,
4244     MachineIRBuilder &MIRBuilder) const {
4245   MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4246   assert(LHS.isReg() && RHS.isReg() && "Expected register operands?");
4247   auto Ty = MRI.getType(LHS.getReg());
4248   assert(!Ty.isVector() && "Expected a scalar or pointer?");
4249   unsigned Size = Ty.getSizeInBits();
4250   assert((Size == 32 || Size == 64) && "Expected a 32-bit or 64-bit type only");
4251   bool Is32Bit = Size == 32;
4252 
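       // The opcode table rows are, in order: [0] ri with a positive
       // immediate, [1] rs (shifted register), [2] rr (plain register),
       // [3] ri with the negated immediate, and [4] rx (extended register);
       // indexing with [Is32Bit] picks the W-form over the X-form.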
4253   // INSTRri form with positive arithmetic immediate.
4254   if (auto Fns = selectArithImmed(RHS))
4255     return emitInstr(AddrModeAndSizeToOpcode[0][Is32Bit], {Dst}, {LHS},
4256                      MIRBuilder, Fns);
4257 
4258   // INSTRri form with negative arithmetic immediate.
4259   if (auto Fns = selectNegArithImmed(RHS))
4260     return emitInstr(AddrModeAndSizeToOpcode[3][Is32Bit], {Dst}, {LHS},
4261                      MIRBuilder, Fns);
4262 
4263   // INSTRrx form.
4264   if (auto Fns = selectArithExtendedRegister(RHS))
4265     return emitInstr(AddrModeAndSizeToOpcode[4][Is32Bit], {Dst}, {LHS},
4266                      MIRBuilder, Fns);
4267 
4268   // INSTRrs form.
4269   if (auto Fns = selectShiftedRegister(RHS))
4270     return emitInstr(AddrModeAndSizeToOpcode[1][Is32Bit], {Dst}, {LHS},
4271                      MIRBuilder, Fns);
4272   return emitInstr(AddrModeAndSizeToOpcode[2][Is32Bit], {Dst}, {LHS, RHS},
4273                    MIRBuilder);
4274 }
4275 
4276 MachineInstr *
4277 AArch64InstructionSelector::emitADD(Register DefReg, MachineOperand &LHS,
4278                                     MachineOperand &RHS,
4279                                     MachineIRBuilder &MIRBuilder) const {
4280   const std::array<std::array<unsigned, 2>, 5> OpcTable{
4281       {{AArch64::ADDXri, AArch64::ADDWri},
4282        {AArch64::ADDXrs, AArch64::ADDWrs},
4283        {AArch64::ADDXrr, AArch64::ADDWrr},
4284        {AArch64::SUBXri, AArch64::SUBWri},
4285        {AArch64::ADDXrx, AArch64::ADDWrx}}};
4286   return emitAddSub(OpcTable, DefReg, LHS, RHS, MIRBuilder);
4287 }
4288 
4289 MachineInstr *
4290 AArch64InstructionSelector::emitADDS(Register Dst, MachineOperand &LHS,
4291                                      MachineOperand &RHS,
4292                                      MachineIRBuilder &MIRBuilder) const {
4293   const std::array<std::array<unsigned, 2>, 5> OpcTable{
4294       {{AArch64::ADDSXri, AArch64::ADDSWri},
4295        {AArch64::ADDSXrs, AArch64::ADDSWrs},
4296        {AArch64::ADDSXrr, AArch64::ADDSWrr},
4297        {AArch64::SUBSXri, AArch64::SUBSWri},
4298        {AArch64::ADDSXrx, AArch64::ADDSWrx}}};
4299   return emitAddSub(OpcTable, Dst, LHS, RHS, MIRBuilder);
4300 }
4301 
4302 MachineInstr *
4303 AArch64InstructionSelector::emitSUBS(Register Dst, MachineOperand &LHS,
4304                                      MachineOperand &RHS,
4305                                      MachineIRBuilder &MIRBuilder) const {
4306   const std::array<std::array<unsigned, 2>, 5> OpcTable{
4307       {{AArch64::SUBSXri, AArch64::SUBSWri},
4308        {AArch64::SUBSXrs, AArch64::SUBSWrs},
4309        {AArch64::SUBSXrr, AArch64::SUBSWrr},
4310        {AArch64::ADDSXri, AArch64::ADDSWri},
4311        {AArch64::SUBSXrx, AArch64::SUBSWrx}}};
4312   return emitAddSub(OpcTable, Dst, LHS, RHS, MIRBuilder);
4313 }
4314 
4315 MachineInstr *
4316 AArch64InstructionSelector::emitCMN(MachineOperand &LHS, MachineOperand &RHS,
4317                                     MachineIRBuilder &MIRBuilder) const {
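       // CMN is an ADDS whose result is discarded: emit the ADDS into a fresh
       // virtual register so callers only consume the flags it sets.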
4318   MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4319   bool Is32Bit = (MRI.getType(LHS.getReg()).getSizeInBits() == 32);
4320   auto RC = Is32Bit ? &AArch64::GPR32RegClass : &AArch64::GPR64RegClass;
4321   return emitADDS(MRI.createVirtualRegister(RC), LHS, RHS, MIRBuilder);
4322 }
4323 
4324 MachineInstr *
4325 AArch64InstructionSelector::emitTST(MachineOperand &LHS, MachineOperand &RHS,
4326                                     MachineIRBuilder &MIRBuilder) const {
4327   assert(LHS.isReg() && RHS.isReg() && "Expected register operands?");
4328   MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4329   LLT Ty = MRI.getType(LHS.getReg());
4330   unsigned RegSize = Ty.getSizeInBits();
4331   bool Is32Bit = (RegSize == 32);
4332   const unsigned OpcTable[3][2] = {{AArch64::ANDSXri, AArch64::ANDSWri},
4333                                    {AArch64::ANDSXrs, AArch64::ANDSWrs},
4334                                    {AArch64::ANDSXrr, AArch64::ANDSWrr}};
4335   // ANDS needs a logical immediate for its immediate form. Check if we can
4336   // fold one in.
4337   if (auto ValAndVReg = getConstantVRegValWithLookThrough(RHS.getReg(), MRI)) {
4338     int64_t Imm = ValAndVReg->Value.getSExtValue();
4339 
4340     if (AArch64_AM::isLogicalImmediate(Imm, RegSize)) {
4341       auto TstMI = MIRBuilder.buildInstr(OpcTable[0][Is32Bit], {Ty}, {LHS});
4342       TstMI.addImm(AArch64_AM::encodeLogicalImmediate(Imm, RegSize));
4343       constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI);
4344       return &*TstMI;
4345     }
4346   }
4347 
4348   if (auto Fns = selectLogicalShiftedRegister(RHS))
4349     return emitInstr(OpcTable[1][Is32Bit], {Ty}, {LHS}, MIRBuilder, Fns);
4350   return emitInstr(OpcTable[2][Is32Bit], {Ty}, {LHS, RHS}, MIRBuilder);
4351 }
4352 
4353 MachineInstr *AArch64InstructionSelector::emitIntegerCompare(
4354     MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate,
4355     MachineIRBuilder &MIRBuilder) const {
4356   assert(LHS.isReg() && RHS.isReg() && "Expected LHS and RHS to be registers!");
4357   assert(Predicate.isPredicate() && "Expected predicate?");
4358   MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4359   LLT CmpTy = MRI.getType(LHS.getReg());
4360   assert(!CmpTy.isVector() && "Expected scalar or pointer");
4361   unsigned Size = CmpTy.getSizeInBits();
4362   (void)Size;
4363   assert((Size == 32 || Size == 64) && "Expected a 32-bit or 64-bit LHS/RHS?");
4364   // Fold the compare into a cmn or tst if possible.
4365   if (auto FoldCmp = tryFoldIntegerCompare(LHS, RHS, Predicate, MIRBuilder))
4366     return FoldCmp;
4367   auto Dst = MRI.cloneVirtualRegister(LHS.getReg());
4368   return emitSUBS(Dst, LHS, RHS, MIRBuilder);
4369 }
4370 
4371 MachineInstr *AArch64InstructionSelector::emitCSetForFCmp(
4372     Register Dst, CmpInst::Predicate Pred, MachineIRBuilder &MIRBuilder) const {
4373   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
4374 #ifndef NDEBUG
4375   LLT Ty = MRI.getType(Dst);
4376   assert(!Ty.isVector() && Ty.getSizeInBits() == 32 &&
4377          "Expected a 32-bit scalar register?");
4378 #endif
4379   const Register ZeroReg = AArch64::WZR;
4380   auto EmitCSet = [&](Register CsetDst, AArch64CC::CondCode CC) {
4381     auto CSet =
4382         MIRBuilder.buildInstr(AArch64::CSINCWr, {CsetDst}, {ZeroReg, ZeroReg})
4383             .addImm(getInvertedCondCode(CC));
4384     constrainSelectedInstRegOperands(*CSet, TII, TRI, RBI);
4385     return &*CSet;
4386   };
4387 
4388   AArch64CC::CondCode CC1, CC2;
4389   changeFCMPPredToAArch64CC(Pred, CC1, CC2);
4390   if (CC2 == AArch64CC::AL)
4391     return EmitCSet(Dst, CC1);
4392 
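       // Predicates such as ONE or UEQ need two AArch64 condition codes, so
       // materialize a cset for each and OR the results together.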
4393   const TargetRegisterClass *RC = &AArch64::GPR32RegClass;
4394   Register Def1Reg = MRI.createVirtualRegister(RC);
4395   Register Def2Reg = MRI.createVirtualRegister(RC);
4396   EmitCSet(Def1Reg, CC1);
4397   EmitCSet(Def2Reg, CC2);
4398   auto OrMI = MIRBuilder.buildInstr(AArch64::ORRWrr, {Dst}, {Def1Reg, Def2Reg});
4399   constrainSelectedInstRegOperands(*OrMI, TII, TRI, RBI);
4400   return &*OrMI;
4401 }
4402 
4403 MachineInstr *
4404 AArch64InstructionSelector::emitFPCompare(Register LHS, Register RHS,
4405                                           MachineIRBuilder &MIRBuilder,
4406                                           Optional<CmpInst::Predicate> Pred) const {
4407   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
4408   LLT Ty = MRI.getType(LHS);
4409   if (Ty.isVector())
4410     return nullptr;
4411   unsigned OpSize = Ty.getSizeInBits();
4412   if (OpSize != 32 && OpSize != 64)
4413     return nullptr;
4414 
4415   // If this is a compare against +0.0, then we don't have
4416   // to explicitly materialize a constant.
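       // (e.g. "fcmp s0, #0.0" rather than first loading 0.0 into a register).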
4417   const ConstantFP *FPImm = getConstantFPVRegVal(RHS, MRI);
4418   bool ShouldUseImm = FPImm && (FPImm->isZero() && !FPImm->isNegative());
4419 
4420   auto IsEqualityPred = [](CmpInst::Predicate P) {
4421     return P == CmpInst::FCMP_OEQ || P == CmpInst::FCMP_ONE ||
4422            P == CmpInst::FCMP_UEQ || P == CmpInst::FCMP_UNE;
4423   };
4424   if (!ShouldUseImm && Pred && IsEqualityPred(*Pred)) {
4425     // Try commutating the operands.
4426     const ConstantFP *LHSImm = getConstantFPVRegVal(LHS, MRI);
4427     if (LHSImm && (LHSImm->isZero() && !LHSImm->isNegative())) {
4428       ShouldUseImm = true;
4429       std::swap(LHS, RHS);
4430     }
4431   }
4432   unsigned CmpOpcTbl[2][2] = {{AArch64::FCMPSrr, AArch64::FCMPDrr},
4433                               {AArch64::FCMPSri, AArch64::FCMPDri}};
4434   unsigned CmpOpc = CmpOpcTbl[ShouldUseImm][OpSize == 64];
4435 
4436   // Partially build the compare. Decide if we need to add a use for the
4437   // third operand based off whether or not we're comparing against 0.0.
4438   auto CmpMI = MIRBuilder.buildInstr(CmpOpc).addUse(LHS);
4439   if (!ShouldUseImm)
4440     CmpMI.addUse(RHS);
4441   constrainSelectedInstRegOperands(*CmpMI, TII, TRI, RBI);
4442   return &*CmpMI;
4443 }
4444 
4445 MachineInstr *AArch64InstructionSelector::emitVectorConcat(
4446     Optional<Register> Dst, Register Op1, Register Op2,
4447     MachineIRBuilder &MIRBuilder) const {
4448   // We implement a vector concat by:
4449   // 1. Use scalar_to_vector to insert the lower vector into the larger dest
4450   // 2. Insert the upper vector into the destination's upper element
4451   // TODO: some of this code is common with G_BUILD_VECTOR handling.
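       // A rough sketch for two 64-bit sources (register names invented):
       //   %w1:fpr128 = INSERT_SUBREG (IMPLICIT_DEF), %op1, %subreg.dsub
       //   %w2:fpr128 = INSERT_SUBREG (IMPLICIT_DEF), %op2, %subreg.dsub
       //   %dst       = INSvi64lane %w1, 1, %w2, 0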
4452   MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4453 
4454   const LLT Op1Ty = MRI.getType(Op1);
4455   const LLT Op2Ty = MRI.getType(Op2);
4456 
4457   if (Op1Ty != Op2Ty) {
4458     LLVM_DEBUG(dbgs() << "Could not do vector concat of differing vector tys");
4459     return nullptr;
4460   }
4461   assert(Op1Ty.isVector() && "Expected a vector for vector concat");
4462 
4463   if (Op1Ty.getSizeInBits() >= 128) {
4464     LLVM_DEBUG(dbgs() << "Vector concat not supported for full size vectors");
4465     return nullptr;
4466   }
4467 
4468   // At the moment we just support 64 bit vector concats.
4469   if (Op1Ty.getSizeInBits() != 64) {
4470     LLVM_DEBUG(dbgs() << "Vector concat only supported for 64b vectors");
4471     return nullptr;
4472   }
4473 
4474   const LLT ScalarTy = LLT::scalar(Op1Ty.getSizeInBits());
4475   const RegisterBank &FPRBank = *RBI.getRegBank(Op1, MRI, TRI);
4476   const TargetRegisterClass *DstRC =
4477       getMinClassForRegBank(FPRBank, Op1Ty.getSizeInBits() * 2);
4478 
4479   MachineInstr *WidenedOp1 =
4480       emitScalarToVector(ScalarTy.getSizeInBits(), DstRC, Op1, MIRBuilder);
4481   MachineInstr *WidenedOp2 =
4482       emitScalarToVector(ScalarTy.getSizeInBits(), DstRC, Op2, MIRBuilder);
4483   if (!WidenedOp1 || !WidenedOp2) {
4484     LLVM_DEBUG(dbgs() << "Could not emit a vector from scalar value");
4485     return nullptr;
4486   }
4487 
4488   // Now do the insert of the upper element.
4489   unsigned InsertOpc, InsSubRegIdx;
4490   std::tie(InsertOpc, InsSubRegIdx) =
4491       getInsertVecEltOpInfo(FPRBank, ScalarTy.getSizeInBits());
4492 
4493   if (!Dst)
4494     Dst = MRI.createVirtualRegister(DstRC);
4495   auto InsElt =
4496       MIRBuilder
4497           .buildInstr(InsertOpc, {*Dst}, {WidenedOp1->getOperand(0).getReg()})
4498           .addImm(1) /* Lane index */
4499           .addUse(WidenedOp2->getOperand(0).getReg())
4500           .addImm(0);
4501   constrainSelectedInstRegOperands(*InsElt, TII, TRI, RBI);
4502   return &*InsElt;
4503 }
4504 
4505 MachineInstr *
4506 AArch64InstructionSelector::emitCSetForICMP(Register DefReg, unsigned Pred,
4507                                             MachineIRBuilder &MIRBuilder,
4508                                             Register SrcReg) const {
4509   // CSINC increments the result when the predicate is false. Invert it.
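       // For example, with SrcReg = wzr an "eq" compare becomes
       // "csinc wD, wzr, wzr, ne" (the cset alias): 1 when eq holds, else 0.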
4510   const AArch64CC::CondCode InvCC = changeICMPPredToAArch64CC(
4511       CmpInst::getInversePredicate((CmpInst::Predicate)Pred));
4512   auto I = MIRBuilder.buildInstr(AArch64::CSINCWr, {DefReg}, {SrcReg, SrcReg})
4513                .addImm(InvCC);
4514   constrainSelectedInstRegOperands(*I, TII, TRI, RBI);
4515   return &*I;
4516 }
4517 
4518 std::pair<MachineInstr *, AArch64CC::CondCode>
4519 AArch64InstructionSelector::emitOverflowOp(unsigned Opcode, Register Dst,
4520                                            MachineOperand &LHS,
4521                                            MachineOperand &RHS,
4522                                            MachineIRBuilder &MIRBuilder) const {
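       // Each op maps to a flag-setting add/sub plus the condition under which
       // the overflow result is 1: signed overflow is VS, an unsigned add
       // overflows when the carry is set (HS), and an unsigned sub borrows
       // when the carry is clear (LO).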
4523   switch (Opcode) {
4524   default:
4525     llvm_unreachable("Unexpected opcode!");
4526   case TargetOpcode::G_SADDO:
4527     return std::make_pair(emitADDS(Dst, LHS, RHS, MIRBuilder), AArch64CC::VS);
4528   case TargetOpcode::G_UADDO:
4529     return std::make_pair(emitADDS(Dst, LHS, RHS, MIRBuilder), AArch64CC::HS);
4530   case TargetOpcode::G_SSUBO:
4531     return std::make_pair(emitSUBS(Dst, LHS, RHS, MIRBuilder), AArch64CC::VS);
4532   case TargetOpcode::G_USUBO:
4533     return std::make_pair(emitSUBS(Dst, LHS, RHS, MIRBuilder), AArch64CC::LO);
4534   }
4535 }
4536 
4537 bool AArch64InstructionSelector::tryOptSelect(MachineInstr &I) {
4538   MachineRegisterInfo &MRI = *MIB.getMRI();
4539   // We want to recognize this pattern:
4540   //
4541   // $z = G_FCMP pred, $x, $y
4542   // ...
4543   // $w = G_SELECT $z, $a, $b
4544   //
4545   // Where the value of $z is *only* ever used by the G_SELECT (possibly with
4546   // some copies/truncs in between).
4547   //
4548   // If we see this, then we can emit something like this:
4549   //
4550   // fcmp $x, $y
4551   // fcsel $w, $a, $b, pred
4552   //
4553   // Rather than emitting both of the rather long sequences in the standard
4554   // G_FCMP/G_SELECT select methods.
4555 
4556   // First, check if the condition is defined by a compare.
4557   MachineInstr *CondDef = MRI.getVRegDef(I.getOperand(1).getReg());
4558   while (CondDef) {
4559     // We can only fold if all of the defs have one use.
4560     Register CondDefReg = CondDef->getOperand(0).getReg();
4561     if (!MRI.hasOneNonDBGUse(CondDefReg)) {
4562       // Unless it's another select.
4563       for (const MachineInstr &UI : MRI.use_nodbg_instructions(CondDefReg)) {
4564         if (CondDef == &UI)
4565           continue;
4566         if (UI.getOpcode() != TargetOpcode::G_SELECT)
4567           return false;
4568       }
4569     }
4570 
4571     // We can skip over G_TRUNC since the condition is 1-bit.
4572     // Truncating/extending can have no impact on the value.
4573     unsigned Opc = CondDef->getOpcode();
4574     if (Opc != TargetOpcode::COPY && Opc != TargetOpcode::G_TRUNC)
4575       break;
4576 
4577     // Can't see past copies from physregs.
4578     if (Opc == TargetOpcode::COPY &&
4579         Register::isPhysicalRegister(CondDef->getOperand(1).getReg()))
4580       return false;
4581 
4582     CondDef = MRI.getVRegDef(CondDef->getOperand(1).getReg());
4583   }
4584 
4585   // Is the condition defined by a compare?
4586   if (!CondDef)
4587     return false;
4588 
4589   unsigned CondOpc = CondDef->getOpcode();
4590   if (CondOpc != TargetOpcode::G_ICMP && CondOpc != TargetOpcode::G_FCMP)
4591     return false;
4592 
4593   AArch64CC::CondCode CondCode;
4594   if (CondOpc == TargetOpcode::G_ICMP) {
4595     auto Pred =
4596         static_cast<CmpInst::Predicate>(CondDef->getOperand(1).getPredicate());
4597     CondCode = changeICMPPredToAArch64CC(Pred);
4598     emitIntegerCompare(CondDef->getOperand(2), CondDef->getOperand(3),
4599                        CondDef->getOperand(1), MIB);
4600   } else {
4601     // Get the condition code for the select.
4602     auto Pred =
4603         static_cast<CmpInst::Predicate>(CondDef->getOperand(1).getPredicate());
4604     AArch64CC::CondCode CondCode2;
4605     changeFCMPPredToAArch64CC(Pred, CondCode, CondCode2);
4606 
4607     // changeFCMPPredToAArch64CC sets CondCode2 to AL when we require two
4608     // instructions to emit the comparison.
4609     // TODO: Handle FCMP_UEQ and FCMP_ONE. After that, this check will be
4610     // unnecessary.
4611     if (CondCode2 != AArch64CC::AL)
4612       return false;
4613 
4614     if (!emitFPCompare(CondDef->getOperand(2).getReg(),
4615                        CondDef->getOperand(3).getReg(), MIB)) {
4616       LLVM_DEBUG(dbgs() << "Couldn't emit compare for select!\n");
4617       return false;
4618     }
4619   }
4620 
4621   // Emit the select.
4622   emitSelect(I.getOperand(0).getReg(), I.getOperand(2).getReg(),
4623              I.getOperand(3).getReg(), CondCode, MIB);
4624   I.eraseFromParent();
4625   return true;
4626 }
4627 
4628 MachineInstr *AArch64InstructionSelector::tryFoldIntegerCompare(
4629     MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate,
4630     MachineIRBuilder &MIRBuilder) const {
4631   assert(LHS.isReg() && RHS.isReg() && Predicate.isPredicate() &&
4632          "Unexpected MachineOperand");
4633   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
4634   // We want to find this sort of thing:
4635   // x = G_SUB 0, y
4636   // G_ICMP z, x
4637   //
4638   // In this case, we can fold the G_SUB into the G_ICMP using a CMN instead.
4639   // e.g:
4640   //
4641   // cmn z, y
4642 
4643   // Check if the RHS or LHS of the G_ICMP is defined by a SUB
4644   MachineInstr *LHSDef = getDefIgnoringCopies(LHS.getReg(), MRI);
4645   MachineInstr *RHSDef = getDefIgnoringCopies(RHS.getReg(), MRI);
4646   auto P = static_cast<CmpInst::Predicate>(Predicate.getPredicate());
4647   // Given this:
4648   //
4649   // x = G_SUB 0, y
4650   // G_ICMP x, z
4651   //
4652   // Produce this:
4653   //
4654   // cmn y, z
4655   if (isCMN(LHSDef, P, MRI))
4656     return emitCMN(LHSDef->getOperand(2), RHS, MIRBuilder);
4657 
4658   // Same idea here, but with the RHS of the compare instead:
4659   //
4660   // Given this:
4661   //
4662   // x = G_SUB 0, y
4663   // G_ICMP z, x
4664   //
4665   // Produce this:
4666   //
4667   // cmn z, y
4668   if (isCMN(RHSDef, P, MRI))
4669     return emitCMN(LHS, RHSDef->getOperand(2), MIRBuilder);
4670 
4671   // Given this:
4672   //
4673   // z = G_AND x, y
4674   // G_ICMP z, 0
4675   //
4676   // Produce this if the compare is signed:
4677   //
4678   // tst x, y
4679   if (!CmpInst::isUnsigned(P) && LHSDef &&
4680       LHSDef->getOpcode() == TargetOpcode::G_AND) {
4681     // Make sure that the RHS is 0.
4682     auto ValAndVReg = getConstantVRegValWithLookThrough(RHS.getReg(), MRI);
4683     if (!ValAndVReg || ValAndVReg->Value != 0)
4684       return nullptr;
4685 
4686     return emitTST(LHSDef->getOperand(1),
4687                    LHSDef->getOperand(2), MIRBuilder);
4688   }
4689 
4690   return nullptr;
4691 }
4692 
4693 bool AArch64InstructionSelector::selectShuffleVector(
4694     MachineInstr &I, MachineRegisterInfo &MRI) {
4695   const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
4696   Register Src1Reg = I.getOperand(1).getReg();
4697   const LLT Src1Ty = MRI.getType(Src1Reg);
4698   Register Src2Reg = I.getOperand(2).getReg();
4699   const LLT Src2Ty = MRI.getType(Src2Reg);
4700   ArrayRef<int> Mask = I.getOperand(3).getShuffleMask();
4701 
4702   MachineBasicBlock &MBB = *I.getParent();
4703   MachineFunction &MF = *MBB.getParent();
4704   LLVMContext &Ctx = MF.getFunction().getContext();
4705 
4706   // G_SHUFFLE_VECTOR is weird in that the source operands can be scalars, if
4707   // it's originated from a <1 x T> type. Those should have been lowered into
4708   // G_BUILD_VECTOR earlier.
4709   if (!Src1Ty.isVector() || !Src2Ty.isVector()) {
4710     LLVM_DEBUG(dbgs() << "Could not select a \"scalar\" G_SHUFFLE_VECTOR\n");
4711     return false;
4712   }
4713 
4714   unsigned BytesPerElt = DstTy.getElementType().getSizeInBits() / 8;
4715 
4716   SmallVector<Constant *, 64> CstIdxs;
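       // Worked example: for a <4 x s32> shuffle, BytesPerElt is 4, so a mask
       // entry of 1 contributes byte indices 4, 5, 6, 7 and a mask entry of 0
       // contributes 0, 1, 2, 3.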
4717   for (int Val : Mask) {
4718     // For now, any undef indexes we'll just assume to be 0. This should be
4719     // optimized in future, e.g. to select DUP etc.
4720     Val = Val < 0 ? 0 : Val;
4721     for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
4722       unsigned Offset = Byte + Val * BytesPerElt;
4723       CstIdxs.emplace_back(ConstantInt::get(Type::getInt8Ty(Ctx), Offset));
4724     }
4725   }
4726 
4727   // Use a constant pool to load the index vector for TBL.
4728   Constant *CPVal = ConstantVector::get(CstIdxs);
4729   MachineInstr *IndexLoad = emitLoadFromConstantPool(CPVal, MIB);
4730   if (!IndexLoad) {
4731     LLVM_DEBUG(dbgs() << "Could not load from a constant pool");
4732     return false;
4733   }
4734 
4735   if (DstTy.getSizeInBits() != 128) {
4736     assert(DstTy.getSizeInBits() == 64 && "Unexpected shuffle result ty");
4737     // This case can be done with TBL1.
4738     MachineInstr *Concat = emitVectorConcat(None, Src1Reg, Src2Reg, MIB);
4739     if (!Concat) {
4740       LLVM_DEBUG(dbgs() << "Could not do vector concat for tbl1");
4741       return false;
4742     }
4743 
4744     // The constant pool load will be 64 bits, so need to convert to FPR128 reg.
4745     IndexLoad = emitScalarToVector(64, &AArch64::FPR128RegClass,
4746                                    IndexLoad->getOperand(0).getReg(), MIB);
4747 
4748     auto TBL1 = MIB.buildInstr(
4749         AArch64::TBLv16i8One, {&AArch64::FPR128RegClass},
4750         {Concat->getOperand(0).getReg(), IndexLoad->getOperand(0).getReg()});
4751     constrainSelectedInstRegOperands(*TBL1, TII, TRI, RBI);
4752 
4753     auto Copy =
4754         MIB.buildInstr(TargetOpcode::COPY, {I.getOperand(0).getReg()}, {})
4755             .addReg(TBL1.getReg(0), 0, AArch64::dsub);
4756     RBI.constrainGenericRegister(Copy.getReg(0), AArch64::FPR64RegClass, MRI);
4757     I.eraseFromParent();
4758     return true;
4759   }
4760 
4761   // For TBL2 we need to emit a REG_SEQUENCE to tie together two consecutive
4762   // Q registers for regalloc.
4763   SmallVector<Register, 2> Regs = {Src1Reg, Src2Reg};
4764   auto RegSeq = createQTuple(Regs, MIB);
4765   auto TBL2 = MIB.buildInstr(AArch64::TBLv16i8Two, {I.getOperand(0)},
4766                              {RegSeq, IndexLoad->getOperand(0)});
4767   constrainSelectedInstRegOperands(*TBL2, TII, TRI, RBI);
4768   I.eraseFromParent();
4769   return true;
4770 }
4771 
4772 MachineInstr *AArch64InstructionSelector::emitLaneInsert(
4773     Optional<Register> DstReg, Register SrcReg, Register EltReg,
4774     unsigned LaneIdx, const RegisterBank &RB,
4775     MachineIRBuilder &MIRBuilder) const {
4776   MachineInstr *InsElt = nullptr;
4777   const TargetRegisterClass *DstRC = &AArch64::FPR128RegClass;
4778   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
4779 
4780   // Create a register to define with the insert if one wasn't passed in.
4781   if (!DstReg)
4782     DstReg = MRI.createVirtualRegister(DstRC);
4783 
4784   unsigned EltSize = MRI.getType(EltReg).getSizeInBits();
4785   unsigned Opc = getInsertVecEltOpInfo(RB, EltSize).first;
4786 
4787   if (RB.getID() == AArch64::FPRRegBankID) {
4788     auto InsSub = emitScalarToVector(EltSize, DstRC, EltReg, MIRBuilder);
4789     InsElt = MIRBuilder.buildInstr(Opc, {*DstReg}, {SrcReg})
4790                  .addImm(LaneIdx)
4791                  .addUse(InsSub->getOperand(0).getReg())
4792                  .addImm(0);
4793   } else {
4794     InsElt = MIRBuilder.buildInstr(Opc, {*DstReg}, {SrcReg})
4795                  .addImm(LaneIdx)
4796                  .addUse(EltReg);
4797   }
4798 
4799   constrainSelectedInstRegOperands(*InsElt, TII, TRI, RBI);
4800   return InsElt;
4801 }
4802 
4803 bool AArch64InstructionSelector::selectInsertElt(MachineInstr &I,
4804                                                  MachineRegisterInfo &MRI) {
4805   assert(I.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT);
4806 
4807   // Get information on the destination.
4808   Register DstReg = I.getOperand(0).getReg();
4809   const LLT DstTy = MRI.getType(DstReg);
4810   unsigned VecSize = DstTy.getSizeInBits();
4811 
4812   // Get information on the element we want to insert into the destination.
4813   Register EltReg = I.getOperand(2).getReg();
4814   const LLT EltTy = MRI.getType(EltReg);
4815   unsigned EltSize = EltTy.getSizeInBits();
4816   if (EltSize < 16 || EltSize > 64)
4817     return false; // Don't support all element types yet.
4818 
4819   // Find the definition of the index. Bail out if it's not defined by a
4820   // G_CONSTANT.
4821   Register IdxReg = I.getOperand(3).getReg();
4822   auto VRegAndVal = getConstantVRegValWithLookThrough(IdxReg, MRI);
4823   if (!VRegAndVal)
4824     return false;
4825   unsigned LaneIdx = VRegAndVal->Value.getSExtValue();
4826 
4827   // Perform the lane insert.
4828   Register SrcReg = I.getOperand(1).getReg();
4829   const RegisterBank &EltRB = *RBI.getRegBank(EltReg, MRI, TRI);
4830 
4831   if (VecSize < 128) {
4832     // If the vector we're inserting into is smaller than 128 bits, widen it
4833     // to 128 to do the insert.
4834     MachineInstr *ScalarToVec =
4835         emitScalarToVector(VecSize, &AArch64::FPR128RegClass, SrcReg, MIB);
4836     if (!ScalarToVec)
4837       return false;
4838     SrcReg = ScalarToVec->getOperand(0).getReg();
4839   }
4840 
4841   // Create an insert into a new FPR128 register.
4842   // Note that if our vector is already 128 bits, we end up emitting an extra
4843   // register.
4844   MachineInstr *InsMI =
4845       emitLaneInsert(None, SrcReg, EltReg, LaneIdx, EltRB, MIB);
4846 
4847   if (VecSize < 128) {
4848     // If we had to widen to perform the insert, then we have to demote back to
4849     // the original size to get the result we want.
4850     Register DemoteVec = InsMI->getOperand(0).getReg();
4851     const TargetRegisterClass *RC =
4852         getMinClassForRegBank(*RBI.getRegBank(DemoteVec, MRI, TRI), VecSize);
4853     if (RC != &AArch64::FPR32RegClass && RC != &AArch64::FPR64RegClass) {
4854       LLVM_DEBUG(dbgs() << "Unsupported register class!\n");
4855       return false;
4856     }
4857     unsigned SubReg = 0;
4858     if (!getSubRegForClass(RC, TRI, SubReg))
4859       return false;
4860     if (SubReg != AArch64::ssub && SubReg != AArch64::dsub) {
4861       LLVM_DEBUG(dbgs() << "Unsupported destination size! (" << VecSize
4862                         << ")\n");
4863       return false;
4864     }
4865     MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {})
4866         .addReg(DemoteVec, 0, SubReg);
4867     RBI.constrainGenericRegister(DstReg, *RC, MRI);
4868   } else {
4869     // No widening needed.
4870     InsMI->getOperand(0).setReg(DstReg);
4871     constrainSelectedInstRegOperands(*InsMI, TII, TRI, RBI);
4872   }
4873 
4874   I.eraseFromParent();
4875   return true;
4876 }
4877 
4878 MachineInstr *
4879 AArch64InstructionSelector::emitConstantVector(Register Dst, Constant *CV,
4880                                                MachineIRBuilder &MIRBuilder,
4881                                                MachineRegisterInfo &MRI) {
4882   LLT DstTy = MRI.getType(Dst);
4883   unsigned DstSize = DstTy.getSizeInBits();
4884   if (CV->isNullValue()) {
4885     if (DstSize == 128) {
4886       auto Mov =
4887           MIRBuilder.buildInstr(AArch64::MOVIv2d_ns, {Dst}, {}).addImm(0);
4888       constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI);
4889       return &*Mov;
4890     }
4891 
4892     if (DstSize == 64) {
4893       auto Mov =
4894           MIRBuilder
4895               .buildInstr(AArch64::MOVIv2d_ns, {&AArch64::FPR128RegClass}, {})
4896               .addImm(0);
4897       auto Copy = MIRBuilder.buildInstr(TargetOpcode::COPY, {Dst}, {})
4898                       .addReg(Mov.getReg(0), 0, AArch64::dsub);
4899       RBI.constrainGenericRegister(Dst, AArch64::FPR64RegClass, MRI);
4900       return &*Copy;
4901     }
4902   }
4903 
4904   auto *CPLoad = emitLoadFromConstantPool(CV, MIRBuilder);
4905   if (!CPLoad) {
4906     LLVM_DEBUG(dbgs() << "Could not generate cp load for constant vector!");
4907     return nullptr;
4908   }
4909 
4910   auto Copy = MIRBuilder.buildCopy(Dst, CPLoad->getOperand(0));
4911   RBI.constrainGenericRegister(
4912       Dst, *MRI.getRegClass(CPLoad->getOperand(0).getReg()), MRI);
4913   return &*Copy;
4914 }
4915 
4916 bool AArch64InstructionSelector::tryOptConstantBuildVec(
4917     MachineInstr &I, LLT DstTy, MachineRegisterInfo &MRI) {
4918   assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR);
4919   unsigned DstSize = DstTy.getSizeInBits();
4920   assert(DstSize <= 128 && "Unexpected build_vec type!");
4921   if (DstSize < 32)
4922     return false;
4923   // Check if we're building a constant vector, in which case we want to
4924   // generate a constant pool load instead of a vector insert sequence.
4925   SmallVector<Constant *, 16> Csts;
4926   for (unsigned Idx = 1; Idx < I.getNumOperands(); ++Idx) {
4927     // Try to find G_CONSTANT or G_FCONSTANT
4928     auto *OpMI =
4929         getOpcodeDef(TargetOpcode::G_CONSTANT, I.getOperand(Idx).getReg(), MRI);
4930     if (OpMI)
4931       Csts.emplace_back(
4932           const_cast<ConstantInt *>(OpMI->getOperand(1).getCImm()));
4933     else if ((OpMI = getOpcodeDef(TargetOpcode::G_FCONSTANT,
4934                                   I.getOperand(Idx).getReg(), MRI)))
4935       Csts.emplace_back(
4936           const_cast<ConstantFP *>(OpMI->getOperand(1).getFPImm()));
4937     else
4938       return false;
4939   }
4940   Constant *CV = ConstantVector::get(Csts);
4941   if (!emitConstantVector(I.getOperand(0).getReg(), CV, MIB, MRI))
4942     return false;
4943   I.eraseFromParent();
4944   return true;
4945 }
4946 
4947 bool AArch64InstructionSelector::selectBuildVector(MachineInstr &I,
4948                                                    MachineRegisterInfo &MRI) {
4949   assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR);
4950   // Until we port more of the optimized selections, for now just use a vector
4951   // insert sequence.
4952   const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
4953   const LLT EltTy = MRI.getType(I.getOperand(1).getReg());
4954   unsigned EltSize = EltTy.getSizeInBits();
4955 
4956   if (tryOptConstantBuildVec(I, DstTy, MRI))
4957     return true;
4958   if (EltSize < 16 || EltSize > 64)
4959     return false; // Don't support all element types yet.
4960   const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI);
4961 
4962   const TargetRegisterClass *DstRC = &AArch64::FPR128RegClass;
4963   MachineInstr *ScalarToVec =
4964       emitScalarToVector(DstTy.getElementType().getSizeInBits(), DstRC,
4965                          I.getOperand(1).getReg(), MIB);
4966   if (!ScalarToVec)
4967     return false;
4968 
4969   Register DstVec = ScalarToVec->getOperand(0).getReg();
4970   unsigned DstSize = DstTy.getSizeInBits();
4971 
4972   // Keep track of the last MI we inserted. Later on, we might be able to save
4973   // a copy using it.
4974   MachineInstr *PrevMI = nullptr;
4975   for (unsigned i = 2, e = DstSize / EltSize + 1; i < e; ++i) {
4976     // Note that if we don't do a subregister copy, we can end up making an
4977     // extra register.
4978     PrevMI = &*emitLaneInsert(None, DstVec, I.getOperand(i).getReg(), i - 1, RB,
4979                               MIB);
4980     DstVec = PrevMI->getOperand(0).getReg();
4981   }
4982 
4983   // If DstTy's size in bits is less than 128, then emit a subregister copy
4984   // from DstVec to the last register we've defined.
4985   if (DstSize < 128) {
4986     // Force this to be FPR using the destination vector.
4987     const TargetRegisterClass *RC =
4988         getMinClassForRegBank(*RBI.getRegBank(DstVec, MRI, TRI), DstSize);
4989     if (!RC)
4990       return false;
4991     if (RC != &AArch64::FPR32RegClass && RC != &AArch64::FPR64RegClass) {
4992       LLVM_DEBUG(dbgs() << "Unsupported register class!\n");
4993       return false;
4994     }
4995 
4996     unsigned SubReg = 0;
4997     if (!getSubRegForClass(RC, TRI, SubReg))
4998       return false;
4999     if (SubReg != AArch64::ssub && SubReg != AArch64::dsub) {
5000       LLVM_DEBUG(dbgs() << "Unsupported destination size! (" << DstSize
5001                         << ")\n");
5002       return false;
5003     }
5004 
5005     Register Reg = MRI.createVirtualRegister(RC);
5006     Register DstReg = I.getOperand(0).getReg();
5007 
5008     MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {}).addReg(DstVec, 0, SubReg);
5009     MachineOperand &RegOp = I.getOperand(1);
5010     RegOp.setReg(Reg);
5011     RBI.constrainGenericRegister(DstReg, *RC, MRI);
5012   } else {
5013     // We don't need a subregister copy. Save a copy by re-using the
5014     // destination register on the final insert.
5015     assert(PrevMI && "PrevMI was null?");
5016     PrevMI->getOperand(0).setReg(I.getOperand(0).getReg());
5017     constrainSelectedInstRegOperands(*PrevMI, TII, TRI, RBI);
5018   }
5019 
5020   I.eraseFromParent();
5021   return true;
5022 }
5023 
5024 /// Helper function to find an intrinsic ID on a MachineInstr. Returns the
5025 /// ID if it exists, and 0 otherwise.
5026 static unsigned findIntrinsicID(MachineInstr &I) {
5027   auto IntrinOp = find_if(I.operands(), [&](const MachineOperand &Op) {
5028     return Op.isIntrinsicID();
5029   });
5030   if (IntrinOp == I.operands_end())
5031     return 0;
5032   return IntrinOp->getIntrinsicID();
5033 }
5034 
5035 bool AArch64InstructionSelector::selectIntrinsicWithSideEffects(
5036     MachineInstr &I, MachineRegisterInfo &MRI) {
5037   // Find the intrinsic ID.
5038   unsigned IntrinID = findIntrinsicID(I);
5039   if (!IntrinID)
5040     return false;
5041 
5042   // Select the instruction.
5043   switch (IntrinID) {
5044   default:
5045     return false;
5046   case Intrinsic::aarch64_ldxp:
5047   case Intrinsic::aarch64_ldaxp: {
5048     auto NewI = MIB.buildInstr(
5049         IntrinID == Intrinsic::aarch64_ldxp ? AArch64::LDXPX : AArch64::LDAXPX,
5050         {I.getOperand(0).getReg(), I.getOperand(1).getReg()},
5051         {I.getOperand(3)});
5052     NewI.cloneMemRefs(I);
5053     constrainSelectedInstRegOperands(*NewI, TII, TRI, RBI);
5054     break;
5055   }
5056   case Intrinsic::trap:
5057     MIB.buildInstr(AArch64::BRK, {}, {}).addImm(1);
5058     break;
5059   case Intrinsic::debugtrap:
5060     MIB.buildInstr(AArch64::BRK, {}, {}).addImm(0xF000);
5061     break;
5062   case Intrinsic::ubsantrap:
5063     MIB.buildInstr(AArch64::BRK, {}, {})
5064         .addImm(I.getOperand(1).getImm() | ('U' << 8));
5065     break;
5066   case Intrinsic::aarch64_neon_st2: {
5067     Register Src1 = I.getOperand(1).getReg();
5068     Register Src2 = I.getOperand(2).getReg();
5069     Register Ptr = I.getOperand(3).getReg();
5070     LLT Ty = MRI.getType(Src1);
5071     const LLT S8 = LLT::scalar(8);
5072     const LLT S16 = LLT::scalar(16);
5073     const LLT S32 = LLT::scalar(32);
5074     const LLT S64 = LLT::scalar(64);
5075     const LLT P0 = LLT::pointer(0, 64);
5076     unsigned Opc;
5077     if (Ty == LLT::fixed_vector(8, S8))
5078       Opc = AArch64::ST2Twov8b;
5079     else if (Ty == LLT::fixed_vector(16, S8))
5080       Opc = AArch64::ST2Twov16b;
5081     else if (Ty == LLT::fixed_vector(4, S16))
5082       Opc = AArch64::ST2Twov4h;
5083     else if (Ty == LLT::fixed_vector(8, S16))
5084       Opc = AArch64::ST2Twov8h;
5085     else if (Ty == LLT::fixed_vector(2, S32))
5086       Opc = AArch64::ST2Twov2s;
5087     else if (Ty == LLT::fixed_vector(4, S32))
5088       Opc = AArch64::ST2Twov4s;
5089     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
5090       Opc = AArch64::ST2Twov2d;
5091     else if (Ty == S64 || Ty == P0)
5092       Opc = AArch64::ST1Twov1d;
5093     else
5094       llvm_unreachable("Unexpected type for st2!");
5095     SmallVector<Register, 2> Regs = {Src1, Src2};
5096     Register Tuple = Ty.getSizeInBits() == 128 ? createQTuple(Regs, MIB)
5097                                                : createDTuple(Regs, MIB);
5098     auto Store = MIB.buildInstr(Opc, {}, {Tuple, Ptr});
5099     Store.cloneMemRefs(I);
5100     constrainSelectedInstRegOperands(*Store, TII, TRI, RBI);
5101     break;
5102   }
5103   }
5104 
5105   I.eraseFromParent();
5106   return true;
5107 }
5108 
5109 bool AArch64InstructionSelector::selectIntrinsic(MachineInstr &I,
5110                                                  MachineRegisterInfo &MRI) {
5111   unsigned IntrinID = findIntrinsicID(I);
5112   if (!IntrinID)
5113     return false;
5114 
5115   switch (IntrinID) {
5116   default:
5117     break;
5118   case Intrinsic::aarch64_crypto_sha1h: {
5119     Register DstReg = I.getOperand(0).getReg();
5120     Register SrcReg = I.getOperand(2).getReg();
5121 
5122     // FIXME: Should this be an assert?
5123     if (MRI.getType(DstReg).getSizeInBits() != 32 ||
5124         MRI.getType(SrcReg).getSizeInBits() != 32)
5125       return false;
5126 
5127     // The operation has to happen on FPRs. Set up some new FPR registers for
5128     // the source and destination if they are on GPRs.
5129     if (RBI.getRegBank(SrcReg, MRI, TRI)->getID() != AArch64::FPRRegBankID) {
5130       SrcReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass);
5131       MIB.buildCopy({SrcReg}, {I.getOperand(2)});
5132 
5133       // Make sure the copy ends up getting constrained properly.
5134       RBI.constrainGenericRegister(I.getOperand(2).getReg(),
5135                                    AArch64::GPR32RegClass, MRI);
5136     }
5137 
5138     if (RBI.getRegBank(DstReg, MRI, TRI)->getID() != AArch64::FPRRegBankID)
5139       DstReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass);
5140 
5141     // Actually insert the instruction.
5142     auto SHA1Inst = MIB.buildInstr(AArch64::SHA1Hrr, {DstReg}, {SrcReg});
5143     constrainSelectedInstRegOperands(*SHA1Inst, TII, TRI, RBI);
5144 
5145     // Did we create a new register for the destination?
5146     if (DstReg != I.getOperand(0).getReg()) {
5147       // Yep. Copy the result of the instruction back into the original
5148       // destination.
5149       MIB.buildCopy({I.getOperand(0)}, {DstReg});
5150       RBI.constrainGenericRegister(I.getOperand(0).getReg(),
5151                                    AArch64::GPR32RegClass, MRI);
5152     }
5153 
5154     I.eraseFromParent();
5155     return true;
5156   }
5157   case Intrinsic::frameaddress:
5158   case Intrinsic::returnaddress: {
5159     MachineFunction &MF = *I.getParent()->getParent();
5160     MachineFrameInfo &MFI = MF.getFrameInfo();
5161 
5162     unsigned Depth = I.getOperand(2).getImm();
5163     Register DstReg = I.getOperand(0).getReg();
5164     RBI.constrainGenericRegister(DstReg, AArch64::GPR64RegClass, MRI);
5165 
5166     if (Depth == 0 && IntrinID == Intrinsic::returnaddress) {
5167       if (!MFReturnAddr) {
5168         // Insert the copy from LR/X30 into the entry block, before it can be
5169         // clobbered by anything.
5170         MFI.setReturnAddressIsTaken(true);
5171         MFReturnAddr = getFunctionLiveInPhysReg(MF, TII, AArch64::LR,
5172                                                 AArch64::GPR64RegClass);
5173       }
5174 
5175       if (STI.hasPAuth()) {
5176         MIB.buildInstr(AArch64::XPACI, {DstReg}, {MFReturnAddr});
5177       } else {
5178         MIB.buildCopy({Register(AArch64::LR)}, {MFReturnAddr});
5179         MIB.buildInstr(AArch64::XPACLRI);
5180         MIB.buildCopy({DstReg}, {Register(AArch64::LR)});
5181       }
5182 
5183       I.eraseFromParent();
5184       return true;
5185     }
5186 
5187     MFI.setFrameAddressIsTaken(true);
5188     Register FrameAddr(AArch64::FP);
5189     while (Depth--) {
5190       Register NextFrame = MRI.createVirtualRegister(&AArch64::GPR64spRegClass);
5191       auto Ldr =
5192           MIB.buildInstr(AArch64::LDRXui, {NextFrame}, {FrameAddr}).addImm(0);
5193       constrainSelectedInstRegOperands(*Ldr, TII, TRI, RBI);
5194       FrameAddr = NextFrame;
5195     }
5196 
5197     if (IntrinID == Intrinsic::frameaddress)
5198       MIB.buildCopy({DstReg}, {FrameAddr});
5199     else {
5200       MFI.setReturnAddressIsTaken(true);
5201 
5202       if (STI.hasPAuth()) {
5203         Register TmpReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
5204         MIB.buildInstr(AArch64::LDRXui, {TmpReg}, {FrameAddr}).addImm(1);
5205         MIB.buildInstr(AArch64::XPACI, {DstReg}, {TmpReg});
5206       } else {
5207         MIB.buildInstr(AArch64::LDRXui, {Register(AArch64::LR)}, {FrameAddr})
5208             .addImm(1);
5209         MIB.buildInstr(AArch64::XPACLRI);
5210         MIB.buildCopy({DstReg}, {Register(AArch64::LR)});
5211       }
5212     }
5213 
5214     I.eraseFromParent();
5215     return true;
5216   }
5217   case Intrinsic::swift_async_context_addr:
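    // The Swift async context is stored just below the frame record, so its
    // address is FP - 8 (hence the SUBXri below).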
5218     auto Sub = MIB.buildInstr(AArch64::SUBXri, {I.getOperand(0).getReg()},
5219                               {Register(AArch64::FP)})
5220                    .addImm(8)
5221                    .addImm(0);
5222     constrainSelectedInstRegOperands(*Sub, TII, TRI, RBI);
5223 
5224     MF->getFrameInfo().setFrameAddressIsTaken(true);
5225     MF->getInfo<AArch64FunctionInfo>()->setHasSwiftAsyncContext(true);
5226     I.eraseFromParent();
5227     return true;
5228   }
5229   return false;
5230 }
5231 
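// The selectShift{A,B}_{32,64} renderers below produce the two bitfield-move
// immediates used when a constant left shift is selected as UBFM/SBFM: e.g.
// "lsl w0, w1, #N" is equivalent to "ubfm w0, w1, #((32 - N) & 31), #(31 - N)",
// with the A renderer providing the first immediate and B the second.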
5232 InstructionSelector::ComplexRendererFns
5233 AArch64InstructionSelector::selectShiftA_32(const MachineOperand &Root) const {
5234   auto MaybeImmed = getImmedFromMO(Root);
5235   if (MaybeImmed == None || *MaybeImmed > 31)
5236     return None;
5237   uint64_t Enc = (32 - *MaybeImmed) & 0x1f;
5238   return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}};
5239 }
5240 
5241 InstructionSelector::ComplexRendererFns
5242 AArch64InstructionSelector::selectShiftB_32(const MachineOperand &Root) const {
5243   auto MaybeImmed = getImmedFromMO(Root);
5244   if (MaybeImmed == None || *MaybeImmed > 31)
5245     return None;
5246   uint64_t Enc = 31 - *MaybeImmed;
5247   return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}};
5248 }
5249 
5250 InstructionSelector::ComplexRendererFns
5251 AArch64InstructionSelector::selectShiftA_64(const MachineOperand &Root) const {
5252   auto MaybeImmed = getImmedFromMO(Root);
5253   if (MaybeImmed == None || *MaybeImmed > 63)
5254     return None;
5255   uint64_t Enc = (64 - *MaybeImmed) & 0x3f;
5256   return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}};
5257 }
5258 
5259 InstructionSelector::ComplexRendererFns
5260 AArch64InstructionSelector::selectShiftB_64(const MachineOperand &Root) const {
5261   auto MaybeImmed = getImmedFromMO(Root);
5262   if (MaybeImmed == None || *MaybeImmed > 63)
5263     return None;
5264   uint64_t Enc = 63 - *MaybeImmed;
5265   return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}};
5266 }
5267 
5268 /// Helper to select an immediate value that can be represented as a 12-bit
5269 /// value shifted left by either 0 or 12. If it is possible to do so, return
5270 /// the immediate and shift value. If not, return None.
5271 ///
5272 /// Used by selectArithImmed and selectNegArithImmed.
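///
/// For example, 0xabc (shift 0) and 0xabc000 (shift 12) are both selectable,
/// while 0xabc123 is not.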
5273 InstructionSelector::ComplexRendererFns
5274 AArch64InstructionSelector::select12BitValueWithLeftShift(
5275     uint64_t Immed) const {
5276   unsigned ShiftAmt;
5277   if (Immed >> 12 == 0) {
5278     ShiftAmt = 0;
5279   } else if ((Immed & 0xfff) == 0 && Immed >> 24 == 0) {
5280     ShiftAmt = 12;
5281     Immed = Immed >> 12;
5282   } else
5283     return None;
5284 
5285   unsigned ShVal = AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt);
5286   return {{
5287       [=](MachineInstrBuilder &MIB) { MIB.addImm(Immed); },
5288       [=](MachineInstrBuilder &MIB) { MIB.addImm(ShVal); },
5289   }};
5290 }
5291 
5292 /// SelectArithImmed - Select an immediate value that can be represented as
5293 /// a 12-bit value shifted left by either 0 or 12.  If so, return true with
5294 /// Val set to the 12-bit value and Shift set to the shifter operand.
5295 InstructionSelector::ComplexRendererFns
5296 AArch64InstructionSelector::selectArithImmed(MachineOperand &Root) const {
5297   // This function is called from the addsub_shifted_imm ComplexPattern,
5298   // which lists [imm] as the list of opcodes it's interested in; however,
5299   // we still need to check whether the operand is actually an immediate
5300   // here because the ComplexPattern opcode list is only used in
5301   // root-level opcode matching.
5302   auto MaybeImmed = getImmedFromMO(Root);
5303   if (MaybeImmed == None)
5304     return None;
5305   return select12BitValueWithLeftShift(*MaybeImmed);
5306 }
5307 
5308 /// SelectNegArithImmed - As above, but negates the value before trying to
5309 /// select it.
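/// This allows, e.g., "cmp x0, #-42" to be selected as "cmn x0, #42".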
5310 InstructionSelector::ComplexRendererFns
5311 AArch64InstructionSelector::selectNegArithImmed(MachineOperand &Root) const {
5312   // We need a register here, because we need to know if we have a 64 or 32
5313   // bit immediate.
5314   if (!Root.isReg())
5315     return None;
5316   auto MaybeImmed = getImmedFromMO(Root);
5317   if (MaybeImmed == None)
5318     return None;
5319   uint64_t Immed = *MaybeImmed;
5320 
5321   // This negation is almost always valid, but "cmp wN, #0" and "cmn wN, #0"
5322   // have the opposite effect on the C flag, so this pattern mustn't match under
5323   // those circumstances.
5324   if (Immed == 0)
5325     return None;
5326 
5327   // Check if we're dealing with a 32-bit type on the root or a 64-bit type on
5328   // the root.
5329   MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
5330   if (MRI.getType(Root.getReg()).getSizeInBits() == 32)
5331     Immed = ~((uint32_t)Immed) + 1;
5332   else
5333     Immed = ~Immed + 1ULL;
5334 
5335   if (Immed & 0xFFFFFFFFFF000000ULL)
5336     return None;
5337 
5338   Immed &= 0xFFFFFFULL;
5339   return select12BitValueWithLeftShift(Immed);
5340 }
5341 
5342 /// Return true if it is worth folding MI into an extended register. That is,
5343 /// if it's safe to pull it into the addressing mode of a load or store as a
5344 /// shift.
5345 bool AArch64InstructionSelector::isWorthFoldingIntoExtendedReg(
5346     MachineInstr &MI, const MachineRegisterInfo &MRI) const {
5347   // Always fold if there is one use, or if we're optimizing for size.
5348   Register DefReg = MI.getOperand(0).getReg();
5349   if (MRI.hasOneNonDBGUse(DefReg) ||
5350       MI.getParent()->getParent()->getFunction().hasOptSize())
5351     return true;
5352 
5353   // It's better to avoid folding and recomputing shifts when we don't have a
5354   // fastpath.
5355   if (!STI.hasLSLFast())
5356     return false;
5357 
5358   // We have a fastpath, so folding a shift in and potentially computing it
5359   // many times may be beneficial. Check if this is only used in memory ops.
5360   // If it is, then we should fold.
5361   return all_of(MRI.use_nodbg_instructions(DefReg),
5362                 [](MachineInstr &Use) { return Use.mayLoadOrStore(); });
5363 }
5364 
5365 static bool isSignExtendShiftType(AArch64_AM::ShiftExtendType Type) {
5366   switch (Type) {
5367   case AArch64_AM::SXTB:
5368   case AArch64_AM::SXTH:
5369   case AArch64_AM::SXTW:
5370     return true;
5371   default:
5372     return false;
5373   }
5374 }
5375 
5376 InstructionSelector::ComplexRendererFns
5377 AArch64InstructionSelector::selectExtendedSHL(
5378     MachineOperand &Root, MachineOperand &Base, MachineOperand &Offset,
5379     unsigned SizeInBytes, bool WantsExt) const {
5380   assert(Base.isReg() && "Expected base to be a register operand");
5381   assert(Offset.isReg() && "Expected offset to be a register operand");
5382 
5383   MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
5384   MachineInstr *OffsetInst = MRI.getVRegDef(Offset.getReg());
5385   if (!OffsetInst)
5386     return None;
5387 
5388   unsigned OffsetOpc = OffsetInst->getOpcode();
5389   bool LookedThroughZExt = false;
5390   if (OffsetOpc != TargetOpcode::G_SHL && OffsetOpc != TargetOpcode::G_MUL) {
5391     // Try to look through a ZEXT.
5392     if (OffsetOpc != TargetOpcode::G_ZEXT || !WantsExt)
5393       return None;
5394 
5395     OffsetInst = MRI.getVRegDef(OffsetInst->getOperand(1).getReg());
5396     OffsetOpc = OffsetInst->getOpcode();
5397     LookedThroughZExt = true;
5398 
5399     if (OffsetOpc != TargetOpcode::G_SHL && OffsetOpc != TargetOpcode::G_MUL)
5400       return None;
5401   }
5402   // Make sure that the memory op is a valid size.
5403   int64_t LegalShiftVal = Log2_32(SizeInBytes);
5404   if (LegalShiftVal == 0)
5405     return None;
5406   if (!isWorthFoldingIntoExtendedReg(*OffsetInst, MRI))
5407     return None;
5408 
5409   // Now, try to find the specific G_CONSTANT. Start by assuming that the
5410   // register we will offset is the LHS, and the register containing the
5411   // constant is the RHS.
5412   Register OffsetReg = OffsetInst->getOperand(1).getReg();
5413   Register ConstantReg = OffsetInst->getOperand(2).getReg();
5414   auto ValAndVReg = getConstantVRegValWithLookThrough(ConstantReg, MRI);
5415   if (!ValAndVReg) {
5416     // We didn't get a constant on the RHS. If the opcode is a shift, then
5417     // we're done.
5418     if (OffsetOpc == TargetOpcode::G_SHL)
5419       return None;
5420 
5421     // If we have a G_MUL, we can use either register. Try looking at the RHS.
5422     std::swap(OffsetReg, ConstantReg);
5423     ValAndVReg = getConstantVRegValWithLookThrough(ConstantReg, MRI);
5424     if (!ValAndVReg)
5425       return None;
5426   }
5427 
5428   // The value must fit into 3 bits, and must be positive. Make sure that is
5429   // true.
5430   int64_t ImmVal = ValAndVReg->Value.getSExtValue();
5431 
5432   // Since we're going to pull this into a shift, the constant value must be
5433   // a power of 2. If we got a multiply, then we need to check this.
5434   if (OffsetOpc == TargetOpcode::G_MUL) {
5435     if (!isPowerOf2_32(ImmVal))
5436       return None;
5437 
5438     // Got a power of 2. So, the amount we'll shift is the log base-2 of that.
5439     ImmVal = Log2_32(ImmVal);
5440   }
5441 
5442   if ((ImmVal & 0x7) != ImmVal)
5443     return None;
5444 
5445   // We are only allowed to shift by LegalShiftVal. This shift value is built
5446   // into the instruction, so we can't just use whatever we want.
5447   if (ImmVal != LegalShiftVal)
5448     return None;
5449 
5450   unsigned SignExtend = 0;
5451   if (WantsExt) {
5452     // Check if the offset is defined by an extend, unless we looked through a
5453     // G_ZEXT earlier.
5454     if (!LookedThroughZExt) {
5455       MachineInstr *ExtInst = getDefIgnoringCopies(OffsetReg, MRI);
5456       auto Ext = getExtendTypeForInst(*ExtInst, MRI, true);
5457       if (Ext == AArch64_AM::InvalidShiftExtend)
5458         return None;
5459 
5460       SignExtend = isSignExtendShiftType(Ext) ? 1 : 0;
5461       // We only support SXTW for signed extension here.
5462       if (SignExtend && Ext != AArch64_AM::SXTW)
5463         return None;
5464       OffsetReg = ExtInst->getOperand(1).getReg();
5465     }
5466 
5467     // Need a 32-bit wide register here.
5468     MachineIRBuilder MIB(*MRI.getVRegDef(Root.getReg()));
5469     OffsetReg = moveScalarRegClass(OffsetReg, AArch64::GPR32RegClass, MIB);
5470   }
5471 
5472   // We can use the LHS of the GEP as the base, and the LHS of the shift as an
5473   // offset. Signify that we are shifting by setting the shift flag to 1.
5474   return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(Base.getReg()); },
5475            [=](MachineInstrBuilder &MIB) { MIB.addUse(OffsetReg); },
5476            [=](MachineInstrBuilder &MIB) {
5477              // Need to add both immediates here to make sure that they are both
5478              // added to the instruction.
5479              MIB.addImm(SignExtend);
5480              MIB.addImm(1);
5481            }}};
5482 }
5483 
5484 /// This is used for computing addresses like this:
5485 ///
5486 /// ldr x1, [x2, x3, lsl #3]
5487 ///
5488 /// Where x2 is the base register, and x3 is an offset register. The shift-left
5489 /// is a constant value specific to this load instruction. That is, we'll never
5490 /// see anything other than a 3 here (which corresponds to the size of the
5491 /// element being loaded).
5492 InstructionSelector::ComplexRendererFns
5493 AArch64InstructionSelector::selectAddrModeShiftedExtendXReg(
5494     MachineOperand &Root, unsigned SizeInBytes) const {
5495   if (!Root.isReg())
5496     return None;
5497   MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
5498 
5499   // We want to find something like this:
5500   //
5501   // val = G_CONSTANT LegalShiftVal
5502   // shift = G_SHL off_reg val
5503   // ptr = G_PTR_ADD base_reg shift
5504   // x = G_LOAD ptr
5505   //
5506   // And fold it into this addressing mode:
5507   //
5508   // ldr x, [base_reg, off_reg, lsl #LegalShiftVal]
5509 
5510   // Check if we can find the G_PTR_ADD.
5511   MachineInstr *PtrAdd =
5512       getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI);
5513   if (!PtrAdd || !isWorthFoldingIntoExtendedReg(*PtrAdd, MRI))
5514     return None;
5515 
5516   // Now, try to match an opcode which will match our specific offset.
5517   // We want a G_SHL or a G_MUL.
5518   MachineInstr *OffsetInst =
5519       getDefIgnoringCopies(PtrAdd->getOperand(2).getReg(), MRI);
5520   return selectExtendedSHL(Root, PtrAdd->getOperand(1),
5521                            OffsetInst->getOperand(0), SizeInBytes,
5522                            /*WantsExt=*/false);
5523 }
5524 
5525 /// This is used for computing addresses like this:
5526 ///
5527 /// ldr x1, [x2, x3]
5528 ///
5529 /// Where x2 is the base register, and x3 is an offset register.
5530 ///
5531 /// When it is possible (and profitable) to fold a G_PTR_ADD into the address
5532 /// calculation, this will do so. Otherwise, it will return None.
5533 InstructionSelector::ComplexRendererFns
5534 AArch64InstructionSelector::selectAddrModeRegisterOffset(
5535     MachineOperand &Root) const {
5536   MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
5537 
5538   // We need a GEP.
5539   MachineInstr *Gep = MRI.getVRegDef(Root.getReg());
5540   if (!Gep || Gep->getOpcode() != TargetOpcode::G_PTR_ADD)
5541     return None;
5542 
5543   // If this is used more than once, let's not bother folding.
5544   // TODO: Check if they are memory ops. If they are, then we can still fold
5545   // without having to recompute anything.
5546   if (!MRI.hasOneNonDBGUse(Gep->getOperand(0).getReg()))
5547     return None;
5548 
5549   // Base is the GEP's LHS, offset is its RHS.
5550   return {{[=](MachineInstrBuilder &MIB) {
5551              MIB.addUse(Gep->getOperand(1).getReg());
5552            },
5553            [=](MachineInstrBuilder &MIB) {
5554              MIB.addUse(Gep->getOperand(2).getReg());
5555            },
5556            [=](MachineInstrBuilder &MIB) {
5557              // Need to add both immediates here to make sure that they are both
5558              // added to the instruction.
5559              MIB.addImm(0);
5560              MIB.addImm(0);
5561            }}};
5562 }
5563 
5564 /// This is intended to be equivalent to selectAddrModeXRO in
5565 /// AArch64ISelDAGtoDAG. It's used for selecting X register offset loads.
5566 InstructionSelector::ComplexRendererFns
5567 AArch64InstructionSelector::selectAddrModeXRO(MachineOperand &Root,
5568                                               unsigned SizeInBytes) const {
5569   MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
5570   if (!Root.isReg())
5571     return None;
5572   MachineInstr *PtrAdd =
5573       getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI);
5574   if (!PtrAdd)
5575     return None;
5576 
5577   // Check for immediates which cannot be encoded in the [base + imm]
5578   // addressing mode, and can't be encoded in an add/sub. If this happens, we'll
5579   // end up with code like:
5580   //
5581   // mov x0, wide
5582   // add x1 base, x0
5583   // ldr x2, [x1, x0]
5584   //
5585   // In this situation, we can use the [base, xreg] addressing mode to save an
5586   // add/sub:
5587   //
5588   // mov x0, wide
5589   // ldr x2, [base, x0]
5590   auto ValAndVReg =
5591       getConstantVRegValWithLookThrough(PtrAdd->getOperand(2).getReg(), MRI);
5592   if (ValAndVReg) {
5593     unsigned Scale = Log2_32(SizeInBytes);
5594     int64_t ImmOff = ValAndVReg->Value.getSExtValue();
5595 
5596     // Skip immediates that can be selected in the load/store addressing
5597     // mode.
5598     if (ImmOff % SizeInBytes == 0 && ImmOff >= 0 &&
5599         ImmOff < (0x1000 << Scale))
5600       return None;
5601 
5602     // Helper lambda to decide whether or not it is preferable to emit an add.
5603     auto isPreferredADD = [](int64_t ImmOff) {
5604       // Constants in [0x0, 0xfff] can be encoded in an add.
5605       if ((ImmOff & 0xfffffffffffff000LL) == 0x0LL)
5606         return true;
5607 
5608       // Can it be encoded in an add lsl #12?
5609       if ((ImmOff & 0xffffffffff000fffLL) != 0x0LL)
5610         return false;
5611 
5612       // It can be encoded in an add lsl #12, but we may not want to. If it is
5613       // possible to select this as a single movz, then prefer that. A single
5614       // movz is faster than an add with a shift.
5615       return (ImmOff & 0xffffffffff00ffffLL) != 0x0LL &&
5616              (ImmOff & 0xffffffffffff0fffLL) != 0x0LL;
5617     };
5618 
5619     // If the immediate can be encoded in a single add/sub, then bail out.
5620     if (isPreferredADD(ImmOff) || isPreferredADD(-ImmOff))
5621       return None;
5622   }
5623 
5624   // Try to fold shifts into the addressing mode.
5625   auto AddrModeFns = selectAddrModeShiftedExtendXReg(Root, SizeInBytes);
5626   if (AddrModeFns)
5627     return AddrModeFns;
5628 
5629   // If that doesn't work, see if it's possible to fold in registers from
5630   // a GEP.
5631   return selectAddrModeRegisterOffset(Root);
5632 }
5633 
5634 /// This is used for computing addresses like this:
5635 ///
5636 /// ldr x0, [xBase, wOffset, sxtw #LegalShiftVal]
5637 ///
5638 /// Where we have a 64-bit base register, a 32-bit offset register, and an
5639 /// extend (which may or may not be signed).
5640 InstructionSelector::ComplexRendererFns
5641 AArch64InstructionSelector::selectAddrModeWRO(MachineOperand &Root,
5642                                               unsigned SizeInBytes) const {
5643   MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
5644 
5645   MachineInstr *PtrAdd =
5646       getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI);
5647   if (!PtrAdd || !isWorthFoldingIntoExtendedReg(*PtrAdd, MRI))
5648     return None;
5649 
5650   MachineOperand &LHS = PtrAdd->getOperand(1);
5651   MachineOperand &RHS = PtrAdd->getOperand(2);
5652   MachineInstr *OffsetInst = getDefIgnoringCopies(RHS.getReg(), MRI);
5653 
5654   // The first case is the same as selectAddrModeXRO, except we need an extend.
5655   // In this case, we try to find a shift and extend, and fold them into the
5656   // addressing mode.
5657   //
5658   // E.g.
5659   //
5660   // off_reg = G_Z/S/ANYEXT ext_reg
5661   // val = G_CONSTANT LegalShiftVal
5662   // shift = G_SHL off_reg val
5663   // ptr = G_PTR_ADD base_reg shift
5664   // x = G_LOAD ptr
5665   //
5666   // In this case we can get a load like this:
5667   //
5668   // ldr x0, [base_reg, ext_reg, sxtw #LegalShiftVal]
5669   auto ExtendedShl = selectExtendedSHL(Root, LHS, OffsetInst->getOperand(0),
5670                                        SizeInBytes, /*WantsExt=*/true);
5671   if (ExtendedShl)
5672     return ExtendedShl;
5673 
5674   // There was no shift. We can try and fold a G_Z/S/ANYEXT in alone though.
5675   //
5676   // e.g.
5677   // ldr something, [base_reg, ext_reg, sxtw]
5678   if (!isWorthFoldingIntoExtendedReg(*OffsetInst, MRI))
5679     return None;
5680 
5681   // Check if this is an extend. We'll get an extend type if it is.
5682   AArch64_AM::ShiftExtendType Ext =
5683       getExtendTypeForInst(*OffsetInst, MRI, /*IsLoadStore=*/true);
5684   if (Ext == AArch64_AM::InvalidShiftExtend)
5685     return None;
5686 
5687   // Need a 32-bit wide register.
5688   MachineIRBuilder MIB(*PtrAdd);
5689   Register ExtReg = moveScalarRegClass(OffsetInst->getOperand(1).getReg(),
5690                                        AArch64::GPR32RegClass, MIB);
5691   unsigned SignExtend = Ext == AArch64_AM::SXTW;
5692 
5693   // Base is LHS, offset is ExtReg.
5694   return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(LHS.getReg()); },
5695            [=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); },
5696            [=](MachineInstrBuilder &MIB) {
5697              MIB.addImm(SignExtend);
5698              MIB.addImm(0);
5699            }}};
5700 }
5701 
5702 /// Select a "register plus unscaled signed 9-bit immediate" address.  This
5703 /// should only match when there is an offset that is not valid for a scaled
5704 /// immediate addressing mode.  The "Size" argument is the size in bytes of the
5705 /// memory reference, which is needed here to know what is valid for a scaled
5706 /// immediate.
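///
/// This corresponds to the unscaled LDUR/STUR forms, which accept a signed
/// byte offset in the range [-256, 255].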
5707 InstructionSelector::ComplexRendererFns
5708 AArch64InstructionSelector::selectAddrModeUnscaled(MachineOperand &Root,
5709                                                    unsigned Size) const {
5710   MachineRegisterInfo &MRI =
5711       Root.getParent()->getParent()->getParent()->getRegInfo();
5712 
5713   if (!Root.isReg())
5714     return None;
5715 
5716   if (!isBaseWithConstantOffset(Root, MRI))
5717     return None;
5718 
5719   MachineInstr *RootDef = MRI.getVRegDef(Root.getReg());
5720   if (!RootDef)
5721     return None;
5722 
5723   MachineOperand &OffImm = RootDef->getOperand(2);
5724   if (!OffImm.isReg())
5725     return None;
5726   MachineInstr *RHS = MRI.getVRegDef(OffImm.getReg());
5727   if (!RHS || RHS->getOpcode() != TargetOpcode::G_CONSTANT)
5728     return None;
5729   int64_t RHSC;
5730   MachineOperand &RHSOp1 = RHS->getOperand(1);
5731   if (!RHSOp1.isCImm() || RHSOp1.getCImm()->getBitWidth() > 64)
5732     return None;
5733   RHSC = RHSOp1.getCImm()->getSExtValue();
5734 
5735   // If the offset is valid as a scaled immediate, don't match here.
5736   if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && RHSC < (0x1000 << Log2_32(Size)))
5737     return None;
5738   if (RHSC >= -256 && RHSC < 256) {
5739     MachineOperand &Base = RootDef->getOperand(1);
5740     return {{
5741         [=](MachineInstrBuilder &MIB) { MIB.add(Base); },
5742         [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC); },
5743     }};
5744   }
5745   return None;
5746 }
5747 
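/// Try to fold the :lo12: part of a G_ADD_LOW (the ADD half of an ADRP + ADD
/// pair) into the immediate operand of a load/store, e.g. turning
///
///   adrp x8, sym; add x8, x8, :lo12:sym; ldr x0, [x8]
///
/// into
///
///   adrp x8, sym; ldr x0, [x8, :lo12:sym]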
5748 InstructionSelector::ComplexRendererFns
5749 AArch64InstructionSelector::tryFoldAddLowIntoImm(MachineInstr &RootDef,
5750                                                  unsigned Size,
5751                                                  MachineRegisterInfo &MRI) const {
5752   if (RootDef.getOpcode() != AArch64::G_ADD_LOW)
5753     return None;
5754   MachineInstr &Adrp = *MRI.getVRegDef(RootDef.getOperand(1).getReg());
5755   if (Adrp.getOpcode() != AArch64::ADRP)
5756     return None;
5757 
5758   // TODO: add heuristics like isWorthFoldingADDlow() from SelectionDAG.
5759   auto Offset = Adrp.getOperand(1).getOffset();
5760   if (Offset % Size != 0)
5761     return None;
5762 
5763   auto GV = Adrp.getOperand(1).getGlobal();
5764   if (GV->isThreadLocal())
5765     return None;
5766 
5767   auto &MF = *RootDef.getParent()->getParent();
5768   if (GV->getPointerAlignment(MF.getDataLayout()) < Size)
5769     return None;
5770 
5771   unsigned OpFlags = STI.ClassifyGlobalReference(GV, MF.getTarget());
5772   MachineIRBuilder MIRBuilder(RootDef);
5773   Register AdrpReg = Adrp.getOperand(0).getReg();
5774   return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(AdrpReg); },
5775            [=](MachineInstrBuilder &MIB) {
5776              MIB.addGlobalAddress(GV, Offset,
5777                                   OpFlags | AArch64II::MO_PAGEOFF |
5778                                       AArch64II::MO_NC);
5779            }}};
5780 }
5781 
5782 /// Select a "register plus scaled unsigned 12-bit immediate" address.  The
5783 /// "Size" argument is the size in bytes of the memory reference, which
5784 /// determines the scale.
5785 InstructionSelector::ComplexRendererFns
5786 AArch64InstructionSelector::selectAddrModeIndexed(MachineOperand &Root,
5787                                                   unsigned Size) const {
5788   MachineFunction &MF = *Root.getParent()->getParent()->getParent();
5789   MachineRegisterInfo &MRI = MF.getRegInfo();
5790 
5791   if (!Root.isReg())
5792     return None;
5793 
5794   MachineInstr *RootDef = MRI.getVRegDef(Root.getReg());
5795   if (!RootDef)
5796     return None;
5797 
5798   if (RootDef->getOpcode() == TargetOpcode::G_FRAME_INDEX) {
5799     return {{
5800         [=](MachineInstrBuilder &MIB) { MIB.add(RootDef->getOperand(1)); },
5801         [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },
5802     }};
5803   }
5804 
5805   CodeModel::Model CM = MF.getTarget().getCodeModel();
5806   // Check if we can fold in the ADD of small code model ADRP + ADD address.
5807   if (CM == CodeModel::Small) {
5808     auto OpFns = tryFoldAddLowIntoImm(*RootDef, Size, MRI);
5809     if (OpFns)
5810       return OpFns;
5811   }
5812 
5813   if (isBaseWithConstantOffset(Root, MRI)) {
5814     MachineOperand &LHS = RootDef->getOperand(1);
5815     MachineOperand &RHS = RootDef->getOperand(2);
5816     MachineInstr *LHSDef = MRI.getVRegDef(LHS.getReg());
5817     MachineInstr *RHSDef = MRI.getVRegDef(RHS.getReg());
5818     if (LHSDef && RHSDef) {
5819       int64_t RHSC = (int64_t)RHSDef->getOperand(1).getCImm()->getZExtValue();
5820       unsigned Scale = Log2_32(Size);
5821       if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && RHSC < (0x1000 << Scale)) {
5822         if (LHSDef->getOpcode() == TargetOpcode::G_FRAME_INDEX)
5823           return {{
5824               [=](MachineInstrBuilder &MIB) { MIB.add(LHSDef->getOperand(1)); },
5825               [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC >> Scale); },
5826           }};
5827 
5828         return {{
5829             [=](MachineInstrBuilder &MIB) { MIB.add(LHS); },
5830             [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC >> Scale); },
5831         }};
5832       }
5833     }
5834   }
5835 
5836   // Before falling back to our general case, check if the unscaled
5837   // instructions can handle this. If so, that's preferable.
5838   if (selectAddrModeUnscaled(Root, Size).hasValue())
5839     return None;
5840 
5841   return {{
5842       [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
5843       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },
5844   }};
5845 }
5846 
5847 /// Given a shift instruction, return the correct shift type for that
5848 /// instruction.
5849 static AArch64_AM::ShiftExtendType getShiftTypeForInst(MachineInstr &MI) {
5850   // TODO: Handle AArch64_AM::ROR
5851   switch (MI.getOpcode()) {
5852   default:
5853     return AArch64_AM::InvalidShiftExtend;
5854   case TargetOpcode::G_SHL:
5855     return AArch64_AM::LSL;
5856   case TargetOpcode::G_LSHR:
5857     return AArch64_AM::LSR;
5858   case TargetOpcode::G_ASHR:
5859     return AArch64_AM::ASR;
5860   }
5861 }
5862 
5863 /// Select a "shifted register" operand. If the value is not shifted, set the
5864 /// shift operand to a default value of "lsl 0".
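/// For example, this folds the "x2, lsl #3" operand of "add x0, x1, x2, lsl #3".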
5865 ///
5866 /// TODO: Allow shifted register to be rotated in logical instructions.
5867 InstructionSelector::ComplexRendererFns
5868 AArch64InstructionSelector::selectShiftedRegister(MachineOperand &Root) const {
5869   if (!Root.isReg())
5870     return None;
5871   MachineRegisterInfo &MRI =
5872       Root.getParent()->getParent()->getParent()->getRegInfo();
5873 
5874   // Check if the operand is defined by an instruction which corresponds to
5875   // a ShiftExtendType. E.g. a G_SHL, G_LSHR, etc.
5876   //
5877   // TODO: Handle AArch64_AM::ROR for logical instructions.
5878   MachineInstr *ShiftInst = MRI.getVRegDef(Root.getReg());
5879   if (!ShiftInst)
5880     return None;
5881   AArch64_AM::ShiftExtendType ShType = getShiftTypeForInst(*ShiftInst);
5882   if (ShType == AArch64_AM::InvalidShiftExtend)
5883     return None;
5884   if (!isWorthFoldingIntoExtendedReg(*ShiftInst, MRI))
5885     return None;
5886 
5887   // Need an immediate on the RHS.
5888   MachineOperand &ShiftRHS = ShiftInst->getOperand(2);
5889   auto Immed = getImmedFromMO(ShiftRHS);
5890   if (!Immed)
5891     return None;
5892 
5893   // We have something that we can fold. Fold in the shift's LHS and RHS into
5894   // the instruction.
5895   MachineOperand &ShiftLHS = ShiftInst->getOperand(1);
5896   Register ShiftReg = ShiftLHS.getReg();
5897 
5898   unsigned NumBits = MRI.getType(ShiftReg).getSizeInBits();
5899   unsigned Val = *Immed & (NumBits - 1);
5900   unsigned ShiftVal = AArch64_AM::getShifterImm(ShType, Val);
5901 
5902   return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ShiftReg); },
5903            [=](MachineInstrBuilder &MIB) { MIB.addImm(ShiftVal); }}};
5904 }
5905 
5906 AArch64_AM::ShiftExtendType AArch64InstructionSelector::getExtendTypeForInst(
5907     MachineInstr &MI, MachineRegisterInfo &MRI, bool IsLoadStore) const {
5908   unsigned Opc = MI.getOpcode();
5909 
5910   // Handle explicit extend instructions first.
5911   if (Opc == TargetOpcode::G_SEXT || Opc == TargetOpcode::G_SEXT_INREG) {
5912     unsigned Size;
5913     if (Opc == TargetOpcode::G_SEXT)
5914       Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
5915     else
5916       Size = MI.getOperand(2).getImm();
5917     assert(Size != 64 && "Extend from 64 bits?");
5918     switch (Size) {
5919     case 8:
5920       return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::SXTB;
5921     case 16:
5922       return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::SXTH;
5923     case 32:
5924       return AArch64_AM::SXTW;
5925     default:
5926       return AArch64_AM::InvalidShiftExtend;
5927     }
5928   }
5929 
5930   if (Opc == TargetOpcode::G_ZEXT || Opc == TargetOpcode::G_ANYEXT) {
5931     unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
5932     assert(Size != 64 && "Extend from 64 bits?");
5933     switch (Size) {
5934     case 8:
5935       return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::UXTB;
5936     case 16:
5937       return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::UXTH;
5938     case 32:
5939       return AArch64_AM::UXTW;
5940     default:
5941       return AArch64_AM::InvalidShiftExtend;
5942     }
5943   }
5944 
5945   // Don't have an explicit extend. Try to handle a G_AND with a constant mask
5946   // on the RHS.
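  // E.g. "%v = G_AND %x, 0xff" zero-extends the low byte of %x, so it can be
  // treated as a UXTB.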
5947   if (Opc != TargetOpcode::G_AND)
5948     return AArch64_AM::InvalidShiftExtend;
5949 
5950   Optional<uint64_t> MaybeAndMask = getImmedFromMO(MI.getOperand(2));
5951   if (!MaybeAndMask)
5952     return AArch64_AM::InvalidShiftExtend;
5953   uint64_t AndMask = *MaybeAndMask;
5954   switch (AndMask) {
5955   default:
5956     return AArch64_AM::InvalidShiftExtend;
5957   case 0xFF:
5958     return !IsLoadStore ? AArch64_AM::UXTB : AArch64_AM::InvalidShiftExtend;
5959   case 0xFFFF:
5960     return !IsLoadStore ? AArch64_AM::UXTH : AArch64_AM::InvalidShiftExtend;
5961   case 0xFFFFFFFF:
5962     return AArch64_AM::UXTW;
5963   }
5964 }
5965 
5966 Register AArch64InstructionSelector::moveScalarRegClass(
5967     Register Reg, const TargetRegisterClass &RC, MachineIRBuilder &MIB) const {
5968   MachineRegisterInfo &MRI = *MIB.getMRI();
5969   auto Ty = MRI.getType(Reg);
5970   assert(!Ty.isVector() && "Expected scalars only!");
5971   if (Ty.getSizeInBits() == TRI.getRegSizeInBits(RC))
5972     return Reg;
5973 
5974   // Create a copy and immediately select it.
5975   // FIXME: We should have an emitCopy function?
5976   auto Copy = MIB.buildCopy({&RC}, {Reg});
5977   selectCopy(*Copy, TII, MRI, TRI, RBI);
5978   return Copy.getReg(0);
5979 }
5980 
5981 /// Select an "extended register" operand. This operand folds in an extend
5982 /// followed by an optional left shift.
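/// For example, this matches the "w2, sxth #1" operand of
/// "add x0, x1, w2, sxth #1".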
5983 InstructionSelector::ComplexRendererFns
5984 AArch64InstructionSelector::selectArithExtendedRegister(
5985     MachineOperand &Root) const {
5986   if (!Root.isReg())
5987     return None;
5988   MachineRegisterInfo &MRI =
5989       Root.getParent()->getParent()->getParent()->getRegInfo();
5990 
5991   uint64_t ShiftVal = 0;
5992   Register ExtReg;
5993   AArch64_AM::ShiftExtendType Ext;
5994   MachineInstr *RootDef = getDefIgnoringCopies(Root.getReg(), MRI);
5995   if (!RootDef)
5996     return None;
5997 
5998   if (!isWorthFoldingIntoExtendedReg(*RootDef, MRI))
5999     return None;
6000 
6001   // Check if we can fold a shift and an extend.
6002   if (RootDef->getOpcode() == TargetOpcode::G_SHL) {
6003     // Look for a constant on the RHS of the shift.
6004     MachineOperand &RHS = RootDef->getOperand(2);
6005     Optional<uint64_t> MaybeShiftVal = getImmedFromMO(RHS);
6006     if (!MaybeShiftVal)
6007       return None;
6008     ShiftVal = *MaybeShiftVal;
6009     if (ShiftVal > 4)
6010       return None;
6011     // Look for a valid extend instruction on the LHS of the shift.
6012     MachineOperand &LHS = RootDef->getOperand(1);
6013     MachineInstr *ExtDef = getDefIgnoringCopies(LHS.getReg(), MRI);
6014     if (!ExtDef)
6015       return None;
6016     Ext = getExtendTypeForInst(*ExtDef, MRI);
6017     if (Ext == AArch64_AM::InvalidShiftExtend)
6018       return None;
6019     ExtReg = ExtDef->getOperand(1).getReg();
6020   } else {
6021     // Didn't get a shift. Try just folding an extend.
6022     Ext = getExtendTypeForInst(*RootDef, MRI);
6023     if (Ext == AArch64_AM::InvalidShiftExtend)
6024       return None;
6025     ExtReg = RootDef->getOperand(1).getReg();
6026 
6027     // If we have a 32 bit instruction which zeroes out the high half of a
6028     // register, we get an implicit zero extend for free. Check if we have one.
6029     // FIXME: We actually emit the extend right now even though we don't have
6030     // to.
6031     if (Ext == AArch64_AM::UXTW && MRI.getType(ExtReg).getSizeInBits() == 32) {
6032       MachineInstr *ExtInst = MRI.getVRegDef(ExtReg);
6033       if (ExtInst && isDef32(*ExtInst))
6034         return None;
6035     }
6036   }
6037 
6038   // We require a GPR32 here. Narrow the ExtReg if needed using a subregister
6039   // copy.
6040   MachineIRBuilder MIB(*RootDef);
6041   ExtReg = moveScalarRegClass(ExtReg, AArch64::GPR32RegClass, MIB);
6042 
6043   return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); },
6044            [=](MachineInstrBuilder &MIB) {
6045              MIB.addImm(getArithExtendImm(Ext, ShiftVal));
6046            }}};
6047 }
6048 
6049 void AArch64InstructionSelector::renderTruncImm(MachineInstrBuilder &MIB,
6050                                                 const MachineInstr &MI,
6051                                                 int OpIdx) const {
6052   const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
6053   assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
6054          "Expected G_CONSTANT");
6055   Optional<int64_t> CstVal =
6056       getConstantVRegSExtVal(MI.getOperand(0).getReg(), MRI);
6057   assert(CstVal && "Expected constant value");
6058   MIB.addImm(CstVal.getValue());
6059 }
6060 
6061 void AArch64InstructionSelector::renderLogicalImm32(
6062   MachineInstrBuilder &MIB, const MachineInstr &I, int OpIdx) const {
6063   assert(I.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
6064          "Expected G_CONSTANT");
6065   uint64_t CstVal = I.getOperand(1).getCImm()->getZExtValue();
6066   uint64_t Enc = AArch64_AM::encodeLogicalImmediate(CstVal, 32);
6067   MIB.addImm(Enc);
6068 }
6069 
6070 void AArch64InstructionSelector::renderLogicalImm64(
6071   MachineInstrBuilder &MIB, const MachineInstr &I, int OpIdx) const {
6072   assert(I.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
6073          "Expected G_CONSTANT");
6074   uint64_t CstVal = I.getOperand(1).getCImm()->getZExtValue();
6075   uint64_t Enc = AArch64_AM::encodeLogicalImmediate(CstVal, 64);
6076   MIB.addImm(Enc);
6077 }
6078 
6079 void AArch64InstructionSelector::renderFPImm16(MachineInstrBuilder &MIB,
6080                                                const MachineInstr &MI,
6081                                                int OpIdx) const {
6082   assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
6083          "Expected G_FCONSTANT");
6084   MIB.addImm(
6085       AArch64_AM::getFP16Imm(MI.getOperand(1).getFPImm()->getValueAPF()));
6086 }
6087 
6088 void AArch64InstructionSelector::renderFPImm32(MachineInstrBuilder &MIB,
6089                                                const MachineInstr &MI,
6090                                                int OpIdx) const {
6091   assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
6092          "Expected G_FCONSTANT");
6093   MIB.addImm(
6094       AArch64_AM::getFP32Imm(MI.getOperand(1).getFPImm()->getValueAPF()));
6095 }
6096 
6097 void AArch64InstructionSelector::renderFPImm64(MachineInstrBuilder &MIB,
6098                                                const MachineInstr &MI,
6099                                                int OpIdx) const {
6100   assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
6101          "Expected G_FCONSTANT");
6102   MIB.addImm(
6103       AArch64_AM::getFP64Imm(MI.getOperand(1).getFPImm()->getValueAPF()));
6104 }
6105 
6106 bool AArch64InstructionSelector::isLoadStoreOfNumBytes(
6107     const MachineInstr &MI, unsigned NumBytes) const {
6108   if (!MI.mayLoadOrStore())
6109     return false;
6110   assert(MI.hasOneMemOperand() &&
6111          "Expected load/store to have only one mem op!");
6112   return (*MI.memoperands_begin())->getSize() == NumBytes;
6113 }
6114 
6115 bool AArch64InstructionSelector::isDef32(const MachineInstr &MI) const {
6116   const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
6117   if (MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() != 32)
6118     return false;
6119 
6120   // Only return true if we know the operation will zero-out the high half of
6121   // the 64-bit register. Truncates can be subregister copies, which don't
6122   // zero out the high bits. Copies and other copy-like instructions can be
6123   // fed by truncates, or could be lowered as subregister copies.
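  // Anything else that produces a 32-bit result is assumed to be selected to a
  // W-register instruction, and writes to a W register implicitly zero bits
  // [63:32] of the full X register.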
6124   switch (MI.getOpcode()) {
6125   default:
6126     return true;
6127   case TargetOpcode::COPY:
6128   case TargetOpcode::G_BITCAST:
6129   case TargetOpcode::G_TRUNC:
6130   case TargetOpcode::G_PHI:
6131     return false;
6132   }
6133 }
6134 
6135 
6136 // Perform fixups on the given PHI instruction's operands to force them all
6137 // to be the same as the destination regbank.
6138 static void fixupPHIOpBanks(MachineInstr &MI, MachineRegisterInfo &MRI,
6139                             const AArch64RegisterBankInfo &RBI) {
6140   assert(MI.getOpcode() == TargetOpcode::G_PHI && "Expected a G_PHI");
6141   Register DstReg = MI.getOperand(0).getReg();
6142   const RegisterBank *DstRB = MRI.getRegBankOrNull(DstReg);
6143   assert(DstRB && "Expected PHI dst to have regbank assigned");
6144   MachineIRBuilder MIB(MI);
6145 
6146   // Go through each operand and ensure it has the same regbank.
6147   for (unsigned OpIdx = 1; OpIdx < MI.getNumOperands(); ++OpIdx) {
6148     MachineOperand &MO = MI.getOperand(OpIdx);
6149     if (!MO.isReg())
6150       continue;
6151     Register OpReg = MO.getReg();
6152     const RegisterBank *RB = MRI.getRegBankOrNull(OpReg);
6153     if (RB != DstRB) {
6154       // Insert a cross-bank copy.
6155       auto *OpDef = MRI.getVRegDef(OpReg);
6156       const LLT &Ty = MRI.getType(OpReg);
6157       MachineBasicBlock &OpDefBB = *OpDef->getParent();
6158 
6159       // Any instruction we insert must appear after all PHIs in the block
6160       // for the block to be valid MIR.
6161       MachineBasicBlock::iterator InsertPt = std::next(OpDef->getIterator());
6162       if (InsertPt != OpDefBB.end() && InsertPt->isPHI())
6163         InsertPt = OpDefBB.getFirstNonPHI();
6164       MIB.setInsertPt(*OpDef->getParent(), InsertPt);
6165       auto Copy = MIB.buildCopy(Ty, OpReg);
6166       MRI.setRegBank(Copy.getReg(0), *DstRB);
6167       MO.setReg(Copy.getReg(0));
6168     }
6169   }
6170 }
6171 
6172 void AArch64InstructionSelector::processPHIs(MachineFunction &MF) {
6173   // We're looking for PHIs, build a list so we don't invalidate iterators.
6174   MachineRegisterInfo &MRI = MF.getRegInfo();
6175   SmallVector<MachineInstr *, 32> Phis;
6176   for (auto &BB : MF) {
6177     for (auto &MI : BB) {
6178       if (MI.getOpcode() == TargetOpcode::G_PHI)
6179         Phis.emplace_back(&MI);
6180     }
6181   }
6182 
6183   for (auto *MI : Phis) {
6184     // We need to do some work here if the operand types are < 16 bit and they
6185     // are split across fpr/gpr banks. Since all types <32b on gpr
6186     // end up being assigned gpr32 regclasses, we can end up with PHIs here
6187     // which try to select between a gpr32 and an fpr16. Ideally RBS shouldn't
6188     // be selecting heterogeneous regbanks for operands if possible, but we
6189     // still need to be able to deal with it here.
6190     //
6191     // To fix this, if we have a gpr-bank operand < 32b in size and at least
6192     // one other operand is on the fpr bank, then we add cross-bank copies
6193     // to homogenize the operand banks. For simplicity the bank that we choose
6194     // to settle on is whatever bank the def operand has. For example:
6195     //
6196     // %endbb:
6197     //   %dst:gpr(s16) = G_PHI %in1:gpr(s16), %bb1, %in2:fpr(s16), %bb2
6198     //  =>
6199     // %bb2:
6200     //   ...
6201     //   %in2_copy:gpr(s16) = COPY %in2:fpr(s16)
6202     //   ...
6203     // %endbb:
6204     //   %dst:gpr(s16) = G_PHI %in1:gpr(s16), %bb1, %in2_copy:gpr(s16), %bb2
6205     bool HasGPROp = false, HasFPROp = false;
6206     for (unsigned OpIdx = 1; OpIdx < MI->getNumOperands(); ++OpIdx) {
6207       const auto &MO = MI->getOperand(OpIdx);
6208       if (!MO.isReg())
6209         continue;
6210       const LLT &Ty = MRI.getType(MO.getReg());
6211       if (!Ty.isValid() || !Ty.isScalar())
6212         break;
6213       if (Ty.getSizeInBits() >= 32)
6214         break;
6215       const RegisterBank *RB = MRI.getRegBankOrNull(MO.getReg());
6216       // If for some reason we don't have a regbank yet, don't try anything.
6217       if (!RB)
6218         break;
6219 
6220       if (RB->getID() == AArch64::GPRRegBankID)
6221         HasGPROp = true;
6222       else
6223         HasFPROp = true;
6224     }
6225     // We have heterogeneous regbanks, so we need to fix them up.
6226     if (HasGPROp && HasFPROp)
6227       fixupPHIOpBanks(*MI, MRI, RBI);
6228   }
6229 }
6230 
6231 namespace llvm {
6232 InstructionSelector *
6233 createAArch64InstructionSelector(const AArch64TargetMachine &TM,
6234                                  AArch64Subtarget &Subtarget,
6235                                  AArch64RegisterBankInfo &RBI) {
6236   return new AArch64InstructionSelector(TM, Subtarget, RBI);
6237 }
6238 }
6239