1 //===- AArch64InstructionSelector.cpp ----------------------------*- C++ -*-==//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 /// \file
9 /// This file implements the targeting of the InstructionSelector class for
10 /// AArch64.
11 /// \todo This should be generated by TableGen.
12 //===----------------------------------------------------------------------===//
13 
14 #include "AArch64GlobalISelUtils.h"
15 #include "AArch64InstrInfo.h"
16 #include "AArch64MachineFunctionInfo.h"
17 #include "AArch64RegisterBankInfo.h"
18 #include "AArch64RegisterInfo.h"
19 #include "AArch64Subtarget.h"
20 #include "AArch64TargetMachine.h"
21 #include "AArch64GlobalISelUtils.h"
22 #include "MCTargetDesc/AArch64AddressingModes.h"
23 #include "MCTargetDesc/AArch64MCTargetDesc.h"
24 #include "llvm/ADT/Optional.h"
25 #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
26 #include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
27 #include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
28 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
29 #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
30 #include "llvm/CodeGen/MachineBasicBlock.h"
31 #include "llvm/CodeGen/MachineConstantPool.h"
32 #include "llvm/CodeGen/MachineFunction.h"
33 #include "llvm/CodeGen/MachineInstr.h"
34 #include "llvm/CodeGen/MachineInstrBuilder.h"
35 #include "llvm/CodeGen/MachineMemOperand.h"
36 #include "llvm/CodeGen/MachineOperand.h"
37 #include "llvm/CodeGen/MachineRegisterInfo.h"
38 #include "llvm/CodeGen/TargetOpcodes.h"
39 #include "llvm/IR/Constants.h"
40 #include "llvm/IR/DerivedTypes.h"
41 #include "llvm/IR/Instructions.h"
42 #include "llvm/IR/PatternMatch.h"
43 #include "llvm/IR/Type.h"
44 #include "llvm/IR/IntrinsicsAArch64.h"
45 #include "llvm/Pass.h"
46 #include "llvm/Support/Debug.h"
47 #include "llvm/Support/raw_ostream.h"
48 
49 #define DEBUG_TYPE "aarch64-isel"
50 
51 using namespace llvm;
52 using namespace MIPatternMatch;
53 using namespace AArch64GISelUtils;
54 
55 namespace llvm {
56 class BlockFrequencyInfo;
57 class ProfileSummaryInfo;
58 }
59 
60 namespace {
61 
62 #define GET_GLOBALISEL_PREDICATE_BITSET
63 #include "AArch64GenGlobalISel.inc"
64 #undef GET_GLOBALISEL_PREDICATE_BITSET
65 
66 class AArch64InstructionSelector : public InstructionSelector {
67 public:
68   AArch64InstructionSelector(const AArch64TargetMachine &TM,
69                              const AArch64Subtarget &STI,
70                              const AArch64RegisterBankInfo &RBI);
71 
72   bool select(MachineInstr &I) override;
getName()73   static const char *getName() { return DEBUG_TYPE; }
74 
setupMF(MachineFunction & MF,GISelKnownBits * KB,CodeGenCoverage & CoverageInfo,ProfileSummaryInfo * PSI,BlockFrequencyInfo * BFI)75   void setupMF(MachineFunction &MF, GISelKnownBits *KB,
76                CodeGenCoverage &CoverageInfo, ProfileSummaryInfo *PSI,
77                BlockFrequencyInfo *BFI) override {
78     InstructionSelector::setupMF(MF, KB, CoverageInfo, PSI, BFI);
79     MIB.setMF(MF);
80 
81     // hasFnAttribute() is expensive to call on every BRCOND selection, so
82     // cache it here for each run of the selector.
83     ProduceNonFlagSettingCondBr =
84         !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);
85     MFReturnAddr = Register();
86 
87     processPHIs(MF);
88   }
89 
90 private:
91   /// tblgen-erated 'select' implementation, used as the initial selector for
92   /// the patterns that don't require complex C++.
93   bool selectImpl(MachineInstr &I, CodeGenCoverage &CoverageInfo) const;
94 
95   // A lowering phase that runs before any selection attempts.
96   // Returns true if the instruction was modified.
97   bool preISelLower(MachineInstr &I);
98 
99   // An early selection function that runs before the selectImpl() call.
100   bool earlySelect(MachineInstr &I);
101 
102   // Do some preprocessing of G_PHIs before we begin selection.
103   void processPHIs(MachineFunction &MF);
104 
105   bool earlySelectSHL(MachineInstr &I, MachineRegisterInfo &MRI);
106 
107   /// Eliminate same-sized cross-bank copies into stores before selectImpl().
108   bool contractCrossBankCopyIntoStore(MachineInstr &I,
109                                       MachineRegisterInfo &MRI);
110 
111   bool convertPtrAddToAdd(MachineInstr &I, MachineRegisterInfo &MRI);
112 
113   bool selectVaStartAAPCS(MachineInstr &I, MachineFunction &MF,
114                           MachineRegisterInfo &MRI) const;
115   bool selectVaStartDarwin(MachineInstr &I, MachineFunction &MF,
116                            MachineRegisterInfo &MRI) const;
117 
118   ///@{
119   /// Helper functions for selectCompareBranch.
120   bool selectCompareBranchFedByFCmp(MachineInstr &I, MachineInstr &FCmp,
121                                     MachineIRBuilder &MIB) const;
122   bool selectCompareBranchFedByICmp(MachineInstr &I, MachineInstr &ICmp,
123                                     MachineIRBuilder &MIB) const;
124   bool tryOptCompareBranchFedByICmp(MachineInstr &I, MachineInstr &ICmp,
125                                     MachineIRBuilder &MIB) const;
126   bool tryOptAndIntoCompareBranch(MachineInstr &AndInst, bool Invert,
127                                   MachineBasicBlock *DstMBB,
128                                   MachineIRBuilder &MIB) const;
129   ///@}
130 
131   bool selectCompareBranch(MachineInstr &I, MachineFunction &MF,
132                            MachineRegisterInfo &MRI);
133 
134   bool selectVectorAshrLshr(MachineInstr &I, MachineRegisterInfo &MRI);
135   bool selectVectorSHL(MachineInstr &I, MachineRegisterInfo &MRI);
136 
137   // Helper to generate an equivalent of scalar_to_vector into a new register,
138   // returned via 'Dst'.
139   MachineInstr *emitScalarToVector(unsigned EltSize,
140                                    const TargetRegisterClass *DstRC,
141                                    Register Scalar,
142                                    MachineIRBuilder &MIRBuilder) const;
143 
144   /// Emit a lane insert into \p DstReg, or a new vector register if None is
145   /// provided.
146   ///
147   /// The lane inserted into is defined by \p LaneIdx. The vector source
148   /// register is given by \p SrcReg. The register containing the element is
149   /// given by \p EltReg.
150   MachineInstr *emitLaneInsert(Optional<Register> DstReg, Register SrcReg,
151                                Register EltReg, unsigned LaneIdx,
152                                const RegisterBank &RB,
153                                MachineIRBuilder &MIRBuilder) const;
154 
155   /// Emit a sequence of instructions representing a constant \p CV for a
156   /// vector register \p Dst. (E.g. a MOV, or a load from a constant pool.)
157   ///
158   /// \returns the last instruction in the sequence on success, and nullptr
159   /// otherwise.
160   MachineInstr *emitConstantVector(Register Dst, Constant *CV,
161                                    MachineIRBuilder &MIRBuilder,
162                                    MachineRegisterInfo &MRI);
163 
164   bool selectInsertElt(MachineInstr &I, MachineRegisterInfo &MRI);
165   bool tryOptConstantBuildVec(MachineInstr &MI, LLT DstTy,
166                               MachineRegisterInfo &MRI);
167   /// \returns true if a G_BUILD_VECTOR instruction \p MI can be selected as a
168   /// SUBREG_TO_REG.
169   bool tryOptBuildVecToSubregToReg(MachineInstr &MI, MachineRegisterInfo &MRI);
170   bool selectBuildVector(MachineInstr &I, MachineRegisterInfo &MRI);
171   bool selectMergeValues(MachineInstr &I, MachineRegisterInfo &MRI);
172   bool selectUnmergeValues(MachineInstr &I, MachineRegisterInfo &MRI);
173 
174   bool selectShuffleVector(MachineInstr &I, MachineRegisterInfo &MRI);
175   bool selectExtractElt(MachineInstr &I, MachineRegisterInfo &MRI);
176   bool selectConcatVectors(MachineInstr &I, MachineRegisterInfo &MRI);
177   bool selectSplitVectorUnmerge(MachineInstr &I, MachineRegisterInfo &MRI);
178 
179   /// Helper function to select vector load intrinsics like
180   /// @llvm.aarch64.neon.ld2.*, @llvm.aarch64.neon.ld4.*, etc.
181   /// \p Opc is the opcode that the selected instruction should use.
182   /// \p NumVecs is the number of vector destinations for the instruction.
183   /// \p I is the original G_INTRINSIC_W_SIDE_EFFECTS instruction.
184   bool selectVectorLoadIntrinsic(unsigned Opc, unsigned NumVecs,
185                                  MachineInstr &I);
186   bool selectIntrinsicWithSideEffects(MachineInstr &I,
187                                       MachineRegisterInfo &MRI);
188   bool selectIntrinsic(MachineInstr &I, MachineRegisterInfo &MRI);
189   bool selectVectorICmp(MachineInstr &I, MachineRegisterInfo &MRI);
190   bool selectIntrinsicTrunc(MachineInstr &I, MachineRegisterInfo &MRI) const;
191   bool selectIntrinsicRound(MachineInstr &I, MachineRegisterInfo &MRI) const;
192   bool selectJumpTable(MachineInstr &I, MachineRegisterInfo &MRI);
193   bool selectBrJT(MachineInstr &I, MachineRegisterInfo &MRI);
194   bool selectTLSGlobalValue(MachineInstr &I, MachineRegisterInfo &MRI);
195   bool selectReduction(MachineInstr &I, MachineRegisterInfo &MRI);
196   bool selectUSMovFromExtend(MachineInstr &I, MachineRegisterInfo &MRI);
197 
198   unsigned emitConstantPoolEntry(const Constant *CPVal,
199                                  MachineFunction &MF) const;
200   MachineInstr *emitLoadFromConstantPool(const Constant *CPVal,
201                                          MachineIRBuilder &MIRBuilder) const;
202 
203   // Emit a vector concat operation.
204   MachineInstr *emitVectorConcat(Optional<Register> Dst, Register Op1,
205                                  Register Op2,
206                                  MachineIRBuilder &MIRBuilder) const;
207 
208   // Emit an integer compare between LHS and RHS, which checks for Predicate.
209   MachineInstr *emitIntegerCompare(MachineOperand &LHS, MachineOperand &RHS,
210                                    MachineOperand &Predicate,
211                                    MachineIRBuilder &MIRBuilder) const;
212 
213   /// Emit a floating point comparison between \p LHS and \p RHS.
214   /// \p Pred if given is the intended predicate to use.
215   MachineInstr *emitFPCompare(Register LHS, Register RHS,
216                               MachineIRBuilder &MIRBuilder,
217                               Optional<CmpInst::Predicate> = None) const;
218 
219   MachineInstr *emitInstr(unsigned Opcode,
220                           std::initializer_list<llvm::DstOp> DstOps,
221                           std::initializer_list<llvm::SrcOp> SrcOps,
222                           MachineIRBuilder &MIRBuilder,
223                           const ComplexRendererFns &RenderFns = None) const;
224   /// Helper function to emit an add or sub instruction.
225   ///
226   /// \p AddrModeAndSizeToOpcode must contain each of the opcode variants above
227   /// in a specific order.
228   ///
229   /// Below is an example of the expected input to \p AddrModeAndSizeToOpcode.
230   ///
231   /// \code
232   ///   const std::array<std::array<unsigned, 2>, 4> Table {
233   ///    {{AArch64::ADDXri, AArch64::ADDWri},
234   ///     {AArch64::ADDXrs, AArch64::ADDWrs},
235   ///     {AArch64::ADDXrr, AArch64::ADDWrr},
236   ///     {AArch64::SUBXri, AArch64::SUBWri},
237   ///     {AArch64::ADDXrx, AArch64::ADDWrx}}};
238   /// \endcode
239   ///
240   /// Each row in the table corresponds to a different addressing mode. Each
241   /// column corresponds to a different register size.
242   ///
243   /// \attention Rows must be structured as follows:
244   ///   - Row 0: The ri opcode variants
245   ///   - Row 1: The rs opcode variants
246   ///   - Row 2: The rr opcode variants
247   ///   - Row 3: The ri opcode variants for negative immediates
248   ///   - Row 4: The rx opcode variants
249   ///
250   /// \attention Columns must be structured as follows:
251   ///   - Column 0: The 64-bit opcode variants
252   ///   - Column 1: The 32-bit opcode variants
253   ///
254   /// \p Dst is the destination register of the binop to emit.
255   /// \p LHS is the left-hand operand of the binop to emit.
256   /// \p RHS is the right-hand operand of the binop to emit.
257   MachineInstr *emitAddSub(
258       const std::array<std::array<unsigned, 2>, 5> &AddrModeAndSizeToOpcode,
259       Register Dst, MachineOperand &LHS, MachineOperand &RHS,
260       MachineIRBuilder &MIRBuilder) const;
261   MachineInstr *emitADD(Register DefReg, MachineOperand &LHS,
262                         MachineOperand &RHS,
263                         MachineIRBuilder &MIRBuilder) const;
264   MachineInstr *emitADDS(Register Dst, MachineOperand &LHS, MachineOperand &RHS,
265                          MachineIRBuilder &MIRBuilder) const;
266   MachineInstr *emitSUBS(Register Dst, MachineOperand &LHS, MachineOperand &RHS,
267                          MachineIRBuilder &MIRBuilder) const;
268   MachineInstr *emitCMN(MachineOperand &LHS, MachineOperand &RHS,
269                         MachineIRBuilder &MIRBuilder) const;
270   MachineInstr *emitTST(MachineOperand &LHS, MachineOperand &RHS,
271                         MachineIRBuilder &MIRBuilder) const;
272   MachineInstr *emitSelect(Register Dst, Register LHS, Register RHS,
273                            AArch64CC::CondCode CC,
274                            MachineIRBuilder &MIRBuilder) const;
275   MachineInstr *emitExtractVectorElt(Optional<Register> DstReg,
276                                      const RegisterBank &DstRB, LLT ScalarTy,
277                                      Register VecReg, unsigned LaneIdx,
278                                      MachineIRBuilder &MIRBuilder) const;
279 
280   /// Emit a CSet for an integer compare.
281   ///
282   /// \p DefReg and \p SrcReg are expected to be 32-bit scalar registers.
283   MachineInstr *emitCSetForICMP(Register DefReg, unsigned Pred,
284                                 MachineIRBuilder &MIRBuilder,
285                                 Register SrcReg = AArch64::WZR) const;
286   /// Emit a CSet for a FP compare.
287   ///
288   /// \p Dst is expected to be a 32-bit scalar register.
289   MachineInstr *emitCSetForFCmp(Register Dst, CmpInst::Predicate Pred,
290                                 MachineIRBuilder &MIRBuilder) const;
291 
292   /// Emit the overflow op for \p Opcode.
293   ///
294   /// \p Opcode is expected to be an overflow op's opcode, e.g. G_UADDO,
295   /// G_USUBO, etc.
296   std::pair<MachineInstr *, AArch64CC::CondCode>
297   emitOverflowOp(unsigned Opcode, Register Dst, MachineOperand &LHS,
298                  MachineOperand &RHS, MachineIRBuilder &MIRBuilder) const;
299 
300   /// Emit a TB(N)Z instruction which tests \p Bit in \p TestReg.
301   /// \p IsNegative is true if the test should be "not zero".
302   /// This will also optimize the test bit instruction when possible.
303   MachineInstr *emitTestBit(Register TestReg, uint64_t Bit, bool IsNegative,
304                             MachineBasicBlock *DstMBB,
305                             MachineIRBuilder &MIB) const;
306 
307   /// Emit a CB(N)Z instruction which branches to \p DestMBB.
308   MachineInstr *emitCBZ(Register CompareReg, bool IsNegative,
309                         MachineBasicBlock *DestMBB,
310                         MachineIRBuilder &MIB) const;
311 
312   // Equivalent to the i32shift_a and friends from AArch64InstrInfo.td.
313   // We use these manually instead of using the importer since it doesn't
314   // support SDNodeXForm.
315   ComplexRendererFns selectShiftA_32(const MachineOperand &Root) const;
316   ComplexRendererFns selectShiftB_32(const MachineOperand &Root) const;
317   ComplexRendererFns selectShiftA_64(const MachineOperand &Root) const;
318   ComplexRendererFns selectShiftB_64(const MachineOperand &Root) const;
319 
320   ComplexRendererFns select12BitValueWithLeftShift(uint64_t Immed) const;
321   ComplexRendererFns selectArithImmed(MachineOperand &Root) const;
322   ComplexRendererFns selectNegArithImmed(MachineOperand &Root) const;
323 
324   ComplexRendererFns selectAddrModeUnscaled(MachineOperand &Root,
325                                             unsigned Size) const;
326 
selectAddrModeUnscaled8(MachineOperand & Root) const327   ComplexRendererFns selectAddrModeUnscaled8(MachineOperand &Root) const {
328     return selectAddrModeUnscaled(Root, 1);
329   }
selectAddrModeUnscaled16(MachineOperand & Root) const330   ComplexRendererFns selectAddrModeUnscaled16(MachineOperand &Root) const {
331     return selectAddrModeUnscaled(Root, 2);
332   }
selectAddrModeUnscaled32(MachineOperand & Root) const333   ComplexRendererFns selectAddrModeUnscaled32(MachineOperand &Root) const {
334     return selectAddrModeUnscaled(Root, 4);
335   }
selectAddrModeUnscaled64(MachineOperand & Root) const336   ComplexRendererFns selectAddrModeUnscaled64(MachineOperand &Root) const {
337     return selectAddrModeUnscaled(Root, 8);
338   }
selectAddrModeUnscaled128(MachineOperand & Root) const339   ComplexRendererFns selectAddrModeUnscaled128(MachineOperand &Root) const {
340     return selectAddrModeUnscaled(Root, 16);
341   }
342 
343   /// Helper to try to fold in a GISEL_ADD_LOW into an immediate, to be used
344   /// from complex pattern matchers like selectAddrModeIndexed().
345   ComplexRendererFns tryFoldAddLowIntoImm(MachineInstr &RootDef, unsigned Size,
346                                           MachineRegisterInfo &MRI) const;
347 
348   ComplexRendererFns selectAddrModeIndexed(MachineOperand &Root,
349                                            unsigned Size) const;
350   template <int Width>
selectAddrModeIndexed(MachineOperand & Root) const351   ComplexRendererFns selectAddrModeIndexed(MachineOperand &Root) const {
352     return selectAddrModeIndexed(Root, Width / 8);
353   }
354 
355   bool isWorthFoldingIntoExtendedReg(MachineInstr &MI,
356                                      const MachineRegisterInfo &MRI) const;
357   ComplexRendererFns
358   selectAddrModeShiftedExtendXReg(MachineOperand &Root,
359                                   unsigned SizeInBytes) const;
360 
361   /// Returns a \p ComplexRendererFns which contains a base, offset, and whether
362   /// or not a shift + extend should be folded into an addressing mode. Returns
363   /// None when this is not profitable or possible.
364   ComplexRendererFns
365   selectExtendedSHL(MachineOperand &Root, MachineOperand &Base,
366                     MachineOperand &Offset, unsigned SizeInBytes,
367                     bool WantsExt) const;
368   ComplexRendererFns selectAddrModeRegisterOffset(MachineOperand &Root) const;
369   ComplexRendererFns selectAddrModeXRO(MachineOperand &Root,
370                                        unsigned SizeInBytes) const;
371   template <int Width>
selectAddrModeXRO(MachineOperand & Root) const372   ComplexRendererFns selectAddrModeXRO(MachineOperand &Root) const {
373     return selectAddrModeXRO(Root, Width / 8);
374   }
375 
376   ComplexRendererFns selectAddrModeWRO(MachineOperand &Root,
377                                        unsigned SizeInBytes) const;
378   template <int Width>
selectAddrModeWRO(MachineOperand & Root) const379   ComplexRendererFns selectAddrModeWRO(MachineOperand &Root) const {
380     return selectAddrModeWRO(Root, Width / 8);
381   }
382 
383   ComplexRendererFns selectShiftedRegister(MachineOperand &Root,
384                                            bool AllowROR = false) const;
385 
selectArithShiftedRegister(MachineOperand & Root) const386   ComplexRendererFns selectArithShiftedRegister(MachineOperand &Root) const {
387     return selectShiftedRegister(Root);
388   }
389 
selectLogicalShiftedRegister(MachineOperand & Root) const390   ComplexRendererFns selectLogicalShiftedRegister(MachineOperand &Root) const {
391     return selectShiftedRegister(Root, true);
392   }
393 
394   /// Given an extend instruction, determine the correct shift-extend type for
395   /// that instruction.
396   ///
397   /// If the instruction is going to be used in a load or store, pass
398   /// \p IsLoadStore = true.
399   AArch64_AM::ShiftExtendType
400   getExtendTypeForInst(MachineInstr &MI, MachineRegisterInfo &MRI,
401                        bool IsLoadStore = false) const;
402 
403   /// Move \p Reg to \p RC if \p Reg is not already on \p RC.
404   ///
405   /// \returns Either \p Reg if no change was necessary, or the new register
406   /// created by moving \p Reg.
407   ///
408   /// Note: This uses emitCopy right now.
409   Register moveScalarRegClass(Register Reg, const TargetRegisterClass &RC,
410                               MachineIRBuilder &MIB) const;
411 
412   ComplexRendererFns selectArithExtendedRegister(MachineOperand &Root) const;
413 
414   void renderTruncImm(MachineInstrBuilder &MIB, const MachineInstr &MI,
415                       int OpIdx = -1) const;
416   void renderLogicalImm32(MachineInstrBuilder &MIB, const MachineInstr &I,
417                           int OpIdx = -1) const;
418   void renderLogicalImm64(MachineInstrBuilder &MIB, const MachineInstr &I,
419                           int OpIdx = -1) const;
420   void renderFPImm16(MachineInstrBuilder &MIB, const MachineInstr &MI,
421                      int OpIdx = -1) const;
422   void renderFPImm32(MachineInstrBuilder &MIB, const MachineInstr &MI,
423                      int OpIdx = -1) const;
424   void renderFPImm64(MachineInstrBuilder &MIB, const MachineInstr &MI,
425                      int OpIdx = -1) const;
426 
427   // Materialize a GlobalValue or BlockAddress using a movz+movk sequence.
428   void materializeLargeCMVal(MachineInstr &I, const Value *V, unsigned OpFlags);
429 
430   // Optimization methods.
431   bool tryOptSelect(MachineInstr &MI);
432   MachineInstr *tryFoldIntegerCompare(MachineOperand &LHS, MachineOperand &RHS,
433                                       MachineOperand &Predicate,
434                                       MachineIRBuilder &MIRBuilder) const;
435 
436   /// Return true if \p MI is a load or store of \p NumBytes bytes.
437   bool isLoadStoreOfNumBytes(const MachineInstr &MI, unsigned NumBytes) const;
438 
439   /// Returns true if \p MI is guaranteed to have the high-half of a 64-bit
440   /// register zeroed out. In other words, the result of MI has been explicitly
441   /// zero extended.
442   bool isDef32(const MachineInstr &MI) const;
443 
444   const AArch64TargetMachine &TM;
445   const AArch64Subtarget &STI;
446   const AArch64InstrInfo &TII;
447   const AArch64RegisterInfo &TRI;
448   const AArch64RegisterBankInfo &RBI;
449 
450   bool ProduceNonFlagSettingCondBr = false;
451 
452   // Some cached values used during selection.
453   // We use LR as a live-in register, and we keep track of it here as it can be
454   // clobbered by calls.
455   Register MFReturnAddr;
456 
457   MachineIRBuilder MIB;
458 
459 #define GET_GLOBALISEL_PREDICATES_DECL
460 #include "AArch64GenGlobalISel.inc"
461 #undef GET_GLOBALISEL_PREDICATES_DECL
462 
463 // We declare the temporaries used by selectImpl() in the class to minimize the
464 // cost of constructing placeholder values.
465 #define GET_GLOBALISEL_TEMPORARIES_DECL
466 #include "AArch64GenGlobalISel.inc"
467 #undef GET_GLOBALISEL_TEMPORARIES_DECL
468 };
469 
470 } // end anonymous namespace
471 
472 #define GET_GLOBALISEL_IMPL
473 #include "AArch64GenGlobalISel.inc"
474 #undef GET_GLOBALISEL_IMPL
475 
AArch64InstructionSelector(const AArch64TargetMachine & TM,const AArch64Subtarget & STI,const AArch64RegisterBankInfo & RBI)476 AArch64InstructionSelector::AArch64InstructionSelector(
477     const AArch64TargetMachine &TM, const AArch64Subtarget &STI,
478     const AArch64RegisterBankInfo &RBI)
479     : InstructionSelector(), TM(TM), STI(STI), TII(*STI.getInstrInfo()),
480       TRI(*STI.getRegisterInfo()), RBI(RBI),
481 #define GET_GLOBALISEL_PREDICATES_INIT
482 #include "AArch64GenGlobalISel.inc"
483 #undef GET_GLOBALISEL_PREDICATES_INIT
484 #define GET_GLOBALISEL_TEMPORARIES_INIT
485 #include "AArch64GenGlobalISel.inc"
486 #undef GET_GLOBALISEL_TEMPORARIES_INIT
487 {
488 }
489 
490 // FIXME: This should be target-independent, inferred from the types declared
491 // for each class in the bank.
492 static const TargetRegisterClass *
getRegClassForTypeOnBank(LLT Ty,const RegisterBank & RB,const RegisterBankInfo & RBI,bool GetAllRegSet=false)493 getRegClassForTypeOnBank(LLT Ty, const RegisterBank &RB,
494                          const RegisterBankInfo &RBI,
495                          bool GetAllRegSet = false) {
496   if (RB.getID() == AArch64::GPRRegBankID) {
497     if (Ty.getSizeInBits() <= 32)
498       return GetAllRegSet ? &AArch64::GPR32allRegClass
499                           : &AArch64::GPR32RegClass;
500     if (Ty.getSizeInBits() == 64)
501       return GetAllRegSet ? &AArch64::GPR64allRegClass
502                           : &AArch64::GPR64RegClass;
503     if (Ty.getSizeInBits() == 128)
504       return &AArch64::XSeqPairsClassRegClass;
505     return nullptr;
506   }
507 
508   if (RB.getID() == AArch64::FPRRegBankID) {
509     switch (Ty.getSizeInBits()) {
510     case 8:
511       return &AArch64::FPR8RegClass;
512     case 16:
513       return &AArch64::FPR16RegClass;
514     case 32:
515       return &AArch64::FPR32RegClass;
516     case 64:
517       return &AArch64::FPR64RegClass;
518     case 128:
519       return &AArch64::FPR128RegClass;
520     }
521     return nullptr;
522   }
523 
524   return nullptr;
525 }
526 
527 /// Given a register bank, and size in bits, return the smallest register class
528 /// that can represent that combination.
529 static const TargetRegisterClass *
getMinClassForRegBank(const RegisterBank & RB,unsigned SizeInBits,bool GetAllRegSet=false)530 getMinClassForRegBank(const RegisterBank &RB, unsigned SizeInBits,
531                       bool GetAllRegSet = false) {
532   unsigned RegBankID = RB.getID();
533 
534   if (RegBankID == AArch64::GPRRegBankID) {
535     if (SizeInBits <= 32)
536       return GetAllRegSet ? &AArch64::GPR32allRegClass
537                           : &AArch64::GPR32RegClass;
538     if (SizeInBits == 64)
539       return GetAllRegSet ? &AArch64::GPR64allRegClass
540                           : &AArch64::GPR64RegClass;
541     if (SizeInBits == 128)
542       return &AArch64::XSeqPairsClassRegClass;
543   }
544 
545   if (RegBankID == AArch64::FPRRegBankID) {
546     switch (SizeInBits) {
547     default:
548       return nullptr;
549     case 8:
550       return &AArch64::FPR8RegClass;
551     case 16:
552       return &AArch64::FPR16RegClass;
553     case 32:
554       return &AArch64::FPR32RegClass;
555     case 64:
556       return &AArch64::FPR64RegClass;
557     case 128:
558       return &AArch64::FPR128RegClass;
559     }
560   }
561 
562   return nullptr;
563 }
564 
565 /// Returns the correct subregister to use for a given register class.
getSubRegForClass(const TargetRegisterClass * RC,const TargetRegisterInfo & TRI,unsigned & SubReg)566 static bool getSubRegForClass(const TargetRegisterClass *RC,
567                               const TargetRegisterInfo &TRI, unsigned &SubReg) {
568   switch (TRI.getRegSizeInBits(*RC)) {
569   case 8:
570     SubReg = AArch64::bsub;
571     break;
572   case 16:
573     SubReg = AArch64::hsub;
574     break;
575   case 32:
576     if (RC != &AArch64::FPR32RegClass)
577       SubReg = AArch64::sub_32;
578     else
579       SubReg = AArch64::ssub;
580     break;
581   case 64:
582     SubReg = AArch64::dsub;
583     break;
584   default:
585     LLVM_DEBUG(
586         dbgs() << "Couldn't find appropriate subregister for register class.");
587     return false;
588   }
589 
590   return true;
591 }
592 
593 /// Returns the minimum size the given register bank can hold.
getMinSizeForRegBank(const RegisterBank & RB)594 static unsigned getMinSizeForRegBank(const RegisterBank &RB) {
595   switch (RB.getID()) {
596   case AArch64::GPRRegBankID:
597     return 32;
598   case AArch64::FPRRegBankID:
599     return 8;
600   default:
601     llvm_unreachable("Tried to get minimum size for unknown register bank.");
602   }
603 }
604 
605 /// Create a REG_SEQUENCE instruction using the registers in \p Regs.
606 /// Helper function for functions like createDTuple and createQTuple.
607 ///
608 /// \p RegClassIDs - The list of register class IDs available for some tuple of
609 /// a scalar class. E.g. QQRegClassID, QQQRegClassID, QQQQRegClassID. This is
610 /// expected to contain between 2 and 4 tuple classes.
611 ///
612 /// \p SubRegs - The list of subregister classes associated with each register
613 /// class ID in \p RegClassIDs. E.g., QQRegClassID should use the qsub0
614 /// subregister class. The index of each subregister class is expected to
615 /// correspond with the index of each register class.
616 ///
617 /// \returns Either the destination register of REG_SEQUENCE instruction that
618 /// was created, or the 0th element of \p Regs if \p Regs contains a single
619 /// element.
createTuple(ArrayRef<Register> Regs,const unsigned RegClassIDs[],const unsigned SubRegs[],MachineIRBuilder & MIB)620 static Register createTuple(ArrayRef<Register> Regs,
621                             const unsigned RegClassIDs[],
622                             const unsigned SubRegs[], MachineIRBuilder &MIB) {
623   unsigned NumRegs = Regs.size();
624   if (NumRegs == 1)
625     return Regs[0];
626   assert(NumRegs >= 2 && NumRegs <= 4 &&
627          "Only support between two and 4 registers in a tuple!");
628   const TargetRegisterInfo *TRI = MIB.getMF().getSubtarget().getRegisterInfo();
629   auto *DesiredClass = TRI->getRegClass(RegClassIDs[NumRegs - 2]);
630   auto RegSequence =
631       MIB.buildInstr(TargetOpcode::REG_SEQUENCE, {DesiredClass}, {});
632   for (unsigned I = 0, E = Regs.size(); I < E; ++I) {
633     RegSequence.addUse(Regs[I]);
634     RegSequence.addImm(SubRegs[I]);
635   }
636   return RegSequence.getReg(0);
637 }
638 
639 /// Create a tuple of D-registers using the registers in \p Regs.
createDTuple(ArrayRef<Register> Regs,MachineIRBuilder & MIB)640 static Register createDTuple(ArrayRef<Register> Regs, MachineIRBuilder &MIB) {
641   static const unsigned RegClassIDs[] = {
642       AArch64::DDRegClassID, AArch64::DDDRegClassID, AArch64::DDDDRegClassID};
643   static const unsigned SubRegs[] = {AArch64::dsub0, AArch64::dsub1,
644                                      AArch64::dsub2, AArch64::dsub3};
645   return createTuple(Regs, RegClassIDs, SubRegs, MIB);
646 }
647 
648 /// Create a tuple of Q-registers using the registers in \p Regs.
createQTuple(ArrayRef<Register> Regs,MachineIRBuilder & MIB)649 static Register createQTuple(ArrayRef<Register> Regs, MachineIRBuilder &MIB) {
650   static const unsigned RegClassIDs[] = {
651       AArch64::QQRegClassID, AArch64::QQQRegClassID, AArch64::QQQQRegClassID};
652   static const unsigned SubRegs[] = {AArch64::qsub0, AArch64::qsub1,
653                                      AArch64::qsub2, AArch64::qsub3};
654   return createTuple(Regs, RegClassIDs, SubRegs, MIB);
655 }
656 
getImmedFromMO(const MachineOperand & Root)657 static Optional<uint64_t> getImmedFromMO(const MachineOperand &Root) {
658   auto &MI = *Root.getParent();
659   auto &MBB = *MI.getParent();
660   auto &MF = *MBB.getParent();
661   auto &MRI = MF.getRegInfo();
662   uint64_t Immed;
663   if (Root.isImm())
664     Immed = Root.getImm();
665   else if (Root.isCImm())
666     Immed = Root.getCImm()->getZExtValue();
667   else if (Root.isReg()) {
668     auto ValAndVReg =
669         getIConstantVRegValWithLookThrough(Root.getReg(), MRI, true);
670     if (!ValAndVReg)
671       return None;
672     Immed = ValAndVReg->Value.getSExtValue();
673   } else
674     return None;
675   return Immed;
676 }
677 
678 /// Check whether \p I is a currently unsupported binary operation:
679 /// - it has an unsized type
680 /// - an operand is not a vreg
681 /// - all operands are not in the same bank
682 /// These are checks that should someday live in the verifier, but right now,
683 /// these are mostly limitations of the aarch64 selector.
unsupportedBinOp(const MachineInstr & I,const AArch64RegisterBankInfo & RBI,const MachineRegisterInfo & MRI,const AArch64RegisterInfo & TRI)684 static bool unsupportedBinOp(const MachineInstr &I,
685                              const AArch64RegisterBankInfo &RBI,
686                              const MachineRegisterInfo &MRI,
687                              const AArch64RegisterInfo &TRI) {
688   LLT Ty = MRI.getType(I.getOperand(0).getReg());
689   if (!Ty.isValid()) {
690     LLVM_DEBUG(dbgs() << "Generic binop register should be typed\n");
691     return true;
692   }
693 
694   const RegisterBank *PrevOpBank = nullptr;
695   for (auto &MO : I.operands()) {
696     // FIXME: Support non-register operands.
697     if (!MO.isReg()) {
698       LLVM_DEBUG(dbgs() << "Generic inst non-reg operands are unsupported\n");
699       return true;
700     }
701 
702     // FIXME: Can generic operations have physical registers operands? If
703     // so, this will need to be taught about that, and we'll need to get the
704     // bank out of the minimal class for the register.
705     // Either way, this needs to be documented (and possibly verified).
706     if (!Register::isVirtualRegister(MO.getReg())) {
707       LLVM_DEBUG(dbgs() << "Generic inst has physical register operand\n");
708       return true;
709     }
710 
711     const RegisterBank *OpBank = RBI.getRegBank(MO.getReg(), MRI, TRI);
712     if (!OpBank) {
713       LLVM_DEBUG(dbgs() << "Generic register has no bank or class\n");
714       return true;
715     }
716 
717     if (PrevOpBank && OpBank != PrevOpBank) {
718       LLVM_DEBUG(dbgs() << "Generic inst operands have different banks\n");
719       return true;
720     }
721     PrevOpBank = OpBank;
722   }
723   return false;
724 }
725 
726 /// Select the AArch64 opcode for the basic binary operation \p GenericOpc
727 /// (such as G_OR or G_SDIV), appropriate for the register bank \p RegBankID
728 /// and of size \p OpSize.
729 /// \returns \p GenericOpc if the combination is unsupported.
selectBinaryOp(unsigned GenericOpc,unsigned RegBankID,unsigned OpSize)730 static unsigned selectBinaryOp(unsigned GenericOpc, unsigned RegBankID,
731                                unsigned OpSize) {
732   switch (RegBankID) {
733   case AArch64::GPRRegBankID:
734     if (OpSize == 32) {
735       switch (GenericOpc) {
736       case TargetOpcode::G_SHL:
737         return AArch64::LSLVWr;
738       case TargetOpcode::G_LSHR:
739         return AArch64::LSRVWr;
740       case TargetOpcode::G_ASHR:
741         return AArch64::ASRVWr;
742       default:
743         return GenericOpc;
744       }
745     } else if (OpSize == 64) {
746       switch (GenericOpc) {
747       case TargetOpcode::G_PTR_ADD:
748         return AArch64::ADDXrr;
749       case TargetOpcode::G_SHL:
750         return AArch64::LSLVXr;
751       case TargetOpcode::G_LSHR:
752         return AArch64::LSRVXr;
753       case TargetOpcode::G_ASHR:
754         return AArch64::ASRVXr;
755       default:
756         return GenericOpc;
757       }
758     }
759     break;
760   case AArch64::FPRRegBankID:
761     switch (OpSize) {
762     case 32:
763       switch (GenericOpc) {
764       case TargetOpcode::G_FADD:
765         return AArch64::FADDSrr;
766       case TargetOpcode::G_FSUB:
767         return AArch64::FSUBSrr;
768       case TargetOpcode::G_FMUL:
769         return AArch64::FMULSrr;
770       case TargetOpcode::G_FDIV:
771         return AArch64::FDIVSrr;
772       default:
773         return GenericOpc;
774       }
775     case 64:
776       switch (GenericOpc) {
777       case TargetOpcode::G_FADD:
778         return AArch64::FADDDrr;
779       case TargetOpcode::G_FSUB:
780         return AArch64::FSUBDrr;
781       case TargetOpcode::G_FMUL:
782         return AArch64::FMULDrr;
783       case TargetOpcode::G_FDIV:
784         return AArch64::FDIVDrr;
785       case TargetOpcode::G_OR:
786         return AArch64::ORRv8i8;
787       default:
788         return GenericOpc;
789       }
790     }
791     break;
792   }
793   return GenericOpc;
794 }
795 
796 /// Select the AArch64 opcode for the G_LOAD or G_STORE operation \p GenericOpc,
797 /// appropriate for the (value) register bank \p RegBankID and of memory access
798 /// size \p OpSize.  This returns the variant with the base+unsigned-immediate
799 /// addressing mode (e.g., LDRXui).
800 /// \returns \p GenericOpc if the combination is unsupported.
selectLoadStoreUIOp(unsigned GenericOpc,unsigned RegBankID,unsigned OpSize)801 static unsigned selectLoadStoreUIOp(unsigned GenericOpc, unsigned RegBankID,
802                                     unsigned OpSize) {
803   const bool isStore = GenericOpc == TargetOpcode::G_STORE;
804   switch (RegBankID) {
805   case AArch64::GPRRegBankID:
806     switch (OpSize) {
807     case 8:
808       return isStore ? AArch64::STRBBui : AArch64::LDRBBui;
809     case 16:
810       return isStore ? AArch64::STRHHui : AArch64::LDRHHui;
811     case 32:
812       return isStore ? AArch64::STRWui : AArch64::LDRWui;
813     case 64:
814       return isStore ? AArch64::STRXui : AArch64::LDRXui;
815     }
816     break;
817   case AArch64::FPRRegBankID:
818     switch (OpSize) {
819     case 8:
820       return isStore ? AArch64::STRBui : AArch64::LDRBui;
821     case 16:
822       return isStore ? AArch64::STRHui : AArch64::LDRHui;
823     case 32:
824       return isStore ? AArch64::STRSui : AArch64::LDRSui;
825     case 64:
826       return isStore ? AArch64::STRDui : AArch64::LDRDui;
827     case 128:
828       return isStore ? AArch64::STRQui : AArch64::LDRQui;
829     }
830     break;
831   }
832   return GenericOpc;
833 }
834 
835 #ifndef NDEBUG
836 /// Helper function that verifies that we have a valid copy at the end of
837 /// selectCopy. Verifies that the source and dest have the expected sizes and
838 /// then returns true.
isValidCopy(const MachineInstr & I,const RegisterBank & DstBank,const MachineRegisterInfo & MRI,const TargetRegisterInfo & TRI,const RegisterBankInfo & RBI)839 static bool isValidCopy(const MachineInstr &I, const RegisterBank &DstBank,
840                         const MachineRegisterInfo &MRI,
841                         const TargetRegisterInfo &TRI,
842                         const RegisterBankInfo &RBI) {
843   const Register DstReg = I.getOperand(0).getReg();
844   const Register SrcReg = I.getOperand(1).getReg();
845   const unsigned DstSize = RBI.getSizeInBits(DstReg, MRI, TRI);
846   const unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI);
847 
848   // Make sure the size of the source and dest line up.
849   assert(
850       (DstSize == SrcSize ||
851        // Copies are a mean to setup initial types, the number of
852        // bits may not exactly match.
853        (Register::isPhysicalRegister(SrcReg) && DstSize <= SrcSize) ||
854        // Copies are a mean to copy bits around, as long as we are
855        // on the same register class, that's fine. Otherwise, that
856        // means we need some SUBREG_TO_REG or AND & co.
857        (((DstSize + 31) / 32 == (SrcSize + 31) / 32) && DstSize > SrcSize)) &&
858       "Copy with different width?!");
859 
860   // Check the size of the destination.
861   assert((DstSize <= 64 || DstBank.getID() == AArch64::FPRRegBankID) &&
862          "GPRs cannot get more than 64-bit width values");
863 
864   return true;
865 }
866 #endif
867 
868 /// Helper function for selectCopy. Inserts a subregister copy from \p SrcReg
869 /// to \p *To.
870 ///
871 /// E.g "To = COPY SrcReg:SubReg"
copySubReg(MachineInstr & I,MachineRegisterInfo & MRI,const RegisterBankInfo & RBI,Register SrcReg,const TargetRegisterClass * To,unsigned SubReg)872 static bool copySubReg(MachineInstr &I, MachineRegisterInfo &MRI,
873                        const RegisterBankInfo &RBI, Register SrcReg,
874                        const TargetRegisterClass *To, unsigned SubReg) {
875   assert(SrcReg.isValid() && "Expected a valid source register?");
876   assert(To && "Destination register class cannot be null");
877   assert(SubReg && "Expected a valid subregister");
878 
879   MachineIRBuilder MIB(I);
880   auto SubRegCopy =
881       MIB.buildInstr(TargetOpcode::COPY, {To}, {}).addReg(SrcReg, 0, SubReg);
882   MachineOperand &RegOp = I.getOperand(1);
883   RegOp.setReg(SubRegCopy.getReg(0));
884 
885   // It's possible that the destination register won't be constrained. Make
886   // sure that happens.
887   if (!Register::isPhysicalRegister(I.getOperand(0).getReg()))
888     RBI.constrainGenericRegister(I.getOperand(0).getReg(), *To, MRI);
889 
890   return true;
891 }
892 
893 /// Helper function to get the source and destination register classes for a
894 /// copy. Returns a std::pair containing the source register class for the
895 /// copy, and the destination register class for the copy. If a register class
896 /// cannot be determined, then it will be nullptr.
897 static std::pair<const TargetRegisterClass *, const TargetRegisterClass *>
getRegClassesForCopy(MachineInstr & I,const TargetInstrInfo & TII,MachineRegisterInfo & MRI,const TargetRegisterInfo & TRI,const RegisterBankInfo & RBI)898 getRegClassesForCopy(MachineInstr &I, const TargetInstrInfo &TII,
899                      MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
900                      const RegisterBankInfo &RBI) {
901   Register DstReg = I.getOperand(0).getReg();
902   Register SrcReg = I.getOperand(1).getReg();
903   const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI);
904   const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI);
905   unsigned DstSize = RBI.getSizeInBits(DstReg, MRI, TRI);
906   unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI);
907 
908   // Special casing for cross-bank copies of s1s. We can technically represent
909   // a 1-bit value with any size of register. The minimum size for a GPR is 32
910   // bits. So, we need to put the FPR on 32 bits as well.
911   //
912   // FIXME: I'm not sure if this case holds true outside of copies. If it does,
913   // then we can pull it into the helpers that get the appropriate class for a
914   // register bank. Or make a new helper that carries along some constraint
915   // information.
916   if (SrcRegBank != DstRegBank && (DstSize == 1 && SrcSize == 1))
917     SrcSize = DstSize = 32;
918 
919   return {getMinClassForRegBank(SrcRegBank, SrcSize, true),
920           getMinClassForRegBank(DstRegBank, DstSize, true)};
921 }
922 
selectCopy(MachineInstr & I,const TargetInstrInfo & TII,MachineRegisterInfo & MRI,const TargetRegisterInfo & TRI,const RegisterBankInfo & RBI)923 static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
924                        MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
925                        const RegisterBankInfo &RBI) {
926   Register DstReg = I.getOperand(0).getReg();
927   Register SrcReg = I.getOperand(1).getReg();
928   const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI);
929   const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI);
930 
931   // Find the correct register classes for the source and destination registers.
932   const TargetRegisterClass *SrcRC;
933   const TargetRegisterClass *DstRC;
934   std::tie(SrcRC, DstRC) = getRegClassesForCopy(I, TII, MRI, TRI, RBI);
935 
936   if (!DstRC) {
937     LLVM_DEBUG(dbgs() << "Unexpected dest size "
938                       << RBI.getSizeInBits(DstReg, MRI, TRI) << '\n');
939     return false;
940   }
941 
942   // A couple helpers below, for making sure that the copy we produce is valid.
943 
944   // Set to true if we insert a SUBREG_TO_REG. If we do this, then we don't want
945   // to verify that the src and dst are the same size, since that's handled by
946   // the SUBREG_TO_REG.
947   bool KnownValid = false;
948 
949   // Returns true, or asserts if something we don't expect happens. Instead of
950   // returning true, we return isValidCopy() to ensure that we verify the
951   // result.
952   auto CheckCopy = [&]() {
953     // If we have a bitcast or something, we can't have physical registers.
954     assert((I.isCopy() ||
955             (!Register::isPhysicalRegister(I.getOperand(0).getReg()) &&
956              !Register::isPhysicalRegister(I.getOperand(1).getReg()))) &&
957            "No phys reg on generic operator!");
958     bool ValidCopy = true;
959 #ifndef NDEBUG
960     ValidCopy = KnownValid || isValidCopy(I, DstRegBank, MRI, TRI, RBI);
961     assert(ValidCopy && "Invalid copy.");
962 #endif
963     (void)KnownValid;
964     return ValidCopy;
965   };
966 
967   // Is this a copy? If so, then we may need to insert a subregister copy.
968   if (I.isCopy()) {
969     // Yes. Check if there's anything to fix up.
970     if (!SrcRC) {
971       LLVM_DEBUG(dbgs() << "Couldn't determine source register class\n");
972       return false;
973     }
974 
975     unsigned SrcSize = TRI.getRegSizeInBits(*SrcRC);
976     unsigned DstSize = TRI.getRegSizeInBits(*DstRC);
977     unsigned SubReg;
978 
979     // If the source bank doesn't support a subregister copy small enough,
980     // then we first need to copy to the destination bank.
981     if (getMinSizeForRegBank(SrcRegBank) > DstSize) {
982       const TargetRegisterClass *DstTempRC =
983           getMinClassForRegBank(DstRegBank, SrcSize, /* GetAllRegSet */ true);
984       getSubRegForClass(DstRC, TRI, SubReg);
985 
986       MachineIRBuilder MIB(I);
987       auto Copy = MIB.buildCopy({DstTempRC}, {SrcReg});
988       copySubReg(I, MRI, RBI, Copy.getReg(0), DstRC, SubReg);
989     } else if (SrcSize > DstSize) {
990       // If the source register is bigger than the destination we need to
991       // perform a subregister copy.
992       const TargetRegisterClass *SubRegRC =
993           getMinClassForRegBank(SrcRegBank, DstSize, /* GetAllRegSet */ true);
994       getSubRegForClass(SubRegRC, TRI, SubReg);
995       copySubReg(I, MRI, RBI, SrcReg, DstRC, SubReg);
996     } else if (DstSize > SrcSize) {
997       // If the destination register is bigger than the source we need to do
998       // a promotion using SUBREG_TO_REG.
999       const TargetRegisterClass *PromotionRC =
1000           getMinClassForRegBank(SrcRegBank, DstSize, /* GetAllRegSet */ true);
1001       getSubRegForClass(SrcRC, TRI, SubReg);
1002 
1003       Register PromoteReg = MRI.createVirtualRegister(PromotionRC);
1004       BuildMI(*I.getParent(), I, I.getDebugLoc(),
1005               TII.get(AArch64::SUBREG_TO_REG), PromoteReg)
1006           .addImm(0)
1007           .addUse(SrcReg)
1008           .addImm(SubReg);
1009       MachineOperand &RegOp = I.getOperand(1);
1010       RegOp.setReg(PromoteReg);
1011 
1012       // Promise that the copy is implicitly validated by the SUBREG_TO_REG.
1013       KnownValid = true;
1014     }
1015 
1016     // If the destination is a physical register, then there's nothing to
1017     // change, so we're done.
1018     if (Register::isPhysicalRegister(DstReg))
1019       return CheckCopy();
1020   }
1021 
1022   // No need to constrain SrcReg. It will get constrained when we hit another
1023   // of its use or its defs. Copies do not have constraints.
1024   if (!RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
1025     LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode())
1026                       << " operand\n");
1027     return false;
1028   }
1029 
1030   // If this a GPR ZEXT that we want to just reduce down into a copy.
1031   // The sizes will be mismatched with the source < 32b but that's ok.
1032   if (I.getOpcode() == TargetOpcode::G_ZEXT) {
1033     I.setDesc(TII.get(AArch64::COPY));
1034     assert(SrcRegBank.getID() == AArch64::GPRRegBankID);
1035     return selectCopy(I, TII, MRI, TRI, RBI);
1036   }
1037 
1038   I.setDesc(TII.get(AArch64::COPY));
1039   return CheckCopy();
1040 }
1041 
selectFPConvOpc(unsigned GenericOpc,LLT DstTy,LLT SrcTy)1042 static unsigned selectFPConvOpc(unsigned GenericOpc, LLT DstTy, LLT SrcTy) {
1043   if (!DstTy.isScalar() || !SrcTy.isScalar())
1044     return GenericOpc;
1045 
1046   const unsigned DstSize = DstTy.getSizeInBits();
1047   const unsigned SrcSize = SrcTy.getSizeInBits();
1048 
1049   switch (DstSize) {
1050   case 32:
1051     switch (SrcSize) {
1052     case 32:
1053       switch (GenericOpc) {
1054       case TargetOpcode::G_SITOFP:
1055         return AArch64::SCVTFUWSri;
1056       case TargetOpcode::G_UITOFP:
1057         return AArch64::UCVTFUWSri;
1058       case TargetOpcode::G_FPTOSI:
1059         return AArch64::FCVTZSUWSr;
1060       case TargetOpcode::G_FPTOUI:
1061         return AArch64::FCVTZUUWSr;
1062       default:
1063         return GenericOpc;
1064       }
1065     case 64:
1066       switch (GenericOpc) {
1067       case TargetOpcode::G_SITOFP:
1068         return AArch64::SCVTFUXSri;
1069       case TargetOpcode::G_UITOFP:
1070         return AArch64::UCVTFUXSri;
1071       case TargetOpcode::G_FPTOSI:
1072         return AArch64::FCVTZSUWDr;
1073       case TargetOpcode::G_FPTOUI:
1074         return AArch64::FCVTZUUWDr;
1075       default:
1076         return GenericOpc;
1077       }
1078     default:
1079       return GenericOpc;
1080     }
1081   case 64:
1082     switch (SrcSize) {
1083     case 32:
1084       switch (GenericOpc) {
1085       case TargetOpcode::G_SITOFP:
1086         return AArch64::SCVTFUWDri;
1087       case TargetOpcode::G_UITOFP:
1088         return AArch64::UCVTFUWDri;
1089       case TargetOpcode::G_FPTOSI:
1090         return AArch64::FCVTZSUXSr;
1091       case TargetOpcode::G_FPTOUI:
1092         return AArch64::FCVTZUUXSr;
1093       default:
1094         return GenericOpc;
1095       }
1096     case 64:
1097       switch (GenericOpc) {
1098       case TargetOpcode::G_SITOFP:
1099         return AArch64::SCVTFUXDri;
1100       case TargetOpcode::G_UITOFP:
1101         return AArch64::UCVTFUXDri;
1102       case TargetOpcode::G_FPTOSI:
1103         return AArch64::FCVTZSUXDr;
1104       case TargetOpcode::G_FPTOUI:
1105         return AArch64::FCVTZUUXDr;
1106       default:
1107         return GenericOpc;
1108       }
1109     default:
1110       return GenericOpc;
1111     }
1112   default:
1113     return GenericOpc;
1114   };
1115   return GenericOpc;
1116 }
1117 
1118 MachineInstr *
emitSelect(Register Dst,Register True,Register False,AArch64CC::CondCode CC,MachineIRBuilder & MIB) const1119 AArch64InstructionSelector::emitSelect(Register Dst, Register True,
1120                                        Register False, AArch64CC::CondCode CC,
1121                                        MachineIRBuilder &MIB) const {
1122   MachineRegisterInfo &MRI = *MIB.getMRI();
1123   assert(RBI.getRegBank(False, MRI, TRI)->getID() ==
1124              RBI.getRegBank(True, MRI, TRI)->getID() &&
1125          "Expected both select operands to have the same regbank?");
1126   LLT Ty = MRI.getType(True);
1127   if (Ty.isVector())
1128     return nullptr;
1129   const unsigned Size = Ty.getSizeInBits();
1130   assert((Size == 32 || Size == 64) &&
1131          "Expected 32 bit or 64 bit select only?");
1132   const bool Is32Bit = Size == 32;
1133   if (RBI.getRegBank(True, MRI, TRI)->getID() != AArch64::GPRRegBankID) {
1134     unsigned Opc = Is32Bit ? AArch64::FCSELSrrr : AArch64::FCSELDrrr;
1135     auto FCSel = MIB.buildInstr(Opc, {Dst}, {True, False}).addImm(CC);
1136     constrainSelectedInstRegOperands(*FCSel, TII, TRI, RBI);
1137     return &*FCSel;
1138   }
1139 
1140   // By default, we'll try and emit a CSEL.
1141   unsigned Opc = Is32Bit ? AArch64::CSELWr : AArch64::CSELXr;
1142   bool Optimized = false;
1143   auto TryFoldBinOpIntoSelect = [&Opc, Is32Bit, &CC, &MRI,
1144                                  &Optimized](Register &Reg, Register &OtherReg,
1145                                              bool Invert) {
1146     if (Optimized)
1147       return false;
1148 
1149     // Attempt to fold:
1150     //
1151     // %sub = G_SUB 0, %x
1152     // %select = G_SELECT cc, %reg, %sub
1153     //
1154     // Into:
1155     // %select = CSNEG %reg, %x, cc
1156     Register MatchReg;
1157     if (mi_match(Reg, MRI, m_Neg(m_Reg(MatchReg)))) {
1158       Opc = Is32Bit ? AArch64::CSNEGWr : AArch64::CSNEGXr;
1159       Reg = MatchReg;
1160       if (Invert) {
1161         CC = AArch64CC::getInvertedCondCode(CC);
1162         std::swap(Reg, OtherReg);
1163       }
1164       return true;
1165     }
1166 
1167     // Attempt to fold:
1168     //
1169     // %xor = G_XOR %x, -1
1170     // %select = G_SELECT cc, %reg, %xor
1171     //
1172     // Into:
1173     // %select = CSINV %reg, %x, cc
1174     if (mi_match(Reg, MRI, m_Not(m_Reg(MatchReg)))) {
1175       Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
1176       Reg = MatchReg;
1177       if (Invert) {
1178         CC = AArch64CC::getInvertedCondCode(CC);
1179         std::swap(Reg, OtherReg);
1180       }
1181       return true;
1182     }
1183 
1184     // Attempt to fold:
1185     //
1186     // %add = G_ADD %x, 1
1187     // %select = G_SELECT cc, %reg, %add
1188     //
1189     // Into:
1190     // %select = CSINC %reg, %x, cc
1191     if (mi_match(Reg, MRI,
1192                  m_any_of(m_GAdd(m_Reg(MatchReg), m_SpecificICst(1)),
1193                           m_GPtrAdd(m_Reg(MatchReg), m_SpecificICst(1))))) {
1194       Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
1195       Reg = MatchReg;
1196       if (Invert) {
1197         CC = AArch64CC::getInvertedCondCode(CC);
1198         std::swap(Reg, OtherReg);
1199       }
1200       return true;
1201     }
1202 
1203     return false;
1204   };
1205 
1206   // Helper lambda which tries to use CSINC/CSINV for the instruction when its
1207   // true/false values are constants.
1208   // FIXME: All of these patterns already exist in tablegen. We should be
1209   // able to import these.
1210   auto TryOptSelectCst = [&Opc, &True, &False, &CC, Is32Bit, &MRI,
1211                           &Optimized]() {
1212     if (Optimized)
1213       return false;
1214     auto TrueCst = getIConstantVRegValWithLookThrough(True, MRI);
1215     auto FalseCst = getIConstantVRegValWithLookThrough(False, MRI);
1216     if (!TrueCst && !FalseCst)
1217       return false;
1218 
1219     Register ZReg = Is32Bit ? AArch64::WZR : AArch64::XZR;
1220     if (TrueCst && FalseCst) {
1221       int64_t T = TrueCst->Value.getSExtValue();
1222       int64_t F = FalseCst->Value.getSExtValue();
1223 
1224       if (T == 0 && F == 1) {
1225         // G_SELECT cc, 0, 1 -> CSINC zreg, zreg, cc
1226         Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
1227         True = ZReg;
1228         False = ZReg;
1229         return true;
1230       }
1231 
1232       if (T == 0 && F == -1) {
1233         // G_SELECT cc 0, -1 -> CSINV zreg, zreg cc
1234         Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
1235         True = ZReg;
1236         False = ZReg;
1237         return true;
1238       }
1239     }
1240 
1241     if (TrueCst) {
1242       int64_t T = TrueCst->Value.getSExtValue();
1243       if (T == 1) {
1244         // G_SELECT cc, 1, f -> CSINC f, zreg, inv_cc
1245         Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
1246         True = False;
1247         False = ZReg;
1248         CC = AArch64CC::getInvertedCondCode(CC);
1249         return true;
1250       }
1251 
1252       if (T == -1) {
1253         // G_SELECT cc, -1, f -> CSINV f, zreg, inv_cc
1254         Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
1255         True = False;
1256         False = ZReg;
1257         CC = AArch64CC::getInvertedCondCode(CC);
1258         return true;
1259       }
1260     }
1261 
1262     if (FalseCst) {
1263       int64_t F = FalseCst->Value.getSExtValue();
1264       if (F == 1) {
1265         // G_SELECT cc, t, 1 -> CSINC t, zreg, cc
1266         Opc = Is32Bit ? AArch64::CSINCWr : AArch64::CSINCXr;
1267         False = ZReg;
1268         return true;
1269       }
1270 
1271       if (F == -1) {
1272         // G_SELECT cc, t, -1 -> CSINC t, zreg, cc
1273         Opc = Is32Bit ? AArch64::CSINVWr : AArch64::CSINVXr;
1274         False = ZReg;
1275         return true;
1276       }
1277     }
1278     return false;
1279   };
1280 
1281   Optimized |= TryFoldBinOpIntoSelect(False, True, /*Invert = */ false);
1282   Optimized |= TryFoldBinOpIntoSelect(True, False, /*Invert = */ true);
1283   Optimized |= TryOptSelectCst();
1284   auto SelectInst = MIB.buildInstr(Opc, {Dst}, {True, False}).addImm(CC);
1285   constrainSelectedInstRegOperands(*SelectInst, TII, TRI, RBI);
1286   return &*SelectInst;
1287 }
1288 
changeICMPPredToAArch64CC(CmpInst::Predicate P)1289 static AArch64CC::CondCode changeICMPPredToAArch64CC(CmpInst::Predicate P) {
1290   switch (P) {
1291   default:
1292     llvm_unreachable("Unknown condition code!");
1293   case CmpInst::ICMP_NE:
1294     return AArch64CC::NE;
1295   case CmpInst::ICMP_EQ:
1296     return AArch64CC::EQ;
1297   case CmpInst::ICMP_SGT:
1298     return AArch64CC::GT;
1299   case CmpInst::ICMP_SGE:
1300     return AArch64CC::GE;
1301   case CmpInst::ICMP_SLT:
1302     return AArch64CC::LT;
1303   case CmpInst::ICMP_SLE:
1304     return AArch64CC::LE;
1305   case CmpInst::ICMP_UGT:
1306     return AArch64CC::HI;
1307   case CmpInst::ICMP_UGE:
1308     return AArch64CC::HS;
1309   case CmpInst::ICMP_ULT:
1310     return AArch64CC::LO;
1311   case CmpInst::ICMP_ULE:
1312     return AArch64CC::LS;
1313   }
1314 }
1315 
1316 /// Return a register which can be used as a bit to test in a TB(N)Z.
getTestBitReg(Register Reg,uint64_t & Bit,bool & Invert,MachineRegisterInfo & MRI)1317 static Register getTestBitReg(Register Reg, uint64_t &Bit, bool &Invert,
1318                               MachineRegisterInfo &MRI) {
1319   assert(Reg.isValid() && "Expected valid register!");
1320   bool HasZext = false;
1321   while (MachineInstr *MI = getDefIgnoringCopies(Reg, MRI)) {
1322     unsigned Opc = MI->getOpcode();
1323 
1324     if (!MI->getOperand(0).isReg() ||
1325         !MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
1326       break;
1327 
1328     // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits.
1329     //
1330     // (tbz (trunc x), b) -> (tbz x, b) is always safe, because the bit number
1331     // on the truncated x is the same as the bit number on x.
1332     if (Opc == TargetOpcode::G_ANYEXT || Opc == TargetOpcode::G_ZEXT ||
1333         Opc == TargetOpcode::G_TRUNC) {
1334       if (Opc == TargetOpcode::G_ZEXT)
1335         HasZext = true;
1336 
1337       Register NextReg = MI->getOperand(1).getReg();
1338       // Did we find something worth folding?
1339       if (!NextReg.isValid() || !MRI.hasOneNonDBGUse(NextReg))
1340         break;
1341 
1342       // NextReg is worth folding. Keep looking.
1343       Reg = NextReg;
1344       continue;
1345     }
1346 
1347     // Attempt to find a suitable operation with a constant on one side.
1348     Optional<uint64_t> C;
1349     Register TestReg;
1350     switch (Opc) {
1351     default:
1352       break;
1353     case TargetOpcode::G_AND:
1354     case TargetOpcode::G_XOR: {
1355       TestReg = MI->getOperand(1).getReg();
1356       Register ConstantReg = MI->getOperand(2).getReg();
1357       auto VRegAndVal = getIConstantVRegValWithLookThrough(ConstantReg, MRI);
1358       if (!VRegAndVal) {
1359         // AND commutes, check the other side for a constant.
1360         // FIXME: Can we canonicalize the constant so that it's always on the
1361         // same side at some point earlier?
1362         std::swap(ConstantReg, TestReg);
1363         VRegAndVal = getIConstantVRegValWithLookThrough(ConstantReg, MRI);
1364       }
1365       if (VRegAndVal) {
1366         if (HasZext)
1367           C = VRegAndVal->Value.getZExtValue();
1368         else
1369           C = VRegAndVal->Value.getSExtValue();
1370       }
1371       break;
1372     }
1373     case TargetOpcode::G_ASHR:
1374     case TargetOpcode::G_LSHR:
1375     case TargetOpcode::G_SHL: {
1376       TestReg = MI->getOperand(1).getReg();
1377       auto VRegAndVal =
1378           getIConstantVRegValWithLookThrough(MI->getOperand(2).getReg(), MRI);
1379       if (VRegAndVal)
1380         C = VRegAndVal->Value.getSExtValue();
1381       break;
1382     }
1383     }
1384 
1385     // Didn't find a constant or viable register. Bail out of the loop.
1386     if (!C || !TestReg.isValid())
1387       break;
1388 
1389     // We found a suitable instruction with a constant. Check to see if we can
1390     // walk through the instruction.
1391     Register NextReg;
1392     unsigned TestRegSize = MRI.getType(TestReg).getSizeInBits();
1393     switch (Opc) {
1394     default:
1395       break;
1396     case TargetOpcode::G_AND:
1397       // (tbz (and x, m), b) -> (tbz x, b) when the b-th bit of m is set.
1398       if ((*C >> Bit) & 1)
1399         NextReg = TestReg;
1400       break;
1401     case TargetOpcode::G_SHL:
1402       // (tbz (shl x, c), b) -> (tbz x, b-c) when b-c is positive and fits in
1403       // the type of the register.
1404       if (*C <= Bit && (Bit - *C) < TestRegSize) {
1405         NextReg = TestReg;
1406         Bit = Bit - *C;
1407       }
1408       break;
1409     case TargetOpcode::G_ASHR:
1410       // (tbz (ashr x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits
1411       // in x
1412       NextReg = TestReg;
1413       Bit = Bit + *C;
1414       if (Bit >= TestRegSize)
1415         Bit = TestRegSize - 1;
1416       break;
1417     case TargetOpcode::G_LSHR:
1418       // (tbz (lshr x, c), b) -> (tbz x, b+c) when b + c is < # bits in x
1419       if ((Bit + *C) < TestRegSize) {
1420         NextReg = TestReg;
1421         Bit = Bit + *C;
1422       }
1423       break;
1424     case TargetOpcode::G_XOR:
1425       // We can walk through a G_XOR by inverting whether we use tbz/tbnz when
1426       // appropriate.
1427       //
1428       // e.g. If x' = xor x, c, and the b-th bit is set in c then
1429       //
1430       // tbz x', b -> tbnz x, b
1431       //
1432       // Because x' only has the b-th bit set if x does not.
1433       if ((*C >> Bit) & 1)
1434         Invert = !Invert;
1435       NextReg = TestReg;
1436       break;
1437     }
1438 
1439     // Check if we found anything worth folding.
1440     if (!NextReg.isValid())
1441       return Reg;
1442     Reg = NextReg;
1443   }
1444 
1445   return Reg;
1446 }
1447 
emitTestBit(Register TestReg,uint64_t Bit,bool IsNegative,MachineBasicBlock * DstMBB,MachineIRBuilder & MIB) const1448 MachineInstr *AArch64InstructionSelector::emitTestBit(
1449     Register TestReg, uint64_t Bit, bool IsNegative, MachineBasicBlock *DstMBB,
1450     MachineIRBuilder &MIB) const {
1451   assert(TestReg.isValid());
1452   assert(ProduceNonFlagSettingCondBr &&
1453          "Cannot emit TB(N)Z with speculation tracking!");
1454   MachineRegisterInfo &MRI = *MIB.getMRI();
1455 
1456   // Attempt to optimize the test bit by walking over instructions.
1457   TestReg = getTestBitReg(TestReg, Bit, IsNegative, MRI);
1458   LLT Ty = MRI.getType(TestReg);
1459   unsigned Size = Ty.getSizeInBits();
1460   assert(!Ty.isVector() && "Expected a scalar!");
1461   assert(Bit < 64 && "Bit is too large!");
1462 
1463   // When the test register is a 64-bit register, we have to narrow to make
1464   // TBNZW work.
1465   bool UseWReg = Bit < 32;
1466   unsigned NecessarySize = UseWReg ? 32 : 64;
1467   if (Size != NecessarySize)
1468     TestReg = moveScalarRegClass(
1469         TestReg, UseWReg ? AArch64::GPR32RegClass : AArch64::GPR64RegClass,
1470         MIB);
1471 
1472   static const unsigned OpcTable[2][2] = {{AArch64::TBZX, AArch64::TBNZX},
1473                                           {AArch64::TBZW, AArch64::TBNZW}};
1474   unsigned Opc = OpcTable[UseWReg][IsNegative];
1475   auto TestBitMI =
1476       MIB.buildInstr(Opc).addReg(TestReg).addImm(Bit).addMBB(DstMBB);
1477   constrainSelectedInstRegOperands(*TestBitMI, TII, TRI, RBI);
1478   return &*TestBitMI;
1479 }
1480 
tryOptAndIntoCompareBranch(MachineInstr & AndInst,bool Invert,MachineBasicBlock * DstMBB,MachineIRBuilder & MIB) const1481 bool AArch64InstructionSelector::tryOptAndIntoCompareBranch(
1482     MachineInstr &AndInst, bool Invert, MachineBasicBlock *DstMBB,
1483     MachineIRBuilder &MIB) const {
1484   assert(AndInst.getOpcode() == TargetOpcode::G_AND && "Expected G_AND only?");
1485   // Given something like this:
1486   //
1487   //  %x = ...Something...
1488   //  %one = G_CONSTANT i64 1
1489   //  %zero = G_CONSTANT i64 0
1490   //  %and = G_AND %x, %one
1491   //  %cmp = G_ICMP intpred(ne), %and, %zero
1492   //  %cmp_trunc = G_TRUNC %cmp
1493   //  G_BRCOND %cmp_trunc, %bb.3
1494   //
1495   // We want to try and fold the AND into the G_BRCOND and produce either a
1496   // TBNZ (when we have intpred(ne)) or a TBZ (when we have intpred(eq)).
1497   //
1498   // In this case, we'd get
1499   //
1500   // TBNZ %x %bb.3
1501   //
1502 
1503   // Check if the AND has a constant on its RHS which we can use as a mask.
1504   // If it's a power of 2, then it's the same as checking a specific bit.
1505   // (e.g, ANDing with 8 == ANDing with 000...100 == testing if bit 3 is set)
1506   auto MaybeBit = getIConstantVRegValWithLookThrough(
1507       AndInst.getOperand(2).getReg(), *MIB.getMRI());
1508   if (!MaybeBit)
1509     return false;
1510 
1511   int32_t Bit = MaybeBit->Value.exactLogBase2();
1512   if (Bit < 0)
1513     return false;
1514 
1515   Register TestReg = AndInst.getOperand(1).getReg();
1516 
1517   // Emit a TB(N)Z.
1518   emitTestBit(TestReg, Bit, Invert, DstMBB, MIB);
1519   return true;
1520 }
1521 
emitCBZ(Register CompareReg,bool IsNegative,MachineBasicBlock * DestMBB,MachineIRBuilder & MIB) const1522 MachineInstr *AArch64InstructionSelector::emitCBZ(Register CompareReg,
1523                                                   bool IsNegative,
1524                                                   MachineBasicBlock *DestMBB,
1525                                                   MachineIRBuilder &MIB) const {
1526   assert(ProduceNonFlagSettingCondBr && "CBZ does not set flags!");
1527   MachineRegisterInfo &MRI = *MIB.getMRI();
1528   assert(RBI.getRegBank(CompareReg, MRI, TRI)->getID() ==
1529              AArch64::GPRRegBankID &&
1530          "Expected GPRs only?");
1531   auto Ty = MRI.getType(CompareReg);
1532   unsigned Width = Ty.getSizeInBits();
1533   assert(!Ty.isVector() && "Expected scalar only?");
1534   assert(Width <= 64 && "Expected width to be at most 64?");
1535   static const unsigned OpcTable[2][2] = {{AArch64::CBZW, AArch64::CBZX},
1536                                           {AArch64::CBNZW, AArch64::CBNZX}};
1537   unsigned Opc = OpcTable[IsNegative][Width == 64];
1538   auto BranchMI = MIB.buildInstr(Opc, {}, {CompareReg}).addMBB(DestMBB);
1539   constrainSelectedInstRegOperands(*BranchMI, TII, TRI, RBI);
1540   return &*BranchMI;
1541 }
1542 
selectCompareBranchFedByFCmp(MachineInstr & I,MachineInstr & FCmp,MachineIRBuilder & MIB) const1543 bool AArch64InstructionSelector::selectCompareBranchFedByFCmp(
1544     MachineInstr &I, MachineInstr &FCmp, MachineIRBuilder &MIB) const {
1545   assert(FCmp.getOpcode() == TargetOpcode::G_FCMP);
1546   assert(I.getOpcode() == TargetOpcode::G_BRCOND);
1547   // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
1548   // totally clean.  Some of them require two branches to implement.
1549   auto Pred = (CmpInst::Predicate)FCmp.getOperand(1).getPredicate();
1550   emitFPCompare(FCmp.getOperand(2).getReg(), FCmp.getOperand(3).getReg(), MIB,
1551                 Pred);
1552   AArch64CC::CondCode CC1, CC2;
1553   changeFCMPPredToAArch64CC(static_cast<CmpInst::Predicate>(Pred), CC1, CC2);
1554   MachineBasicBlock *DestMBB = I.getOperand(1).getMBB();
1555   MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC1).addMBB(DestMBB);
1556   if (CC2 != AArch64CC::AL)
1557     MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC2).addMBB(DestMBB);
1558   I.eraseFromParent();
1559   return true;
1560 }
1561 
tryOptCompareBranchFedByICmp(MachineInstr & I,MachineInstr & ICmp,MachineIRBuilder & MIB) const1562 bool AArch64InstructionSelector::tryOptCompareBranchFedByICmp(
1563     MachineInstr &I, MachineInstr &ICmp, MachineIRBuilder &MIB) const {
1564   assert(ICmp.getOpcode() == TargetOpcode::G_ICMP);
1565   assert(I.getOpcode() == TargetOpcode::G_BRCOND);
1566   // Attempt to optimize the G_BRCOND + G_ICMP into a TB(N)Z/CB(N)Z.
1567   //
1568   // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z
1569   // instructions will not be produced, as they are conditional branch
1570   // instructions that do not set flags.
1571   if (!ProduceNonFlagSettingCondBr)
1572     return false;
1573 
1574   MachineRegisterInfo &MRI = *MIB.getMRI();
1575   MachineBasicBlock *DestMBB = I.getOperand(1).getMBB();
1576   auto Pred =
1577       static_cast<CmpInst::Predicate>(ICmp.getOperand(1).getPredicate());
1578   Register LHS = ICmp.getOperand(2).getReg();
1579   Register RHS = ICmp.getOperand(3).getReg();
1580 
1581   // We're allowed to emit a TB(N)Z/CB(N)Z. Try to do that.
1582   auto VRegAndVal = getIConstantVRegValWithLookThrough(RHS, MRI);
1583   MachineInstr *AndInst = getOpcodeDef(TargetOpcode::G_AND, LHS, MRI);
1584 
1585   // When we can emit a TB(N)Z, prefer that.
1586   //
1587   // Handle non-commutative condition codes first.
1588   // Note that we don't want to do this when we have a G_AND because it can
1589   // become a tst. The tst will make the test bit in the TB(N)Z redundant.
1590   if (VRegAndVal && !AndInst) {
1591     int64_t C = VRegAndVal->Value.getSExtValue();
1592 
1593     // When we have a greater-than comparison, we can just test if the msb is
1594     // zero.
1595     if (C == -1 && Pred == CmpInst::ICMP_SGT) {
1596       uint64_t Bit = MRI.getType(LHS).getSizeInBits() - 1;
1597       emitTestBit(LHS, Bit, /*IsNegative = */ false, DestMBB, MIB);
1598       I.eraseFromParent();
1599       return true;
1600     }
1601 
1602     // When we have a less than comparison, we can just test if the msb is not
1603     // zero.
1604     if (C == 0 && Pred == CmpInst::ICMP_SLT) {
1605       uint64_t Bit = MRI.getType(LHS).getSizeInBits() - 1;
1606       emitTestBit(LHS, Bit, /*IsNegative = */ true, DestMBB, MIB);
1607       I.eraseFromParent();
1608       return true;
1609     }
1610   }
1611 
1612   // Attempt to handle commutative condition codes. Right now, that's only
1613   // eq/ne.
1614   if (ICmpInst::isEquality(Pred)) {
1615     if (!VRegAndVal) {
1616       std::swap(RHS, LHS);
1617       VRegAndVal = getIConstantVRegValWithLookThrough(RHS, MRI);
1618       AndInst = getOpcodeDef(TargetOpcode::G_AND, LHS, MRI);
1619     }
1620 
1621     if (VRegAndVal && VRegAndVal->Value == 0) {
1622       // If there's a G_AND feeding into this branch, try to fold it away by
1623       // emitting a TB(N)Z instead.
1624       //
1625       // Note: If we have LT, then it *is* possible to fold, but it wouldn't be
1626       // beneficial. When we have an AND and LT, we need a TST/ANDS, so folding
1627       // would be redundant.
1628       if (AndInst &&
1629           tryOptAndIntoCompareBranch(
1630               *AndInst, /*Invert = */ Pred == CmpInst::ICMP_NE, DestMBB, MIB)) {
1631         I.eraseFromParent();
1632         return true;
1633       }
1634 
1635       // Otherwise, try to emit a CB(N)Z instead.
1636       auto LHSTy = MRI.getType(LHS);
1637       if (!LHSTy.isVector() && LHSTy.getSizeInBits() <= 64) {
1638         emitCBZ(LHS, /*IsNegative = */ Pred == CmpInst::ICMP_NE, DestMBB, MIB);
1639         I.eraseFromParent();
1640         return true;
1641       }
1642     }
1643   }
1644 
1645   return false;
1646 }
1647 
selectCompareBranchFedByICmp(MachineInstr & I,MachineInstr & ICmp,MachineIRBuilder & MIB) const1648 bool AArch64InstructionSelector::selectCompareBranchFedByICmp(
1649     MachineInstr &I, MachineInstr &ICmp, MachineIRBuilder &MIB) const {
1650   assert(ICmp.getOpcode() == TargetOpcode::G_ICMP);
1651   assert(I.getOpcode() == TargetOpcode::G_BRCOND);
1652   if (tryOptCompareBranchFedByICmp(I, ICmp, MIB))
1653     return true;
1654 
1655   // Couldn't optimize. Emit a compare + a Bcc.
1656   MachineBasicBlock *DestMBB = I.getOperand(1).getMBB();
1657   auto PredOp = ICmp.getOperand(1);
1658   emitIntegerCompare(ICmp.getOperand(2), ICmp.getOperand(3), PredOp, MIB);
1659   const AArch64CC::CondCode CC = changeICMPPredToAArch64CC(
1660       static_cast<CmpInst::Predicate>(PredOp.getPredicate()));
1661   MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC).addMBB(DestMBB);
1662   I.eraseFromParent();
1663   return true;
1664 }
1665 
selectCompareBranch(MachineInstr & I,MachineFunction & MF,MachineRegisterInfo & MRI)1666 bool AArch64InstructionSelector::selectCompareBranch(
1667     MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) {
1668   Register CondReg = I.getOperand(0).getReg();
1669   MachineInstr *CCMI = MRI.getVRegDef(CondReg);
1670   if (CCMI->getOpcode() == TargetOpcode::G_TRUNC) {
1671     CondReg = CCMI->getOperand(1).getReg();
1672     CCMI = MRI.getVRegDef(CondReg);
1673   }
1674 
1675   // Try to select the G_BRCOND using whatever is feeding the condition if
1676   // possible.
1677   unsigned CCMIOpc = CCMI->getOpcode();
1678   if (CCMIOpc == TargetOpcode::G_FCMP)
1679     return selectCompareBranchFedByFCmp(I, *CCMI, MIB);
1680   if (CCMIOpc == TargetOpcode::G_ICMP)
1681     return selectCompareBranchFedByICmp(I, *CCMI, MIB);
1682 
1683   // Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z
1684   // instructions will not be produced, as they are conditional branch
1685   // instructions that do not set flags.
1686   if (ProduceNonFlagSettingCondBr) {
1687     emitTestBit(CondReg, /*Bit = */ 0, /*IsNegative = */ true,
1688                 I.getOperand(1).getMBB(), MIB);
1689     I.eraseFromParent();
1690     return true;
1691   }
1692 
1693   // Can't emit TB(N)Z/CB(N)Z. Emit a tst + bcc instead.
1694   auto TstMI =
1695       MIB.buildInstr(AArch64::ANDSWri, {LLT::scalar(32)}, {CondReg}).addImm(1);
1696   constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI);
1697   auto Bcc = MIB.buildInstr(AArch64::Bcc)
1698                  .addImm(AArch64CC::EQ)
1699                  .addMBB(I.getOperand(1).getMBB());
1700   I.eraseFromParent();
1701   return constrainSelectedInstRegOperands(*Bcc, TII, TRI, RBI);
1702 }
1703 
1704 /// Returns the element immediate value of a vector shift operand if found.
1705 /// This needs to detect a splat-like operation, e.g. a G_BUILD_VECTOR.
getVectorShiftImm(Register Reg,MachineRegisterInfo & MRI)1706 static Optional<int64_t> getVectorShiftImm(Register Reg,
1707                                            MachineRegisterInfo &MRI) {
1708   assert(MRI.getType(Reg).isVector() && "Expected a *vector* shift operand");
1709   MachineInstr *OpMI = MRI.getVRegDef(Reg);
1710   assert(OpMI && "Expected to find a vreg def for vector shift operand");
1711   return getAArch64VectorSplatScalar(*OpMI, MRI);
1712 }
1713 
1714 /// Matches and returns the shift immediate value for a SHL instruction given
1715 /// a shift operand.
getVectorSHLImm(LLT SrcTy,Register Reg,MachineRegisterInfo & MRI)1716 static Optional<int64_t> getVectorSHLImm(LLT SrcTy, Register Reg, MachineRegisterInfo &MRI) {
1717   Optional<int64_t> ShiftImm = getVectorShiftImm(Reg, MRI);
1718   if (!ShiftImm)
1719     return None;
1720   // Check the immediate is in range for a SHL.
1721   int64_t Imm = *ShiftImm;
1722   if (Imm < 0)
1723     return None;
1724   switch (SrcTy.getElementType().getSizeInBits()) {
1725   default:
1726     LLVM_DEBUG(dbgs() << "Unhandled element type for vector shift");
1727     return None;
1728   case 8:
1729     if (Imm > 7)
1730       return None;
1731     break;
1732   case 16:
1733     if (Imm > 15)
1734       return None;
1735     break;
1736   case 32:
1737     if (Imm > 31)
1738       return None;
1739     break;
1740   case 64:
1741     if (Imm > 63)
1742       return None;
1743     break;
1744   }
1745   return Imm;
1746 }
1747 
selectVectorSHL(MachineInstr & I,MachineRegisterInfo & MRI)1748 bool AArch64InstructionSelector::selectVectorSHL(MachineInstr &I,
1749                                                  MachineRegisterInfo &MRI) {
1750   assert(I.getOpcode() == TargetOpcode::G_SHL);
1751   Register DstReg = I.getOperand(0).getReg();
1752   const LLT Ty = MRI.getType(DstReg);
1753   Register Src1Reg = I.getOperand(1).getReg();
1754   Register Src2Reg = I.getOperand(2).getReg();
1755 
1756   if (!Ty.isVector())
1757     return false;
1758 
1759   // Check if we have a vector of constants on RHS that we can select as the
1760   // immediate form.
1761   Optional<int64_t> ImmVal = getVectorSHLImm(Ty, Src2Reg, MRI);
1762 
1763   unsigned Opc = 0;
1764   if (Ty == LLT::fixed_vector(2, 64)) {
1765     Opc = ImmVal ? AArch64::SHLv2i64_shift : AArch64::USHLv2i64;
1766   } else if (Ty == LLT::fixed_vector(4, 32)) {
1767     Opc = ImmVal ? AArch64::SHLv4i32_shift : AArch64::USHLv4i32;
1768   } else if (Ty == LLT::fixed_vector(2, 32)) {
1769     Opc = ImmVal ? AArch64::SHLv2i32_shift : AArch64::USHLv2i32;
1770   } else if (Ty == LLT::fixed_vector(4, 16)) {
1771     Opc = ImmVal ? AArch64::SHLv4i16_shift : AArch64::USHLv4i16;
1772   } else if (Ty == LLT::fixed_vector(8, 16)) {
1773     Opc = ImmVal ? AArch64::SHLv8i16_shift : AArch64::USHLv8i16;
1774   } else if (Ty == LLT::fixed_vector(16, 8)) {
1775     Opc = ImmVal ? AArch64::SHLv16i8_shift : AArch64::USHLv16i8;
1776   } else if (Ty == LLT::fixed_vector(8, 8)) {
1777     Opc = ImmVal ? AArch64::SHLv8i8_shift : AArch64::USHLv8i8;
1778   } else {
1779     LLVM_DEBUG(dbgs() << "Unhandled G_SHL type");
1780     return false;
1781   }
1782 
1783   auto Shl = MIB.buildInstr(Opc, {DstReg}, {Src1Reg});
1784   if (ImmVal)
1785     Shl.addImm(*ImmVal);
1786   else
1787     Shl.addUse(Src2Reg);
1788   constrainSelectedInstRegOperands(*Shl, TII, TRI, RBI);
1789   I.eraseFromParent();
1790   return true;
1791 }
1792 
selectVectorAshrLshr(MachineInstr & I,MachineRegisterInfo & MRI)1793 bool AArch64InstructionSelector::selectVectorAshrLshr(
1794     MachineInstr &I, MachineRegisterInfo &MRI) {
1795   assert(I.getOpcode() == TargetOpcode::G_ASHR ||
1796          I.getOpcode() == TargetOpcode::G_LSHR);
1797   Register DstReg = I.getOperand(0).getReg();
1798   const LLT Ty = MRI.getType(DstReg);
1799   Register Src1Reg = I.getOperand(1).getReg();
1800   Register Src2Reg = I.getOperand(2).getReg();
1801 
1802   if (!Ty.isVector())
1803     return false;
1804 
1805   bool IsASHR = I.getOpcode() == TargetOpcode::G_ASHR;
1806 
1807   // We expect the immediate case to be lowered in the PostLegalCombiner to
1808   // AArch64ISD::VASHR or AArch64ISD::VLSHR equivalents.
1809 
1810   // There is not a shift right register instruction, but the shift left
1811   // register instruction takes a signed value, where negative numbers specify a
1812   // right shift.
1813 
1814   unsigned Opc = 0;
1815   unsigned NegOpc = 0;
1816   const TargetRegisterClass *RC =
1817       getRegClassForTypeOnBank(Ty, RBI.getRegBank(AArch64::FPRRegBankID), RBI);
1818   if (Ty == LLT::fixed_vector(2, 64)) {
1819     Opc = IsASHR ? AArch64::SSHLv2i64 : AArch64::USHLv2i64;
1820     NegOpc = AArch64::NEGv2i64;
1821   } else if (Ty == LLT::fixed_vector(4, 32)) {
1822     Opc = IsASHR ? AArch64::SSHLv4i32 : AArch64::USHLv4i32;
1823     NegOpc = AArch64::NEGv4i32;
1824   } else if (Ty == LLT::fixed_vector(2, 32)) {
1825     Opc = IsASHR ? AArch64::SSHLv2i32 : AArch64::USHLv2i32;
1826     NegOpc = AArch64::NEGv2i32;
1827   } else if (Ty == LLT::fixed_vector(4, 16)) {
1828     Opc = IsASHR ? AArch64::SSHLv4i16 : AArch64::USHLv4i16;
1829     NegOpc = AArch64::NEGv4i16;
1830   } else if (Ty == LLT::fixed_vector(8, 16)) {
1831     Opc = IsASHR ? AArch64::SSHLv8i16 : AArch64::USHLv8i16;
1832     NegOpc = AArch64::NEGv8i16;
1833   } else if (Ty == LLT::fixed_vector(16, 8)) {
1834     Opc = IsASHR ? AArch64::SSHLv16i8 : AArch64::USHLv16i8;
1835     NegOpc = AArch64::NEGv16i8;
1836   } else if (Ty == LLT::fixed_vector(8, 8)) {
1837     Opc = IsASHR ? AArch64::SSHLv8i8 : AArch64::USHLv8i8;
1838     NegOpc = AArch64::NEGv8i8;
1839   } else {
1840     LLVM_DEBUG(dbgs() << "Unhandled G_ASHR type");
1841     return false;
1842   }
1843 
1844   auto Neg = MIB.buildInstr(NegOpc, {RC}, {Src2Reg});
1845   constrainSelectedInstRegOperands(*Neg, TII, TRI, RBI);
1846   auto SShl = MIB.buildInstr(Opc, {DstReg}, {Src1Reg, Neg});
1847   constrainSelectedInstRegOperands(*SShl, TII, TRI, RBI);
1848   I.eraseFromParent();
1849   return true;
1850 }
1851 
selectVaStartAAPCS(MachineInstr & I,MachineFunction & MF,MachineRegisterInfo & MRI) const1852 bool AArch64InstructionSelector::selectVaStartAAPCS(
1853     MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const {
1854   return false;
1855 }
1856 
selectVaStartDarwin(MachineInstr & I,MachineFunction & MF,MachineRegisterInfo & MRI) const1857 bool AArch64InstructionSelector::selectVaStartDarwin(
1858     MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const {
1859   AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
1860   Register ListReg = I.getOperand(0).getReg();
1861 
1862   Register ArgsAddrReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
1863 
1864   auto MIB =
1865       BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::ADDXri))
1866           .addDef(ArgsAddrReg)
1867           .addFrameIndex(FuncInfo->getVarArgsStackIndex())
1868           .addImm(0)
1869           .addImm(0);
1870 
1871   constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1872 
1873   MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::STRXui))
1874             .addUse(ArgsAddrReg)
1875             .addUse(ListReg)
1876             .addImm(0)
1877             .addMemOperand(*I.memoperands_begin());
1878 
1879   constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
1880   I.eraseFromParent();
1881   return true;
1882 }
1883 
materializeLargeCMVal(MachineInstr & I,const Value * V,unsigned OpFlags)1884 void AArch64InstructionSelector::materializeLargeCMVal(
1885     MachineInstr &I, const Value *V, unsigned OpFlags) {
1886   MachineBasicBlock &MBB = *I.getParent();
1887   MachineFunction &MF = *MBB.getParent();
1888   MachineRegisterInfo &MRI = MF.getRegInfo();
1889 
1890   auto MovZ = MIB.buildInstr(AArch64::MOVZXi, {&AArch64::GPR64RegClass}, {});
1891   MovZ->addOperand(MF, I.getOperand(1));
1892   MovZ->getOperand(1).setTargetFlags(OpFlags | AArch64II::MO_G0 |
1893                                      AArch64II::MO_NC);
1894   MovZ->addOperand(MF, MachineOperand::CreateImm(0));
1895   constrainSelectedInstRegOperands(*MovZ, TII, TRI, RBI);
1896 
1897   auto BuildMovK = [&](Register SrcReg, unsigned char Flags, unsigned Offset,
1898                        Register ForceDstReg) {
1899     Register DstReg = ForceDstReg
1900                           ? ForceDstReg
1901                           : MRI.createVirtualRegister(&AArch64::GPR64RegClass);
1902     auto MovI = MIB.buildInstr(AArch64::MOVKXi).addDef(DstReg).addUse(SrcReg);
1903     if (auto *GV = dyn_cast<GlobalValue>(V)) {
1904       MovI->addOperand(MF, MachineOperand::CreateGA(
1905                                GV, MovZ->getOperand(1).getOffset(), Flags));
1906     } else {
1907       MovI->addOperand(
1908           MF, MachineOperand::CreateBA(cast<BlockAddress>(V),
1909                                        MovZ->getOperand(1).getOffset(), Flags));
1910     }
1911     MovI->addOperand(MF, MachineOperand::CreateImm(Offset));
1912     constrainSelectedInstRegOperands(*MovI, TII, TRI, RBI);
1913     return DstReg;
1914   };
1915   Register DstReg = BuildMovK(MovZ.getReg(0),
1916                               AArch64II::MO_G1 | AArch64II::MO_NC, 16, 0);
1917   DstReg = BuildMovK(DstReg, AArch64II::MO_G2 | AArch64II::MO_NC, 32, 0);
1918   BuildMovK(DstReg, AArch64II::MO_G3, 48, I.getOperand(0).getReg());
1919 }
1920 
preISelLower(MachineInstr & I)1921 bool AArch64InstructionSelector::preISelLower(MachineInstr &I) {
1922   MachineBasicBlock &MBB = *I.getParent();
1923   MachineFunction &MF = *MBB.getParent();
1924   MachineRegisterInfo &MRI = MF.getRegInfo();
1925 
1926   switch (I.getOpcode()) {
1927   case TargetOpcode::G_SHL:
1928   case TargetOpcode::G_ASHR:
1929   case TargetOpcode::G_LSHR: {
1930     // These shifts are legalized to have 64 bit shift amounts because we want
1931     // to take advantage of the existing imported selection patterns that assume
1932     // the immediates are s64s. However, if the shifted type is 32 bits and for
1933     // some reason we receive input GMIR that has an s64 shift amount that's not
1934     // a G_CONSTANT, insert a truncate so that we can still select the s32
1935     // register-register variant.
1936     Register SrcReg = I.getOperand(1).getReg();
1937     Register ShiftReg = I.getOperand(2).getReg();
1938     const LLT ShiftTy = MRI.getType(ShiftReg);
1939     const LLT SrcTy = MRI.getType(SrcReg);
1940     if (SrcTy.isVector())
1941       return false;
1942     assert(!ShiftTy.isVector() && "unexpected vector shift ty");
1943     if (SrcTy.getSizeInBits() != 32 || ShiftTy.getSizeInBits() != 64)
1944       return false;
1945     auto *AmtMI = MRI.getVRegDef(ShiftReg);
1946     assert(AmtMI && "could not find a vreg definition for shift amount");
1947     if (AmtMI->getOpcode() != TargetOpcode::G_CONSTANT) {
1948       // Insert a subregister copy to implement a 64->32 trunc
1949       auto Trunc = MIB.buildInstr(TargetOpcode::COPY, {SrcTy}, {})
1950                        .addReg(ShiftReg, 0, AArch64::sub_32);
1951       MRI.setRegBank(Trunc.getReg(0), RBI.getRegBank(AArch64::GPRRegBankID));
1952       I.getOperand(2).setReg(Trunc.getReg(0));
1953     }
1954     return true;
1955   }
1956   case TargetOpcode::G_STORE: {
1957     bool Changed = contractCrossBankCopyIntoStore(I, MRI);
1958     MachineOperand &SrcOp = I.getOperand(0);
1959     if (MRI.getType(SrcOp.getReg()).isPointer()) {
1960       // Allow matching with imported patterns for stores of pointers. Unlike
1961       // G_LOAD/G_PTR_ADD, we may not have selected all users. So, emit a copy
1962       // and constrain.
1963       auto Copy = MIB.buildCopy(LLT::scalar(64), SrcOp);
1964       Register NewSrc = Copy.getReg(0);
1965       SrcOp.setReg(NewSrc);
1966       RBI.constrainGenericRegister(NewSrc, AArch64::GPR64RegClass, MRI);
1967       Changed = true;
1968     }
1969     return Changed;
1970   }
1971   case TargetOpcode::G_PTR_ADD:
1972     return convertPtrAddToAdd(I, MRI);
1973   case TargetOpcode::G_LOAD: {
1974     // For scalar loads of pointers, we try to convert the dest type from p0
1975     // to s64 so that our imported patterns can match. Like with the G_PTR_ADD
1976     // conversion, this should be ok because all users should have been
1977     // selected already, so the type doesn't matter for them.
1978     Register DstReg = I.getOperand(0).getReg();
1979     const LLT DstTy = MRI.getType(DstReg);
1980     if (!DstTy.isPointer())
1981       return false;
1982     MRI.setType(DstReg, LLT::scalar(64));
1983     return true;
1984   }
1985   case AArch64::G_DUP: {
1986     // Convert the type from p0 to s64 to help selection.
1987     LLT DstTy = MRI.getType(I.getOperand(0).getReg());
1988     if (!DstTy.getElementType().isPointer())
1989       return false;
1990     auto NewSrc = MIB.buildCopy(LLT::scalar(64), I.getOperand(1).getReg());
1991     MRI.setType(I.getOperand(0).getReg(),
1992                 DstTy.changeElementType(LLT::scalar(64)));
1993     MRI.setRegClass(NewSrc.getReg(0), &AArch64::GPR64RegClass);
1994     I.getOperand(1).setReg(NewSrc.getReg(0));
1995     return true;
1996   }
1997   case TargetOpcode::G_UITOFP:
1998   case TargetOpcode::G_SITOFP: {
1999     // If both source and destination regbanks are FPR, then convert the opcode
2000     // to G_SITOF so that the importer can select it to an fpr variant.
2001     // Otherwise, it ends up matching an fpr/gpr variant and adding a cross-bank
2002     // copy.
2003     Register SrcReg = I.getOperand(1).getReg();
2004     LLT SrcTy = MRI.getType(SrcReg);
2005     LLT DstTy = MRI.getType(I.getOperand(0).getReg());
2006     if (SrcTy.isVector() || SrcTy.getSizeInBits() != DstTy.getSizeInBits())
2007       return false;
2008 
2009     if (RBI.getRegBank(SrcReg, MRI, TRI)->getID() == AArch64::FPRRegBankID) {
2010       if (I.getOpcode() == TargetOpcode::G_SITOFP)
2011         I.setDesc(TII.get(AArch64::G_SITOF));
2012       else
2013         I.setDesc(TII.get(AArch64::G_UITOF));
2014       return true;
2015     }
2016     return false;
2017   }
2018   default:
2019     return false;
2020   }
2021 }
2022 
2023 /// This lowering tries to look for G_PTR_ADD instructions and then converts
2024 /// them to a standard G_ADD with a COPY on the source.
2025 ///
2026 /// The motivation behind this is to expose the add semantics to the imported
2027 /// tablegen patterns. We shouldn't need to check for uses being loads/stores,
2028 /// because the selector works bottom up, uses before defs. By the time we
2029 /// end up trying to select a G_PTR_ADD, we should have already attempted to
2030 /// fold this into addressing modes and were therefore unsuccessful.
convertPtrAddToAdd(MachineInstr & I,MachineRegisterInfo & MRI)2031 bool AArch64InstructionSelector::convertPtrAddToAdd(
2032     MachineInstr &I, MachineRegisterInfo &MRI) {
2033   assert(I.getOpcode() == TargetOpcode::G_PTR_ADD && "Expected G_PTR_ADD");
2034   Register DstReg = I.getOperand(0).getReg();
2035   Register AddOp1Reg = I.getOperand(1).getReg();
2036   const LLT PtrTy = MRI.getType(DstReg);
2037   if (PtrTy.getAddressSpace() != 0)
2038     return false;
2039 
2040   const LLT CastPtrTy =
2041       PtrTy.isVector() ? LLT::fixed_vector(2, 64) : LLT::scalar(64);
2042   auto PtrToInt = MIB.buildPtrToInt(CastPtrTy, AddOp1Reg);
2043   // Set regbanks on the registers.
2044   if (PtrTy.isVector())
2045     MRI.setRegBank(PtrToInt.getReg(0), RBI.getRegBank(AArch64::FPRRegBankID));
2046   else
2047     MRI.setRegBank(PtrToInt.getReg(0), RBI.getRegBank(AArch64::GPRRegBankID));
2048 
2049   // Now turn the %dst(p0) = G_PTR_ADD %base, off into:
2050   // %dst(intty) = G_ADD %intbase, off
2051   I.setDesc(TII.get(TargetOpcode::G_ADD));
2052   MRI.setType(DstReg, CastPtrTy);
2053   I.getOperand(1).setReg(PtrToInt.getReg(0));
2054   if (!select(*PtrToInt)) {
2055     LLVM_DEBUG(dbgs() << "Failed to select G_PTRTOINT in convertPtrAddToAdd");
2056     return false;
2057   }
2058 
2059   // Also take the opportunity here to try to do some optimization.
2060   // Try to convert this into a G_SUB if the offset is a 0-x negate idiom.
2061   Register NegatedReg;
2062   if (!mi_match(I.getOperand(2).getReg(), MRI, m_Neg(m_Reg(NegatedReg))))
2063     return true;
2064   I.getOperand(2).setReg(NegatedReg);
2065   I.setDesc(TII.get(TargetOpcode::G_SUB));
2066   return true;
2067 }
2068 
earlySelectSHL(MachineInstr & I,MachineRegisterInfo & MRI)2069 bool AArch64InstructionSelector::earlySelectSHL(MachineInstr &I,
2070                                                 MachineRegisterInfo &MRI) {
2071   // We try to match the immediate variant of LSL, which is actually an alias
2072   // for a special case of UBFM. Otherwise, we fall back to the imported
2073   // selector which will match the register variant.
2074   assert(I.getOpcode() == TargetOpcode::G_SHL && "unexpected op");
2075   const auto &MO = I.getOperand(2);
2076   auto VRegAndVal = getIConstantVRegVal(MO.getReg(), MRI);
2077   if (!VRegAndVal)
2078     return false;
2079 
2080   const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
2081   if (DstTy.isVector())
2082     return false;
2083   bool Is64Bit = DstTy.getSizeInBits() == 64;
2084   auto Imm1Fn = Is64Bit ? selectShiftA_64(MO) : selectShiftA_32(MO);
2085   auto Imm2Fn = Is64Bit ? selectShiftB_64(MO) : selectShiftB_32(MO);
2086 
2087   if (!Imm1Fn || !Imm2Fn)
2088     return false;
2089 
2090   auto NewI =
2091       MIB.buildInstr(Is64Bit ? AArch64::UBFMXri : AArch64::UBFMWri,
2092                      {I.getOperand(0).getReg()}, {I.getOperand(1).getReg()});
2093 
2094   for (auto &RenderFn : *Imm1Fn)
2095     RenderFn(NewI);
2096   for (auto &RenderFn : *Imm2Fn)
2097     RenderFn(NewI);
2098 
2099   I.eraseFromParent();
2100   return constrainSelectedInstRegOperands(*NewI, TII, TRI, RBI);
2101 }
2102 
contractCrossBankCopyIntoStore(MachineInstr & I,MachineRegisterInfo & MRI)2103 bool AArch64InstructionSelector::contractCrossBankCopyIntoStore(
2104     MachineInstr &I, MachineRegisterInfo &MRI) {
2105   assert(I.getOpcode() == TargetOpcode::G_STORE && "Expected G_STORE");
2106   // If we're storing a scalar, it doesn't matter what register bank that
2107   // scalar is on. All that matters is the size.
2108   //
2109   // So, if we see something like this (with a 32-bit scalar as an example):
2110   //
2111   // %x:gpr(s32) = ... something ...
2112   // %y:fpr(s32) = COPY %x:gpr(s32)
2113   // G_STORE %y:fpr(s32)
2114   //
2115   // We can fix this up into something like this:
2116   //
2117   // G_STORE %x:gpr(s32)
2118   //
2119   // And then continue the selection process normally.
2120   Register DefDstReg = getSrcRegIgnoringCopies(I.getOperand(0).getReg(), MRI);
2121   if (!DefDstReg.isValid())
2122     return false;
2123   LLT DefDstTy = MRI.getType(DefDstReg);
2124   Register StoreSrcReg = I.getOperand(0).getReg();
2125   LLT StoreSrcTy = MRI.getType(StoreSrcReg);
2126 
2127   // If we get something strange like a physical register, then we shouldn't
2128   // go any further.
2129   if (!DefDstTy.isValid())
2130     return false;
2131 
2132   // Are the source and dst types the same size?
2133   if (DefDstTy.getSizeInBits() != StoreSrcTy.getSizeInBits())
2134     return false;
2135 
2136   if (RBI.getRegBank(StoreSrcReg, MRI, TRI) ==
2137       RBI.getRegBank(DefDstReg, MRI, TRI))
2138     return false;
2139 
2140   // We have a cross-bank copy, which is entering a store. Let's fold it.
2141   I.getOperand(0).setReg(DefDstReg);
2142   return true;
2143 }
2144 
earlySelect(MachineInstr & I)2145 bool AArch64InstructionSelector::earlySelect(MachineInstr &I) {
2146   assert(I.getParent() && "Instruction should be in a basic block!");
2147   assert(I.getParent()->getParent() && "Instruction should be in a function!");
2148 
2149   MachineBasicBlock &MBB = *I.getParent();
2150   MachineFunction &MF = *MBB.getParent();
2151   MachineRegisterInfo &MRI = MF.getRegInfo();
2152 
2153   switch (I.getOpcode()) {
2154   case AArch64::G_DUP: {
2155     // Before selecting a DUP instruction, check if it is better selected as a
2156     // MOV or load from a constant pool.
2157     Register Src = I.getOperand(1).getReg();
2158     auto ValAndVReg = getIConstantVRegValWithLookThrough(Src, MRI);
2159     if (!ValAndVReg)
2160       return false;
2161     LLVMContext &Ctx = MF.getFunction().getContext();
2162     Register Dst = I.getOperand(0).getReg();
2163     auto *CV = ConstantDataVector::getSplat(
2164         MRI.getType(Dst).getNumElements(),
2165         ConstantInt::get(Type::getIntNTy(Ctx, MRI.getType(Src).getSizeInBits()),
2166                          ValAndVReg->Value));
2167     if (!emitConstantVector(Dst, CV, MIB, MRI))
2168       return false;
2169     I.eraseFromParent();
2170     return true;
2171   }
2172   case TargetOpcode::G_SEXT:
2173     // Check for i64 sext(i32 vector_extract) prior to tablegen to select SMOV
2174     // over a normal extend.
2175     if (selectUSMovFromExtend(I, MRI))
2176       return true;
2177     return false;
2178   case TargetOpcode::G_BR:
2179     return false;
2180   case TargetOpcode::G_SHL:
2181     return earlySelectSHL(I, MRI);
2182   case TargetOpcode::G_CONSTANT: {
2183     bool IsZero = false;
2184     if (I.getOperand(1).isCImm())
2185       IsZero = I.getOperand(1).getCImm()->getZExtValue() == 0;
2186     else if (I.getOperand(1).isImm())
2187       IsZero = I.getOperand(1).getImm() == 0;
2188 
2189     if (!IsZero)
2190       return false;
2191 
2192     Register DefReg = I.getOperand(0).getReg();
2193     LLT Ty = MRI.getType(DefReg);
2194     if (Ty.getSizeInBits() == 64) {
2195       I.getOperand(1).ChangeToRegister(AArch64::XZR, false);
2196       RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass, MRI);
2197     } else if (Ty.getSizeInBits() == 32) {
2198       I.getOperand(1).ChangeToRegister(AArch64::WZR, false);
2199       RBI.constrainGenericRegister(DefReg, AArch64::GPR32RegClass, MRI);
2200     } else
2201       return false;
2202 
2203     I.setDesc(TII.get(TargetOpcode::COPY));
2204     return true;
2205   }
2206 
2207   case TargetOpcode::G_ADD: {
2208     // Check if this is being fed by a G_ICMP on either side.
2209     //
2210     // (cmp pred, x, y) + z
2211     //
2212     // In the above case, when the cmp is true, we increment z by 1. So, we can
2213     // fold the add into the cset for the cmp by using cinc.
2214     //
2215     // FIXME: This would probably be a lot nicer in PostLegalizerLowering.
2216     Register X = I.getOperand(1).getReg();
2217 
2218     // Only handle scalars. Scalar G_ICMP is only legal for s32, so bail out
2219     // early if we see it.
2220     LLT Ty = MRI.getType(X);
2221     if (Ty.isVector() || Ty.getSizeInBits() != 32)
2222       return false;
2223 
2224     Register CmpReg = I.getOperand(2).getReg();
2225     MachineInstr *Cmp = getOpcodeDef(TargetOpcode::G_ICMP, CmpReg, MRI);
2226     if (!Cmp) {
2227       std::swap(X, CmpReg);
2228       Cmp = getOpcodeDef(TargetOpcode::G_ICMP, CmpReg, MRI);
2229       if (!Cmp)
2230         return false;
2231     }
2232     auto Pred =
2233         static_cast<CmpInst::Predicate>(Cmp->getOperand(1).getPredicate());
2234     emitIntegerCompare(Cmp->getOperand(2), Cmp->getOperand(3),
2235                        Cmp->getOperand(1), MIB);
2236     emitCSetForICMP(I.getOperand(0).getReg(), Pred, MIB, X);
2237     I.eraseFromParent();
2238     return true;
2239   }
2240   case TargetOpcode::G_OR: {
2241     // Look for operations that take the lower `Width=Size-ShiftImm` bits of
2242     // `ShiftSrc` and insert them into the upper `Width` bits of `MaskSrc` via
2243     // shifting and masking that we can replace with a BFI (encoded as a BFM).
2244     Register Dst = I.getOperand(0).getReg();
2245     LLT Ty = MRI.getType(Dst);
2246 
2247     if (!Ty.isScalar())
2248       return false;
2249 
2250     unsigned Size = Ty.getSizeInBits();
2251     if (Size != 32 && Size != 64)
2252       return false;
2253 
2254     Register ShiftSrc;
2255     int64_t ShiftImm;
2256     Register MaskSrc;
2257     int64_t MaskImm;
2258     if (!mi_match(
2259             Dst, MRI,
2260             m_GOr(m_OneNonDBGUse(m_GShl(m_Reg(ShiftSrc), m_ICst(ShiftImm))),
2261                   m_OneNonDBGUse(m_GAnd(m_Reg(MaskSrc), m_ICst(MaskImm))))))
2262       return false;
2263 
2264     if (ShiftImm > Size || ((1ULL << ShiftImm) - 1ULL) != uint64_t(MaskImm))
2265       return false;
2266 
2267     int64_t Immr = Size - ShiftImm;
2268     int64_t Imms = Size - ShiftImm - 1;
2269     unsigned Opc = Size == 32 ? AArch64::BFMWri : AArch64::BFMXri;
2270     emitInstr(Opc, {Dst}, {MaskSrc, ShiftSrc, Immr, Imms}, MIB);
2271     I.eraseFromParent();
2272     return true;
2273   }
2274   default:
2275     return false;
2276   }
2277 }
2278 
select(MachineInstr & I)2279 bool AArch64InstructionSelector::select(MachineInstr &I) {
2280   assert(I.getParent() && "Instruction should be in a basic block!");
2281   assert(I.getParent()->getParent() && "Instruction should be in a function!");
2282 
2283   MachineBasicBlock &MBB = *I.getParent();
2284   MachineFunction &MF = *MBB.getParent();
2285   MachineRegisterInfo &MRI = MF.getRegInfo();
2286 
2287   const AArch64Subtarget *Subtarget =
2288       &static_cast<const AArch64Subtarget &>(MF.getSubtarget());
2289   if (Subtarget->requiresStrictAlign()) {
2290     // We don't support this feature yet.
2291     LLVM_DEBUG(dbgs() << "AArch64 GISel does not support strict-align yet\n");
2292     return false;
2293   }
2294 
2295   MIB.setInstrAndDebugLoc(I);
2296 
2297   unsigned Opcode = I.getOpcode();
2298   // G_PHI requires same handling as PHI
2299   if (!I.isPreISelOpcode() || Opcode == TargetOpcode::G_PHI) {
2300     // Certain non-generic instructions also need some special handling.
2301 
2302     if (Opcode ==  TargetOpcode::LOAD_STACK_GUARD)
2303       return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2304 
2305     if (Opcode == TargetOpcode::PHI || Opcode == TargetOpcode::G_PHI) {
2306       const Register DefReg = I.getOperand(0).getReg();
2307       const LLT DefTy = MRI.getType(DefReg);
2308 
2309       const RegClassOrRegBank &RegClassOrBank =
2310         MRI.getRegClassOrRegBank(DefReg);
2311 
2312       const TargetRegisterClass *DefRC
2313         = RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
2314       if (!DefRC) {
2315         if (!DefTy.isValid()) {
2316           LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
2317           return false;
2318         }
2319         const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
2320         DefRC = getRegClassForTypeOnBank(DefTy, RB, RBI);
2321         if (!DefRC) {
2322           LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
2323           return false;
2324         }
2325       }
2326 
2327       I.setDesc(TII.get(TargetOpcode::PHI));
2328 
2329       return RBI.constrainGenericRegister(DefReg, *DefRC, MRI);
2330     }
2331 
2332     if (I.isCopy())
2333       return selectCopy(I, TII, MRI, TRI, RBI);
2334 
2335     return true;
2336   }
2337 
2338 
2339   if (I.getNumOperands() != I.getNumExplicitOperands()) {
2340     LLVM_DEBUG(
2341         dbgs() << "Generic instruction has unexpected implicit operands\n");
2342     return false;
2343   }
2344 
2345   // Try to do some lowering before we start instruction selecting. These
2346   // lowerings are purely transformations on the input G_MIR and so selection
2347   // must continue after any modification of the instruction.
2348   if (preISelLower(I)) {
2349     Opcode = I.getOpcode(); // The opcode may have been modified, refresh it.
2350   }
2351 
2352   // There may be patterns where the importer can't deal with them optimally,
2353   // but does select it to a suboptimal sequence so our custom C++ selection
2354   // code later never has a chance to work on it. Therefore, we have an early
2355   // selection attempt here to give priority to certain selection routines
2356   // over the imported ones.
2357   if (earlySelect(I))
2358     return true;
2359 
2360   if (selectImpl(I, *CoverageInfo))
2361     return true;
2362 
2363   LLT Ty =
2364       I.getOperand(0).isReg() ? MRI.getType(I.getOperand(0).getReg()) : LLT{};
2365 
2366   switch (Opcode) {
2367   case TargetOpcode::G_SBFX:
2368   case TargetOpcode::G_UBFX: {
2369     static const unsigned OpcTable[2][2] = {
2370         {AArch64::UBFMWri, AArch64::UBFMXri},
2371         {AArch64::SBFMWri, AArch64::SBFMXri}};
2372     bool IsSigned = Opcode == TargetOpcode::G_SBFX;
2373     unsigned Size = Ty.getSizeInBits();
2374     unsigned Opc = OpcTable[IsSigned][Size == 64];
2375     auto Cst1 =
2376         getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), MRI);
2377     assert(Cst1 && "Should have gotten a constant for src 1?");
2378     auto Cst2 =
2379         getIConstantVRegValWithLookThrough(I.getOperand(3).getReg(), MRI);
2380     assert(Cst2 && "Should have gotten a constant for src 2?");
2381     auto LSB = Cst1->Value.getZExtValue();
2382     auto Width = Cst2->Value.getZExtValue();
2383     auto BitfieldInst =
2384         MIB.buildInstr(Opc, {I.getOperand(0)}, {I.getOperand(1)})
2385             .addImm(LSB)
2386             .addImm(LSB + Width - 1);
2387     I.eraseFromParent();
2388     return constrainSelectedInstRegOperands(*BitfieldInst, TII, TRI, RBI);
2389   }
2390   case TargetOpcode::G_BRCOND:
2391     return selectCompareBranch(I, MF, MRI);
2392 
2393   case TargetOpcode::G_BRINDIRECT: {
2394     I.setDesc(TII.get(AArch64::BR));
2395     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2396   }
2397 
2398   case TargetOpcode::G_BRJT:
2399     return selectBrJT(I, MRI);
2400 
2401   case AArch64::G_ADD_LOW: {
2402     // This op may have been separated from it's ADRP companion by the localizer
2403     // or some other code motion pass. Given that many CPUs will try to
2404     // macro fuse these operations anyway, select this into a MOVaddr pseudo
2405     // which will later be expanded into an ADRP+ADD pair after scheduling.
2406     MachineInstr *BaseMI = MRI.getVRegDef(I.getOperand(1).getReg());
2407     if (BaseMI->getOpcode() != AArch64::ADRP) {
2408       I.setDesc(TII.get(AArch64::ADDXri));
2409       I.addOperand(MachineOperand::CreateImm(0));
2410       return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2411     }
2412     assert(TM.getCodeModel() == CodeModel::Small &&
2413            "Expected small code model");
2414     auto Op1 = BaseMI->getOperand(1);
2415     auto Op2 = I.getOperand(2);
2416     auto MovAddr = MIB.buildInstr(AArch64::MOVaddr, {I.getOperand(0)}, {})
2417                        .addGlobalAddress(Op1.getGlobal(), Op1.getOffset(),
2418                                          Op1.getTargetFlags())
2419                        .addGlobalAddress(Op2.getGlobal(), Op2.getOffset(),
2420                                          Op2.getTargetFlags());
2421     I.eraseFromParent();
2422     return constrainSelectedInstRegOperands(*MovAddr, TII, TRI, RBI);
2423   }
2424 
2425   case TargetOpcode::G_BSWAP: {
2426     // Handle vector types for G_BSWAP directly.
2427     Register DstReg = I.getOperand(0).getReg();
2428     LLT DstTy = MRI.getType(DstReg);
2429 
2430     // We should only get vector types here; everything else is handled by the
2431     // importer right now.
2432     if (!DstTy.isVector() || DstTy.getSizeInBits() > 128) {
2433       LLVM_DEBUG(dbgs() << "Dst type for G_BSWAP currently unsupported.\n");
2434       return false;
2435     }
2436 
2437     // Only handle 4 and 2 element vectors for now.
2438     // TODO: 16-bit elements.
2439     unsigned NumElts = DstTy.getNumElements();
2440     if (NumElts != 4 && NumElts != 2) {
2441       LLVM_DEBUG(dbgs() << "Unsupported number of elements for G_BSWAP.\n");
2442       return false;
2443     }
2444 
2445     // Choose the correct opcode for the supported types. Right now, that's
2446     // v2s32, v4s32, and v2s64.
2447     unsigned Opc = 0;
2448     unsigned EltSize = DstTy.getElementType().getSizeInBits();
2449     if (EltSize == 32)
2450       Opc = (DstTy.getNumElements() == 2) ? AArch64::REV32v8i8
2451                                           : AArch64::REV32v16i8;
2452     else if (EltSize == 64)
2453       Opc = AArch64::REV64v16i8;
2454 
2455     // We should always get something by the time we get here...
2456     assert(Opc != 0 && "Didn't get an opcode for G_BSWAP?");
2457 
2458     I.setDesc(TII.get(Opc));
2459     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2460   }
2461 
2462   case TargetOpcode::G_FCONSTANT:
2463   case TargetOpcode::G_CONSTANT: {
2464     const bool isFP = Opcode == TargetOpcode::G_FCONSTANT;
2465 
2466     const LLT s8 = LLT::scalar(8);
2467     const LLT s16 = LLT::scalar(16);
2468     const LLT s32 = LLT::scalar(32);
2469     const LLT s64 = LLT::scalar(64);
2470     const LLT s128 = LLT::scalar(128);
2471     const LLT p0 = LLT::pointer(0, 64);
2472 
2473     const Register DefReg = I.getOperand(0).getReg();
2474     const LLT DefTy = MRI.getType(DefReg);
2475     const unsigned DefSize = DefTy.getSizeInBits();
2476     const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
2477 
2478     // FIXME: Redundant check, but even less readable when factored out.
2479     if (isFP) {
2480       if (Ty != s16 && Ty != s32 && Ty != s64 && Ty != s128) {
2481         LLVM_DEBUG(dbgs() << "Unable to materialize FP " << Ty
2482                           << " constant, expected: " << s16 << " or " << s32
2483                           << " or " << s64 << " or " << s128 << '\n');
2484         return false;
2485       }
2486 
2487       if (RB.getID() != AArch64::FPRRegBankID) {
2488         LLVM_DEBUG(dbgs() << "Unable to materialize FP " << Ty
2489                           << " constant on bank: " << RB
2490                           << ", expected: FPR\n");
2491         return false;
2492       }
2493 
2494       // The case when we have 0.0 is covered by tablegen. Reject it here so we
2495       // can be sure tablegen works correctly and isn't rescued by this code.
2496       // 0.0 is not covered by tablegen for FP128. So we will handle this
2497       // scenario in the code here.
2498       if (DefSize != 128 && I.getOperand(1).getFPImm()->isExactlyValue(0.0))
2499         return false;
2500     } else {
2501       // s32 and s64 are covered by tablegen.
2502       if (Ty != p0 && Ty != s8 && Ty != s16) {
2503         LLVM_DEBUG(dbgs() << "Unable to materialize integer " << Ty
2504                           << " constant, expected: " << s32 << ", " << s64
2505                           << ", or " << p0 << '\n');
2506         return false;
2507       }
2508 
2509       if (RB.getID() != AArch64::GPRRegBankID) {
2510         LLVM_DEBUG(dbgs() << "Unable to materialize integer " << Ty
2511                           << " constant on bank: " << RB
2512                           << ", expected: GPR\n");
2513         return false;
2514       }
2515     }
2516 
2517     if (isFP) {
2518       const TargetRegisterClass &FPRRC = *getMinClassForRegBank(RB, DefSize);
2519       // For 16, 64, and 128b values, emit a constant pool load.
2520       switch (DefSize) {
2521       default:
2522         llvm_unreachable("Unexpected destination size for G_FCONSTANT?");
2523       case 32:
2524         // For s32, use a cp load if we have optsize/minsize.
2525         if (!shouldOptForSize(&MF))
2526           break;
2527         LLVM_FALLTHROUGH;
2528       case 16:
2529       case 64:
2530       case 128: {
2531         auto *FPImm = I.getOperand(1).getFPImm();
2532         auto *LoadMI = emitLoadFromConstantPool(FPImm, MIB);
2533         if (!LoadMI) {
2534           LLVM_DEBUG(dbgs() << "Failed to load double constant pool entry\n");
2535           return false;
2536         }
2537         MIB.buildCopy({DefReg}, {LoadMI->getOperand(0).getReg()});
2538         I.eraseFromParent();
2539         return RBI.constrainGenericRegister(DefReg, FPRRC, MRI);
2540       }
2541       }
2542 
2543       // Either emit a FMOV, or emit a copy to emit a normal mov.
2544       assert(DefSize == 32 &&
2545              "Expected constant pool loads for all sizes other than 32!");
2546       const Register DefGPRReg =
2547           MRI.createVirtualRegister(&AArch64::GPR32RegClass);
2548       MachineOperand &RegOp = I.getOperand(0);
2549       RegOp.setReg(DefGPRReg);
2550       MIB.setInsertPt(MIB.getMBB(), std::next(I.getIterator()));
2551       MIB.buildCopy({DefReg}, {DefGPRReg});
2552 
2553       if (!RBI.constrainGenericRegister(DefReg, FPRRC, MRI)) {
2554         LLVM_DEBUG(dbgs() << "Failed to constrain G_FCONSTANT def operand\n");
2555         return false;
2556       }
2557 
2558       MachineOperand &ImmOp = I.getOperand(1);
2559       // FIXME: Is going through int64_t always correct?
2560       ImmOp.ChangeToImmediate(
2561           ImmOp.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
2562     } else if (I.getOperand(1).isCImm()) {
2563       uint64_t Val = I.getOperand(1).getCImm()->getZExtValue();
2564       I.getOperand(1).ChangeToImmediate(Val);
2565     } else if (I.getOperand(1).isImm()) {
2566       uint64_t Val = I.getOperand(1).getImm();
2567       I.getOperand(1).ChangeToImmediate(Val);
2568     }
2569 
2570     const unsigned MovOpc =
2571         DefSize == 64 ? AArch64::MOVi64imm : AArch64::MOVi32imm;
2572     I.setDesc(TII.get(MovOpc));
2573     constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2574     return true;
2575   }
2576   case TargetOpcode::G_EXTRACT: {
2577     Register DstReg = I.getOperand(0).getReg();
2578     Register SrcReg = I.getOperand(1).getReg();
2579     LLT SrcTy = MRI.getType(SrcReg);
2580     LLT DstTy = MRI.getType(DstReg);
2581     (void)DstTy;
2582     unsigned SrcSize = SrcTy.getSizeInBits();
2583 
2584     if (SrcTy.getSizeInBits() > 64) {
2585       // This should be an extract of an s128, which is like a vector extract.
2586       if (SrcTy.getSizeInBits() != 128)
2587         return false;
2588       // Only support extracting 64 bits from an s128 at the moment.
2589       if (DstTy.getSizeInBits() != 64)
2590         return false;
2591 
2592       unsigned Offset = I.getOperand(2).getImm();
2593       if (Offset % 64 != 0)
2594         return false;
2595 
2596       // Check we have the right regbank always.
2597       const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI);
2598       const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
2599       assert(SrcRB.getID() == DstRB.getID() && "Wrong extract regbank!");
2600 
2601       if (SrcRB.getID() == AArch64::GPRRegBankID) {
2602         MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {})
2603             .addUse(SrcReg, 0, Offset == 0 ? AArch64::sube64 : AArch64::subo64);
2604         I.eraseFromParent();
2605         return true;
2606       }
2607 
2608       // Emit the same code as a vector extract.
2609       // Offset must be a multiple of 64.
2610       unsigned LaneIdx = Offset / 64;
2611       MachineInstr *Extract = emitExtractVectorElt(
2612           DstReg, DstRB, LLT::scalar(64), SrcReg, LaneIdx, MIB);
2613       if (!Extract)
2614         return false;
2615       I.eraseFromParent();
2616       return true;
2617     }
2618 
2619     I.setDesc(TII.get(SrcSize == 64 ? AArch64::UBFMXri : AArch64::UBFMWri));
2620     MachineInstrBuilder(MF, I).addImm(I.getOperand(2).getImm() +
2621                                       Ty.getSizeInBits() - 1);
2622 
2623     if (SrcSize < 64) {
2624       assert(SrcSize == 32 && DstTy.getSizeInBits() == 16 &&
2625              "unexpected G_EXTRACT types");
2626       return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2627     }
2628 
2629     DstReg = MRI.createGenericVirtualRegister(LLT::scalar(64));
2630     MIB.setInsertPt(MIB.getMBB(), std::next(I.getIterator()));
2631     MIB.buildInstr(TargetOpcode::COPY, {I.getOperand(0).getReg()}, {})
2632         .addReg(DstReg, 0, AArch64::sub_32);
2633     RBI.constrainGenericRegister(I.getOperand(0).getReg(),
2634                                  AArch64::GPR32RegClass, MRI);
2635     I.getOperand(0).setReg(DstReg);
2636 
2637     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2638   }
2639 
2640   case TargetOpcode::G_INSERT: {
2641     LLT SrcTy = MRI.getType(I.getOperand(2).getReg());
2642     LLT DstTy = MRI.getType(I.getOperand(0).getReg());
2643     unsigned DstSize = DstTy.getSizeInBits();
2644     // Larger inserts are vectors, same-size ones should be something else by
2645     // now (split up or turned into COPYs).
2646     if (Ty.getSizeInBits() > 64 || SrcTy.getSizeInBits() > 32)
2647       return false;
2648 
2649     I.setDesc(TII.get(DstSize == 64 ? AArch64::BFMXri : AArch64::BFMWri));
2650     unsigned LSB = I.getOperand(3).getImm();
2651     unsigned Width = MRI.getType(I.getOperand(2).getReg()).getSizeInBits();
2652     I.getOperand(3).setImm((DstSize - LSB) % DstSize);
2653     MachineInstrBuilder(MF, I).addImm(Width - 1);
2654 
2655     if (DstSize < 64) {
2656       assert(DstSize == 32 && SrcTy.getSizeInBits() == 16 &&
2657              "unexpected G_INSERT types");
2658       return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2659     }
2660 
2661     Register SrcReg = MRI.createGenericVirtualRegister(LLT::scalar(64));
2662     BuildMI(MBB, I.getIterator(), I.getDebugLoc(),
2663             TII.get(AArch64::SUBREG_TO_REG))
2664         .addDef(SrcReg)
2665         .addImm(0)
2666         .addUse(I.getOperand(2).getReg())
2667         .addImm(AArch64::sub_32);
2668     RBI.constrainGenericRegister(I.getOperand(2).getReg(),
2669                                  AArch64::GPR32RegClass, MRI);
2670     I.getOperand(2).setReg(SrcReg);
2671 
2672     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2673   }
2674   case TargetOpcode::G_FRAME_INDEX: {
2675     // allocas and G_FRAME_INDEX are only supported in addrspace(0).
2676     if (Ty != LLT::pointer(0, 64)) {
2677       LLVM_DEBUG(dbgs() << "G_FRAME_INDEX pointer has type: " << Ty
2678                         << ", expected: " << LLT::pointer(0, 64) << '\n');
2679       return false;
2680     }
2681     I.setDesc(TII.get(AArch64::ADDXri));
2682 
2683     // MOs for a #0 shifted immediate.
2684     I.addOperand(MachineOperand::CreateImm(0));
2685     I.addOperand(MachineOperand::CreateImm(0));
2686 
2687     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2688   }
2689 
2690   case TargetOpcode::G_GLOBAL_VALUE: {
2691     auto GV = I.getOperand(1).getGlobal();
2692     if (GV->isThreadLocal())
2693       return selectTLSGlobalValue(I, MRI);
2694 
2695     unsigned OpFlags = STI.ClassifyGlobalReference(GV, TM);
2696     if (OpFlags & AArch64II::MO_GOT) {
2697       I.setDesc(TII.get(AArch64::LOADgot));
2698       I.getOperand(1).setTargetFlags(OpFlags);
2699     } else if (TM.getCodeModel() == CodeModel::Large) {
2700       // Materialize the global using movz/movk instructions.
2701       materializeLargeCMVal(I, GV, OpFlags);
2702       I.eraseFromParent();
2703       return true;
2704     } else if (TM.getCodeModel() == CodeModel::Tiny) {
2705       I.setDesc(TII.get(AArch64::ADR));
2706       I.getOperand(1).setTargetFlags(OpFlags);
2707     } else {
2708       I.setDesc(TII.get(AArch64::MOVaddr));
2709       I.getOperand(1).setTargetFlags(OpFlags | AArch64II::MO_PAGE);
2710       MachineInstrBuilder MIB(MF, I);
2711       MIB.addGlobalAddress(GV, I.getOperand(1).getOffset(),
2712                            OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
2713     }
2714     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2715   }
2716 
2717   case TargetOpcode::G_ZEXTLOAD:
2718   case TargetOpcode::G_LOAD:
2719   case TargetOpcode::G_STORE: {
2720     GLoadStore &LdSt = cast<GLoadStore>(I);
2721     bool IsZExtLoad = I.getOpcode() == TargetOpcode::G_ZEXTLOAD;
2722     LLT PtrTy = MRI.getType(LdSt.getPointerReg());
2723 
2724     if (PtrTy != LLT::pointer(0, 64)) {
2725       LLVM_DEBUG(dbgs() << "Load/Store pointer has type: " << PtrTy
2726                         << ", expected: " << LLT::pointer(0, 64) << '\n');
2727       return false;
2728     }
2729 
2730     uint64_t MemSizeInBytes = LdSt.getMemSize();
2731     unsigned MemSizeInBits = LdSt.getMemSizeInBits();
2732     AtomicOrdering Order = LdSt.getMMO().getSuccessOrdering();
2733 
2734     // Need special instructions for atomics that affect ordering.
2735     if (Order != AtomicOrdering::NotAtomic &&
2736         Order != AtomicOrdering::Unordered &&
2737         Order != AtomicOrdering::Monotonic) {
2738       assert(!isa<GZExtLoad>(LdSt));
2739       if (MemSizeInBytes > 64)
2740         return false;
2741 
2742       if (isa<GLoad>(LdSt)) {
2743         static unsigned Opcodes[] = {AArch64::LDARB, AArch64::LDARH,
2744                                      AArch64::LDARW, AArch64::LDARX};
2745         I.setDesc(TII.get(Opcodes[Log2_32(MemSizeInBytes)]));
2746       } else {
2747         static unsigned Opcodes[] = {AArch64::STLRB, AArch64::STLRH,
2748                                      AArch64::STLRW, AArch64::STLRX};
2749         I.setDesc(TII.get(Opcodes[Log2_32(MemSizeInBytes)]));
2750       }
2751       constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2752       return true;
2753     }
2754 
2755 #ifndef NDEBUG
2756     const Register PtrReg = LdSt.getPointerReg();
2757     const RegisterBank &PtrRB = *RBI.getRegBank(PtrReg, MRI, TRI);
2758     // Sanity-check the pointer register.
2759     assert(PtrRB.getID() == AArch64::GPRRegBankID &&
2760            "Load/Store pointer operand isn't a GPR");
2761     assert(MRI.getType(PtrReg).isPointer() &&
2762            "Load/Store pointer operand isn't a pointer");
2763 #endif
2764 
2765     const Register ValReg = LdSt.getReg(0);
2766     const LLT ValTy = MRI.getType(ValReg);
2767     const RegisterBank &RB = *RBI.getRegBank(ValReg, MRI, TRI);
2768 
2769     // The code below doesn't support truncating stores, so we need to split it
2770     // again.
2771     if (isa<GStore>(LdSt) && ValTy.getSizeInBits() > MemSizeInBits) {
2772       unsigned SubReg;
2773       LLT MemTy = LdSt.getMMO().getMemoryType();
2774       auto *RC = getRegClassForTypeOnBank(MemTy, RB, RBI);
2775       if (!getSubRegForClass(RC, TRI, SubReg))
2776         return false;
2777 
2778       // Generate a subreg copy.
2779       auto Copy = MIB.buildInstr(TargetOpcode::COPY, {MemTy}, {})
2780                       .addReg(ValReg, 0, SubReg)
2781                       .getReg(0);
2782       RBI.constrainGenericRegister(Copy, *RC, MRI);
2783       LdSt.getOperand(0).setReg(Copy);
2784     } else if (isa<GLoad>(LdSt) && ValTy.getSizeInBits() > MemSizeInBits) {
2785       // If this is an any-extending load from the FPR bank, split it into a regular
2786       // load + extend.
2787       if (RB.getID() == AArch64::FPRRegBankID) {
2788         unsigned SubReg;
2789         LLT MemTy = LdSt.getMMO().getMemoryType();
2790         auto *RC = getRegClassForTypeOnBank(MemTy, RB, RBI);
2791         if (!getSubRegForClass(RC, TRI, SubReg))
2792           return false;
2793         Register OldDst = LdSt.getReg(0);
2794         Register NewDst =
2795             MRI.createGenericVirtualRegister(LdSt.getMMO().getMemoryType());
2796         LdSt.getOperand(0).setReg(NewDst);
2797         MRI.setRegBank(NewDst, RB);
2798         // Generate a SUBREG_TO_REG to extend it.
2799         MIB.setInsertPt(MIB.getMBB(), std::next(LdSt.getIterator()));
2800         MIB.buildInstr(AArch64::SUBREG_TO_REG, {OldDst}, {})
2801             .addImm(0)
2802             .addUse(NewDst)
2803             .addImm(SubReg);
2804         auto SubRegRC = getRegClassForTypeOnBank(MRI.getType(OldDst), RB, RBI);
2805         RBI.constrainGenericRegister(OldDst, *SubRegRC, MRI);
2806         MIB.setInstr(LdSt);
2807       }
2808     }
2809 
2810     // Helper lambda for partially selecting I. Either returns the original
2811     // instruction with an updated opcode, or a new instruction.
2812     auto SelectLoadStoreAddressingMode = [&]() -> MachineInstr * {
2813       bool IsStore = isa<GStore>(I);
2814       const unsigned NewOpc =
2815           selectLoadStoreUIOp(I.getOpcode(), RB.getID(), MemSizeInBits);
2816       if (NewOpc == I.getOpcode())
2817         return nullptr;
2818       // Check if we can fold anything into the addressing mode.
2819       auto AddrModeFns =
2820           selectAddrModeIndexed(I.getOperand(1), MemSizeInBytes);
2821       if (!AddrModeFns) {
2822         // Can't fold anything. Use the original instruction.
2823         I.setDesc(TII.get(NewOpc));
2824         I.addOperand(MachineOperand::CreateImm(0));
2825         return &I;
2826       }
2827 
2828       // Folded something. Create a new instruction and return it.
2829       auto NewInst = MIB.buildInstr(NewOpc, {}, {}, I.getFlags());
2830       Register CurValReg = I.getOperand(0).getReg();
2831       IsStore ? NewInst.addUse(CurValReg) : NewInst.addDef(CurValReg);
2832       NewInst.cloneMemRefs(I);
2833       for (auto &Fn : *AddrModeFns)
2834         Fn(NewInst);
2835       I.eraseFromParent();
2836       return &*NewInst;
2837     };
2838 
2839     MachineInstr *LoadStore = SelectLoadStoreAddressingMode();
2840     if (!LoadStore)
2841       return false;
2842 
2843     // If we're storing a 0, use WZR/XZR.
2844     if (Opcode == TargetOpcode::G_STORE) {
2845       auto CVal = getIConstantVRegValWithLookThrough(
2846           LoadStore->getOperand(0).getReg(), MRI);
2847       if (CVal && CVal->Value == 0) {
2848         switch (LoadStore->getOpcode()) {
2849         case AArch64::STRWui:
2850         case AArch64::STRHHui:
2851         case AArch64::STRBBui:
2852           LoadStore->getOperand(0).setReg(AArch64::WZR);
2853           break;
2854         case AArch64::STRXui:
2855           LoadStore->getOperand(0).setReg(AArch64::XZR);
2856           break;
2857         }
2858       }
2859     }
2860 
2861     if (IsZExtLoad) {
2862       // The zextload from a smaller type to i32 should be handled by the
2863       // importer.
2864       if (MRI.getType(LoadStore->getOperand(0).getReg()).getSizeInBits() != 64)
2865         return false;
2866       // If we have a ZEXTLOAD then change the load's type to be a narrower reg
2867       // and zero_extend with SUBREG_TO_REG.
2868       Register LdReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
2869       Register DstReg = LoadStore->getOperand(0).getReg();
2870       LoadStore->getOperand(0).setReg(LdReg);
2871 
2872       MIB.setInsertPt(MIB.getMBB(), std::next(LoadStore->getIterator()));
2873       MIB.buildInstr(AArch64::SUBREG_TO_REG, {DstReg}, {})
2874           .addImm(0)
2875           .addUse(LdReg)
2876           .addImm(AArch64::sub_32);
2877       constrainSelectedInstRegOperands(*LoadStore, TII, TRI, RBI);
2878       return RBI.constrainGenericRegister(DstReg, AArch64::GPR64allRegClass,
2879                                           MRI);
2880     }
2881     return constrainSelectedInstRegOperands(*LoadStore, TII, TRI, RBI);
2882   }
2883 
2884   case TargetOpcode::G_SMULH:
2885   case TargetOpcode::G_UMULH: {
2886     // Reject the various things we don't support yet.
2887     if (unsupportedBinOp(I, RBI, MRI, TRI))
2888       return false;
2889 
2890     const Register DefReg = I.getOperand(0).getReg();
2891     const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
2892 
2893     if (RB.getID() != AArch64::GPRRegBankID) {
2894       LLVM_DEBUG(dbgs() << "G_[SU]MULH on bank: " << RB << ", expected: GPR\n");
2895       return false;
2896     }
2897 
2898     if (Ty != LLT::scalar(64)) {
2899       LLVM_DEBUG(dbgs() << "G_[SU]MULH has type: " << Ty
2900                         << ", expected: " << LLT::scalar(64) << '\n');
2901       return false;
2902     }
2903 
2904     unsigned NewOpc = I.getOpcode() == TargetOpcode::G_SMULH ? AArch64::SMULHrr
2905                                                              : AArch64::UMULHrr;
2906     I.setDesc(TII.get(NewOpc));
2907 
2908     // Now that we selected an opcode, we need to constrain the register
2909     // operands to use appropriate classes.
2910     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2911   }
2912   case TargetOpcode::G_LSHR:
2913   case TargetOpcode::G_ASHR:
2914     if (MRI.getType(I.getOperand(0).getReg()).isVector())
2915       return selectVectorAshrLshr(I, MRI);
2916     LLVM_FALLTHROUGH;
2917   case TargetOpcode::G_SHL:
2918     if (Opcode == TargetOpcode::G_SHL &&
2919         MRI.getType(I.getOperand(0).getReg()).isVector())
2920       return selectVectorSHL(I, MRI);
2921     LLVM_FALLTHROUGH;
2922   case TargetOpcode::G_FADD:
2923   case TargetOpcode::G_FSUB:
2924   case TargetOpcode::G_FMUL:
2925   case TargetOpcode::G_FDIV:
2926   case TargetOpcode::G_OR: {
2927     // Reject the various things we don't support yet.
2928     if (unsupportedBinOp(I, RBI, MRI, TRI))
2929       return false;
2930 
2931     const unsigned OpSize = Ty.getSizeInBits();
2932 
2933     const Register DefReg = I.getOperand(0).getReg();
2934     const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
2935 
2936     const unsigned NewOpc = selectBinaryOp(I.getOpcode(), RB.getID(), OpSize);
2937     if (NewOpc == I.getOpcode())
2938       return false;
2939 
2940     I.setDesc(TII.get(NewOpc));
2941     // FIXME: Should the type be always reset in setDesc?
2942 
2943     // Now that we selected an opcode, we need to constrain the register
2944     // operands to use appropriate classes.
2945     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2946   }
2947 
2948   case TargetOpcode::G_PTR_ADD: {
2949     emitADD(I.getOperand(0).getReg(), I.getOperand(1), I.getOperand(2), MIB);
2950     I.eraseFromParent();
2951     return true;
2952   }
2953   case TargetOpcode::G_SADDO:
2954   case TargetOpcode::G_UADDO:
2955   case TargetOpcode::G_SSUBO:
2956   case TargetOpcode::G_USUBO: {
2957     // Emit the operation and get the correct condition code.
2958     auto OpAndCC = emitOverflowOp(Opcode, I.getOperand(0).getReg(),
2959                                   I.getOperand(2), I.getOperand(3), MIB);
2960 
2961     // Now, put the overflow result in the register given by the first operand
2962     // to the overflow op. CSINC increments the result when the predicate is
2963     // false, so to get the increment when it's true, we need to use the
2964     // inverse. In this case, we want to increment when carry is set.
2965     Register ZReg = AArch64::WZR;
2966     auto CsetMI = MIB.buildInstr(AArch64::CSINCWr, {I.getOperand(1).getReg()},
2967                                  {ZReg, ZReg})
2968                       .addImm(getInvertedCondCode(OpAndCC.second));
2969     constrainSelectedInstRegOperands(*CsetMI, TII, TRI, RBI);
2970     I.eraseFromParent();
2971     return true;
2972   }
2973 
2974   case TargetOpcode::G_PTRMASK: {
2975     Register MaskReg = I.getOperand(2).getReg();
2976     Optional<int64_t> MaskVal = getIConstantVRegSExtVal(MaskReg, MRI);
2977     // TODO: Implement arbitrary cases
2978     if (!MaskVal || !isShiftedMask_64(*MaskVal))
2979       return false;
2980 
2981     uint64_t Mask = *MaskVal;
2982     I.setDesc(TII.get(AArch64::ANDXri));
2983     I.getOperand(2).ChangeToImmediate(
2984         AArch64_AM::encodeLogicalImmediate(Mask, 64));
2985 
2986     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
2987   }
2988   case TargetOpcode::G_PTRTOINT:
2989   case TargetOpcode::G_TRUNC: {
2990     const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
2991     const LLT SrcTy = MRI.getType(I.getOperand(1).getReg());
2992 
2993     const Register DstReg = I.getOperand(0).getReg();
2994     const Register SrcReg = I.getOperand(1).getReg();
2995 
2996     const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
2997     const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI);
2998 
2999     if (DstRB.getID() != SrcRB.getID()) {
3000       LLVM_DEBUG(
3001           dbgs() << "G_TRUNC/G_PTRTOINT input/output on different banks\n");
3002       return false;
3003     }
3004 
3005     if (DstRB.getID() == AArch64::GPRRegBankID) {
3006       const TargetRegisterClass *DstRC =
3007           getRegClassForTypeOnBank(DstTy, DstRB, RBI);
3008       if (!DstRC)
3009         return false;
3010 
3011       const TargetRegisterClass *SrcRC =
3012           getRegClassForTypeOnBank(SrcTy, SrcRB, RBI);
3013       if (!SrcRC)
3014         return false;
3015 
3016       if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) ||
3017           !RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
3018         LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC/G_PTRTOINT\n");
3019         return false;
3020       }
3021 
3022       if (DstRC == SrcRC) {
3023         // Nothing to be done
3024       } else if (Opcode == TargetOpcode::G_TRUNC && DstTy == LLT::scalar(32) &&
3025                  SrcTy == LLT::scalar(64)) {
3026         llvm_unreachable("TableGen can import this case");
3027         return false;
3028       } else if (DstRC == &AArch64::GPR32RegClass &&
3029                  SrcRC == &AArch64::GPR64RegClass) {
3030         I.getOperand(1).setSubReg(AArch64::sub_32);
3031       } else {
3032         LLVM_DEBUG(
3033             dbgs() << "Unhandled mismatched classes in G_TRUNC/G_PTRTOINT\n");
3034         return false;
3035       }
3036 
3037       I.setDesc(TII.get(TargetOpcode::COPY));
3038       return true;
3039     } else if (DstRB.getID() == AArch64::FPRRegBankID) {
3040       if (DstTy == LLT::fixed_vector(4, 16) &&
3041           SrcTy == LLT::fixed_vector(4, 32)) {
3042         I.setDesc(TII.get(AArch64::XTNv4i16));
3043         constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3044         return true;
3045       }
3046 
3047       if (!SrcTy.isVector() && SrcTy.getSizeInBits() == 128) {
3048         MachineInstr *Extract = emitExtractVectorElt(
3049             DstReg, DstRB, LLT::scalar(DstTy.getSizeInBits()), SrcReg, 0, MIB);
3050         if (!Extract)
3051           return false;
3052         I.eraseFromParent();
3053         return true;
3054       }
3055 
3056       // We might have a vector G_PTRTOINT, in which case just emit a COPY.
3057       if (Opcode == TargetOpcode::G_PTRTOINT) {
3058         assert(DstTy.isVector() && "Expected an FPR ptrtoint to be a vector");
3059         I.setDesc(TII.get(TargetOpcode::COPY));
3060         return selectCopy(I, TII, MRI, TRI, RBI);
3061       }
3062     }
3063 
3064     return false;
3065   }
3066 
3067   case TargetOpcode::G_ANYEXT: {
3068     if (selectUSMovFromExtend(I, MRI))
3069       return true;
3070 
3071     const Register DstReg = I.getOperand(0).getReg();
3072     const Register SrcReg = I.getOperand(1).getReg();
3073 
3074     const RegisterBank &RBDst = *RBI.getRegBank(DstReg, MRI, TRI);
3075     if (RBDst.getID() != AArch64::GPRRegBankID) {
3076       LLVM_DEBUG(dbgs() << "G_ANYEXT on bank: " << RBDst
3077                         << ", expected: GPR\n");
3078       return false;
3079     }
3080 
3081     const RegisterBank &RBSrc = *RBI.getRegBank(SrcReg, MRI, TRI);
3082     if (RBSrc.getID() != AArch64::GPRRegBankID) {
3083       LLVM_DEBUG(dbgs() << "G_ANYEXT on bank: " << RBSrc
3084                         << ", expected: GPR\n");
3085       return false;
3086     }
3087 
3088     const unsigned DstSize = MRI.getType(DstReg).getSizeInBits();
3089 
3090     if (DstSize == 0) {
3091       LLVM_DEBUG(dbgs() << "G_ANYEXT operand has no size, not a gvreg?\n");
3092       return false;
3093     }
3094 
3095     if (DstSize != 64 && DstSize > 32) {
3096       LLVM_DEBUG(dbgs() << "G_ANYEXT to size: " << DstSize
3097                         << ", expected: 32 or 64\n");
3098       return false;
3099     }
3100     // At this point G_ANYEXT is just like a plain COPY, but we need
3101     // to explicitly form the 64-bit value if any.
3102     if (DstSize > 32) {
3103       Register ExtSrc = MRI.createVirtualRegister(&AArch64::GPR64allRegClass);
3104       BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::SUBREG_TO_REG))
3105           .addDef(ExtSrc)
3106           .addImm(0)
3107           .addUse(SrcReg)
3108           .addImm(AArch64::sub_32);
3109       I.getOperand(1).setReg(ExtSrc);
3110     }
3111     return selectCopy(I, TII, MRI, TRI, RBI);
3112   }
3113 
3114   case TargetOpcode::G_ZEXT:
3115   case TargetOpcode::G_SEXT_INREG:
3116   case TargetOpcode::G_SEXT: {
3117     if (selectUSMovFromExtend(I, MRI))
3118       return true;
3119 
3120     unsigned Opcode = I.getOpcode();
3121     const bool IsSigned = Opcode != TargetOpcode::G_ZEXT;
3122     const Register DefReg = I.getOperand(0).getReg();
3123     Register SrcReg = I.getOperand(1).getReg();
3124     const LLT DstTy = MRI.getType(DefReg);
3125     const LLT SrcTy = MRI.getType(SrcReg);
3126     unsigned DstSize = DstTy.getSizeInBits();
3127     unsigned SrcSize = SrcTy.getSizeInBits();
3128 
3129     // SEXT_INREG has the same src reg size as dst, the size of the value to be
3130     // extended is encoded in the imm.
3131     if (Opcode == TargetOpcode::G_SEXT_INREG)
3132       SrcSize = I.getOperand(2).getImm();
3133 
3134     if (DstTy.isVector())
3135       return false; // Should be handled by imported patterns.
3136 
3137     assert((*RBI.getRegBank(DefReg, MRI, TRI)).getID() ==
3138                AArch64::GPRRegBankID &&
3139            "Unexpected ext regbank");
3140 
3141     MachineInstr *ExtI;
3142 
3143     // First check if we're extending the result of a load which has a dest type
3144     // smaller than 32 bits, then this zext is redundant. GPR32 is the smallest
3145     // GPR register on AArch64 and all loads which are smaller automatically
3146     // zero-extend the upper bits. E.g.
3147     // %v(s8) = G_LOAD %p, :: (load 1)
3148     // %v2(s32) = G_ZEXT %v(s8)
3149     if (!IsSigned) {
3150       auto *LoadMI = getOpcodeDef(TargetOpcode::G_LOAD, SrcReg, MRI);
3151       bool IsGPR =
3152           RBI.getRegBank(SrcReg, MRI, TRI)->getID() == AArch64::GPRRegBankID;
3153       if (LoadMI && IsGPR) {
3154         const MachineMemOperand *MemOp = *LoadMI->memoperands_begin();
3155         unsigned BytesLoaded = MemOp->getSize();
3156         if (BytesLoaded < 4 && SrcTy.getSizeInBytes() == BytesLoaded)
3157           return selectCopy(I, TII, MRI, TRI, RBI);
3158       }
3159 
3160       // For the 32-bit -> 64-bit case, we can emit a mov (ORRWrs)
3161       // + SUBREG_TO_REG.
3162       //
3163       // If we are zero extending from 32 bits to 64 bits, it's possible that
3164       // the instruction implicitly does the zero extend for us. In that case,
3165       // we only need the SUBREG_TO_REG.
3166       if (IsGPR && SrcSize == 32 && DstSize == 64) {
3167         // Unlike with the G_LOAD case, we don't want to look through copies
3168         // here. (See isDef32.)
3169         MachineInstr *Def = MRI.getVRegDef(SrcReg);
3170         Register SubregToRegSrc = SrcReg;
3171 
3172         // Does the instruction implicitly zero extend?
3173         if (!Def || !isDef32(*Def)) {
3174           // No. Zero out using an OR.
3175           Register OrDst = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
3176           const Register ZReg = AArch64::WZR;
3177           MIB.buildInstr(AArch64::ORRWrs, {OrDst}, {ZReg, SrcReg}).addImm(0);
3178           SubregToRegSrc = OrDst;
3179         }
3180 
3181         MIB.buildInstr(AArch64::SUBREG_TO_REG, {DefReg}, {})
3182             .addImm(0)
3183             .addUse(SubregToRegSrc)
3184             .addImm(AArch64::sub_32);
3185 
3186         if (!RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass,
3187                                           MRI)) {
3188           LLVM_DEBUG(dbgs() << "Failed to constrain G_ZEXT destination\n");
3189           return false;
3190         }
3191 
3192         if (!RBI.constrainGenericRegister(SrcReg, AArch64::GPR32RegClass,
3193                                           MRI)) {
3194           LLVM_DEBUG(dbgs() << "Failed to constrain G_ZEXT source\n");
3195           return false;
3196         }
3197 
3198         I.eraseFromParent();
3199         return true;
3200       }
3201     }
3202 
3203     if (DstSize == 64) {
3204       if (Opcode != TargetOpcode::G_SEXT_INREG) {
3205         // FIXME: Can we avoid manually doing this?
3206         if (!RBI.constrainGenericRegister(SrcReg, AArch64::GPR32RegClass,
3207                                           MRI)) {
3208           LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(Opcode)
3209                             << " operand\n");
3210           return false;
3211         }
3212         SrcReg = MIB.buildInstr(AArch64::SUBREG_TO_REG,
3213                                 {&AArch64::GPR64RegClass}, {})
3214                      .addImm(0)
3215                      .addUse(SrcReg)
3216                      .addImm(AArch64::sub_32)
3217                      .getReg(0);
3218       }
3219 
3220       ExtI = MIB.buildInstr(IsSigned ? AArch64::SBFMXri : AArch64::UBFMXri,
3221                              {DefReg}, {SrcReg})
3222                   .addImm(0)
3223                   .addImm(SrcSize - 1);
3224     } else if (DstSize <= 32) {
3225       ExtI = MIB.buildInstr(IsSigned ? AArch64::SBFMWri : AArch64::UBFMWri,
3226                              {DefReg}, {SrcReg})
3227                   .addImm(0)
3228                   .addImm(SrcSize - 1);
3229     } else {
3230       return false;
3231     }
3232 
3233     constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
3234     I.eraseFromParent();
3235     return true;
3236   }
3237 
3238   case TargetOpcode::G_SITOFP:
3239   case TargetOpcode::G_UITOFP:
3240   case TargetOpcode::G_FPTOSI:
3241   case TargetOpcode::G_FPTOUI: {
3242     const LLT DstTy = MRI.getType(I.getOperand(0).getReg()),
3243               SrcTy = MRI.getType(I.getOperand(1).getReg());
3244     const unsigned NewOpc = selectFPConvOpc(Opcode, DstTy, SrcTy);
3245     if (NewOpc == Opcode)
3246       return false;
3247 
3248     I.setDesc(TII.get(NewOpc));
3249     constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3250 
3251     return true;
3252   }
3253 
3254   case TargetOpcode::G_FREEZE:
3255     return selectCopy(I, TII, MRI, TRI, RBI);
3256 
3257   case TargetOpcode::G_INTTOPTR:
3258     // The importer is currently unable to import pointer types since they
3259     // didn't exist in SelectionDAG.
3260     return selectCopy(I, TII, MRI, TRI, RBI);
3261 
3262   case TargetOpcode::G_BITCAST:
3263     // Imported SelectionDAG rules can handle every bitcast except those that
3264     // bitcast from a type to the same type. Ideally, these shouldn't occur
3265     // but we might not run an optimizer that deletes them. The other exception
3266     // is bitcasts involving pointer types, as SelectionDAG has no knowledge
3267     // of them.
3268     return selectCopy(I, TII, MRI, TRI, RBI);
3269 
3270   case TargetOpcode::G_SELECT: {
3271     if (MRI.getType(I.getOperand(1).getReg()) != LLT::scalar(1)) {
3272       LLVM_DEBUG(dbgs() << "G_SELECT cond has type: " << Ty
3273                         << ", expected: " << LLT::scalar(1) << '\n');
3274       return false;
3275     }
3276 
3277     const Register CondReg = I.getOperand(1).getReg();
3278     const Register TReg = I.getOperand(2).getReg();
3279     const Register FReg = I.getOperand(3).getReg();
3280 
3281     if (tryOptSelect(I))
3282       return true;
3283 
3284     // Make sure to use an unused vreg instead of wzr, so that the peephole
3285     // optimizations will be able to optimize these.
3286     Register DeadVReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
3287     auto TstMI = MIB.buildInstr(AArch64::ANDSWri, {DeadVReg}, {CondReg})
3288                      .addImm(AArch64_AM::encodeLogicalImmediate(1, 32));
3289     constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI);
3290     if (!emitSelect(I.getOperand(0).getReg(), TReg, FReg, AArch64CC::NE, MIB))
3291       return false;
3292     I.eraseFromParent();
3293     return true;
3294   }
3295   case TargetOpcode::G_ICMP: {
3296     if (Ty.isVector())
3297       return selectVectorICmp(I, MRI);
3298 
3299     if (Ty != LLT::scalar(32)) {
3300       LLVM_DEBUG(dbgs() << "G_ICMP result has type: " << Ty
3301                         << ", expected: " << LLT::scalar(32) << '\n');
3302       return false;
3303     }
3304 
3305     auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(1).getPredicate());
3306     emitIntegerCompare(I.getOperand(2), I.getOperand(3), I.getOperand(1),
3307                        MIB);
3308     emitCSetForICMP(I.getOperand(0).getReg(), Pred, MIB);
3309     I.eraseFromParent();
3310     return true;
3311   }
3312 
3313   case TargetOpcode::G_FCMP: {
3314     CmpInst::Predicate Pred =
3315         static_cast<CmpInst::Predicate>(I.getOperand(1).getPredicate());
3316     if (!emitFPCompare(I.getOperand(2).getReg(), I.getOperand(3).getReg(), MIB,
3317                        Pred) ||
3318         !emitCSetForFCmp(I.getOperand(0).getReg(), Pred, MIB))
3319       return false;
3320     I.eraseFromParent();
3321     return true;
3322   }
3323   case TargetOpcode::G_VASTART:
3324     return STI.isTargetDarwin() ? selectVaStartDarwin(I, MF, MRI)
3325                                 : selectVaStartAAPCS(I, MF, MRI);
3326   case TargetOpcode::G_INTRINSIC:
3327     return selectIntrinsic(I, MRI);
3328   case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
3329     return selectIntrinsicWithSideEffects(I, MRI);
3330   case TargetOpcode::G_IMPLICIT_DEF: {
3331     I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
3332     const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
3333     const Register DstReg = I.getOperand(0).getReg();
3334     const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
3335     const TargetRegisterClass *DstRC =
3336         getRegClassForTypeOnBank(DstTy, DstRB, RBI);
3337     RBI.constrainGenericRegister(DstReg, *DstRC, MRI);
3338     return true;
3339   }
3340   case TargetOpcode::G_BLOCK_ADDR: {
3341     if (TM.getCodeModel() == CodeModel::Large) {
3342       materializeLargeCMVal(I, I.getOperand(1).getBlockAddress(), 0);
3343       I.eraseFromParent();
3344       return true;
3345     } else {
3346       I.setDesc(TII.get(AArch64::MOVaddrBA));
3347       auto MovMI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::MOVaddrBA),
3348                            I.getOperand(0).getReg())
3349                        .addBlockAddress(I.getOperand(1).getBlockAddress(),
3350                                         /* Offset */ 0, AArch64II::MO_PAGE)
3351                        .addBlockAddress(
3352                            I.getOperand(1).getBlockAddress(), /* Offset */ 0,
3353                            AArch64II::MO_NC | AArch64II::MO_PAGEOFF);
3354       I.eraseFromParent();
3355       return constrainSelectedInstRegOperands(*MovMI, TII, TRI, RBI);
3356     }
3357   }
3358   case AArch64::G_DUP: {
3359     // When the scalar of G_DUP is an s8/s16 gpr, they can't be selected by
3360     // imported patterns. Do it manually here. Avoiding generating s16 gpr is
3361     // difficult because at RBS we may end up pessimizing the fpr case if we
3362     // decided to add an anyextend to fix this. Manual selection is the most
3363     // robust solution for now.
3364     if (RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI)->getID() !=
3365         AArch64::GPRRegBankID)
3366       return false; // We expect the fpr regbank case to be imported.
3367     LLT VecTy = MRI.getType(I.getOperand(0).getReg());
3368     if (VecTy == LLT::fixed_vector(8, 8))
3369       I.setDesc(TII.get(AArch64::DUPv8i8gpr));
3370     else if (VecTy == LLT::fixed_vector(16, 8))
3371       I.setDesc(TII.get(AArch64::DUPv16i8gpr));
3372     else if (VecTy == LLT::fixed_vector(4, 16))
3373       I.setDesc(TII.get(AArch64::DUPv4i16gpr));
3374     else if (VecTy == LLT::fixed_vector(8, 16))
3375       I.setDesc(TII.get(AArch64::DUPv8i16gpr));
3376     else
3377       return false;
3378     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3379   }
3380   case TargetOpcode::G_INTRINSIC_TRUNC:
3381     return selectIntrinsicTrunc(I, MRI);
3382   case TargetOpcode::G_INTRINSIC_ROUND:
3383     return selectIntrinsicRound(I, MRI);
3384   case TargetOpcode::G_BUILD_VECTOR:
3385     return selectBuildVector(I, MRI);
3386   case TargetOpcode::G_MERGE_VALUES:
3387     return selectMergeValues(I, MRI);
3388   case TargetOpcode::G_UNMERGE_VALUES:
3389     return selectUnmergeValues(I, MRI);
3390   case TargetOpcode::G_SHUFFLE_VECTOR:
3391     return selectShuffleVector(I, MRI);
3392   case TargetOpcode::G_EXTRACT_VECTOR_ELT:
3393     return selectExtractElt(I, MRI);
3394   case TargetOpcode::G_INSERT_VECTOR_ELT:
3395     return selectInsertElt(I, MRI);
3396   case TargetOpcode::G_CONCAT_VECTORS:
3397     return selectConcatVectors(I, MRI);
3398   case TargetOpcode::G_JUMP_TABLE:
3399     return selectJumpTable(I, MRI);
3400   case TargetOpcode::G_VECREDUCE_FADD:
3401   case TargetOpcode::G_VECREDUCE_ADD:
3402     return selectReduction(I, MRI);
3403   }
3404 
3405   return false;
3406 }
3407 
selectReduction(MachineInstr & I,MachineRegisterInfo & MRI)3408 bool AArch64InstructionSelector::selectReduction(MachineInstr &I,
3409                                                  MachineRegisterInfo &MRI) {
3410   Register VecReg = I.getOperand(1).getReg();
3411   LLT VecTy = MRI.getType(VecReg);
3412   if (I.getOpcode() == TargetOpcode::G_VECREDUCE_ADD) {
3413     // For <2 x i32> ADDPv2i32 generates an FPR64 value, so we need to emit
3414     // a subregister copy afterwards.
3415     if (VecTy == LLT::fixed_vector(2, 32)) {
3416       Register DstReg = I.getOperand(0).getReg();
3417       auto AddP = MIB.buildInstr(AArch64::ADDPv2i32, {&AArch64::FPR64RegClass},
3418                                  {VecReg, VecReg});
3419       auto Copy = MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {})
3420                       .addReg(AddP.getReg(0), 0, AArch64::ssub)
3421                       .getReg(0);
3422       RBI.constrainGenericRegister(Copy, AArch64::FPR32RegClass, MRI);
3423       I.eraseFromParent();
3424       return constrainSelectedInstRegOperands(*AddP, TII, TRI, RBI);
3425     }
3426 
3427     unsigned Opc = 0;
3428     if (VecTy == LLT::fixed_vector(16, 8))
3429       Opc = AArch64::ADDVv16i8v;
3430     else if (VecTy == LLT::fixed_vector(8, 16))
3431       Opc = AArch64::ADDVv8i16v;
3432     else if (VecTy == LLT::fixed_vector(4, 32))
3433       Opc = AArch64::ADDVv4i32v;
3434     else if (VecTy == LLT::fixed_vector(2, 64))
3435       Opc = AArch64::ADDPv2i64p;
3436     else {
3437       LLVM_DEBUG(dbgs() << "Unhandled type for add reduction");
3438       return false;
3439     }
3440     I.setDesc(TII.get(Opc));
3441     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3442   }
3443 
3444   if (I.getOpcode() == TargetOpcode::G_VECREDUCE_FADD) {
3445     unsigned Opc = 0;
3446     if (VecTy == LLT::fixed_vector(2, 32))
3447       Opc = AArch64::FADDPv2i32p;
3448     else if (VecTy == LLT::fixed_vector(2, 64))
3449       Opc = AArch64::FADDPv2i64p;
3450     else {
3451       LLVM_DEBUG(dbgs() << "Unhandled type for fadd reduction");
3452       return false;
3453     }
3454     I.setDesc(TII.get(Opc));
3455     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3456   }
3457   return false;
3458 }
3459 
selectBrJT(MachineInstr & I,MachineRegisterInfo & MRI)3460 bool AArch64InstructionSelector::selectBrJT(MachineInstr &I,
3461                                             MachineRegisterInfo &MRI) {
3462   assert(I.getOpcode() == TargetOpcode::G_BRJT && "Expected G_BRJT");
3463   Register JTAddr = I.getOperand(0).getReg();
3464   unsigned JTI = I.getOperand(1).getIndex();
3465   Register Index = I.getOperand(2).getReg();
3466 
3467   Register TargetReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
3468   Register ScratchReg = MRI.createVirtualRegister(&AArch64::GPR64spRegClass);
3469 
3470   MF->getInfo<AArch64FunctionInfo>()->setJumpTableEntryInfo(JTI, 4, nullptr);
3471   auto JumpTableInst = MIB.buildInstr(AArch64::JumpTableDest32,
3472                                       {TargetReg, ScratchReg}, {JTAddr, Index})
3473                            .addJumpTableIndex(JTI);
3474   // Build the indirect branch.
3475   MIB.buildInstr(AArch64::BR, {}, {TargetReg});
3476   I.eraseFromParent();
3477   return constrainSelectedInstRegOperands(*JumpTableInst, TII, TRI, RBI);
3478 }
3479 
selectJumpTable(MachineInstr & I,MachineRegisterInfo & MRI)3480 bool AArch64InstructionSelector::selectJumpTable(MachineInstr &I,
3481                                                  MachineRegisterInfo &MRI) {
3482   assert(I.getOpcode() == TargetOpcode::G_JUMP_TABLE && "Expected jump table");
3483   assert(I.getOperand(1).isJTI() && "Jump table op should have a JTI!");
3484 
3485   Register DstReg = I.getOperand(0).getReg();
3486   unsigned JTI = I.getOperand(1).getIndex();
3487   // We generate a MOVaddrJT which will get expanded to an ADRP + ADD later.
3488   auto MovMI =
3489     MIB.buildInstr(AArch64::MOVaddrJT, {DstReg}, {})
3490           .addJumpTableIndex(JTI, AArch64II::MO_PAGE)
3491           .addJumpTableIndex(JTI, AArch64II::MO_NC | AArch64II::MO_PAGEOFF);
3492   I.eraseFromParent();
3493   return constrainSelectedInstRegOperands(*MovMI, TII, TRI, RBI);
3494 }
3495 
selectTLSGlobalValue(MachineInstr & I,MachineRegisterInfo & MRI)3496 bool AArch64InstructionSelector::selectTLSGlobalValue(
3497     MachineInstr &I, MachineRegisterInfo &MRI) {
3498   if (!STI.isTargetMachO())
3499     return false;
3500   MachineFunction &MF = *I.getParent()->getParent();
3501   MF.getFrameInfo().setAdjustsStack(true);
3502 
3503   const auto &GlobalOp = I.getOperand(1);
3504   assert(GlobalOp.getOffset() == 0 &&
3505          "Shouldn't have an offset on TLS globals!");
3506   const GlobalValue &GV = *GlobalOp.getGlobal();
3507 
3508   auto LoadGOT =
3509       MIB.buildInstr(AArch64::LOADgot, {&AArch64::GPR64commonRegClass}, {})
3510           .addGlobalAddress(&GV, 0, AArch64II::MO_TLS);
3511 
3512   auto Load = MIB.buildInstr(AArch64::LDRXui, {&AArch64::GPR64commonRegClass},
3513                              {LoadGOT.getReg(0)})
3514                   .addImm(0);
3515 
3516   MIB.buildCopy(Register(AArch64::X0), LoadGOT.getReg(0));
3517   // TLS calls preserve all registers except those that absolutely must be
3518   // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
3519   // silly).
3520   MIB.buildInstr(getBLRCallOpcode(MF), {}, {Load})
3521       .addUse(AArch64::X0, RegState::Implicit)
3522       .addDef(AArch64::X0, RegState::Implicit)
3523       .addRegMask(TRI.getTLSCallPreservedMask());
3524 
3525   MIB.buildCopy(I.getOperand(0).getReg(), Register(AArch64::X0));
3526   RBI.constrainGenericRegister(I.getOperand(0).getReg(), AArch64::GPR64RegClass,
3527                                MRI);
3528   I.eraseFromParent();
3529   return true;
3530 }
3531 
selectIntrinsicTrunc(MachineInstr & I,MachineRegisterInfo & MRI) const3532 bool AArch64InstructionSelector::selectIntrinsicTrunc(
3533     MachineInstr &I, MachineRegisterInfo &MRI) const {
3534   const LLT SrcTy = MRI.getType(I.getOperand(0).getReg());
3535 
3536   // Select the correct opcode.
3537   unsigned Opc = 0;
3538   if (!SrcTy.isVector()) {
3539     switch (SrcTy.getSizeInBits()) {
3540     default:
3541     case 16:
3542       Opc = AArch64::FRINTZHr;
3543       break;
3544     case 32:
3545       Opc = AArch64::FRINTZSr;
3546       break;
3547     case 64:
3548       Opc = AArch64::FRINTZDr;
3549       break;
3550     }
3551   } else {
3552     unsigned NumElts = SrcTy.getNumElements();
3553     switch (SrcTy.getElementType().getSizeInBits()) {
3554     default:
3555       break;
3556     case 16:
3557       if (NumElts == 4)
3558         Opc = AArch64::FRINTZv4f16;
3559       else if (NumElts == 8)
3560         Opc = AArch64::FRINTZv8f16;
3561       break;
3562     case 32:
3563       if (NumElts == 2)
3564         Opc = AArch64::FRINTZv2f32;
3565       else if (NumElts == 4)
3566         Opc = AArch64::FRINTZv4f32;
3567       break;
3568     case 64:
3569       if (NumElts == 2)
3570         Opc = AArch64::FRINTZv2f64;
3571       break;
3572     }
3573   }
3574 
3575   if (!Opc) {
3576     // Didn't get an opcode above, bail.
3577     LLVM_DEBUG(dbgs() << "Unsupported type for G_INTRINSIC_TRUNC!\n");
3578     return false;
3579   }
3580 
3581   // Legalization would have set us up perfectly for this; we just need to
3582   // set the opcode and move on.
3583   I.setDesc(TII.get(Opc));
3584   return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3585 }
3586 
selectIntrinsicRound(MachineInstr & I,MachineRegisterInfo & MRI) const3587 bool AArch64InstructionSelector::selectIntrinsicRound(
3588     MachineInstr &I, MachineRegisterInfo &MRI) const {
3589   const LLT SrcTy = MRI.getType(I.getOperand(0).getReg());
3590 
3591   // Select the correct opcode.
3592   unsigned Opc = 0;
3593   if (!SrcTy.isVector()) {
3594     switch (SrcTy.getSizeInBits()) {
3595     default:
3596     case 16:
3597       Opc = AArch64::FRINTAHr;
3598       break;
3599     case 32:
3600       Opc = AArch64::FRINTASr;
3601       break;
3602     case 64:
3603       Opc = AArch64::FRINTADr;
3604       break;
3605     }
3606   } else {
3607     unsigned NumElts = SrcTy.getNumElements();
3608     switch (SrcTy.getElementType().getSizeInBits()) {
3609     default:
3610       break;
3611     case 16:
3612       if (NumElts == 4)
3613         Opc = AArch64::FRINTAv4f16;
3614       else if (NumElts == 8)
3615         Opc = AArch64::FRINTAv8f16;
3616       break;
3617     case 32:
3618       if (NumElts == 2)
3619         Opc = AArch64::FRINTAv2f32;
3620       else if (NumElts == 4)
3621         Opc = AArch64::FRINTAv4f32;
3622       break;
3623     case 64:
3624       if (NumElts == 2)
3625         Opc = AArch64::FRINTAv2f64;
3626       break;
3627     }
3628   }
3629 
3630   if (!Opc) {
3631     // Didn't get an opcode above, bail.
3632     LLVM_DEBUG(dbgs() << "Unsupported type for G_INTRINSIC_ROUND!\n");
3633     return false;
3634   }
3635 
3636   // Legalization would have set us up perfectly for this; we just need to
3637   // set the opcode and move on.
3638   I.setDesc(TII.get(Opc));
3639   return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
3640 }
3641 
selectVectorICmp(MachineInstr & I,MachineRegisterInfo & MRI)3642 bool AArch64InstructionSelector::selectVectorICmp(
3643     MachineInstr &I, MachineRegisterInfo &MRI) {
3644   Register DstReg = I.getOperand(0).getReg();
3645   LLT DstTy = MRI.getType(DstReg);
3646   Register SrcReg = I.getOperand(2).getReg();
3647   Register Src2Reg = I.getOperand(3).getReg();
3648   LLT SrcTy = MRI.getType(SrcReg);
3649 
3650   unsigned SrcEltSize = SrcTy.getElementType().getSizeInBits();
3651   unsigned NumElts = DstTy.getNumElements();
3652 
3653   // First index is element size, 0 == 8b, 1 == 16b, 2 == 32b, 3 == 64b
3654   // Second index is num elts, 0 == v2, 1 == v4, 2 == v8, 3 == v16
3655   // Third index is cc opcode:
3656   // 0 == eq
3657   // 1 == ugt
3658   // 2 == uge
3659   // 3 == ult
3660   // 4 == ule
3661   // 5 == sgt
3662   // 6 == sge
3663   // 7 == slt
3664   // 8 == sle
3665   // ne is done by negating 'eq' result.
3666 
3667   // This table below assumes that for some comparisons the operands will be
3668   // commuted.
3669   // ult op == commute + ugt op
3670   // ule op == commute + uge op
3671   // slt op == commute + sgt op
3672   // sle op == commute + sge op
3673   unsigned PredIdx = 0;
3674   bool SwapOperands = false;
3675   CmpInst::Predicate Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();
3676   switch (Pred) {
3677   case CmpInst::ICMP_NE:
3678   case CmpInst::ICMP_EQ:
3679     PredIdx = 0;
3680     break;
3681   case CmpInst::ICMP_UGT:
3682     PredIdx = 1;
3683     break;
3684   case CmpInst::ICMP_UGE:
3685     PredIdx = 2;
3686     break;
3687   case CmpInst::ICMP_ULT:
3688     PredIdx = 3;
3689     SwapOperands = true;
3690     break;
3691   case CmpInst::ICMP_ULE:
3692     PredIdx = 4;
3693     SwapOperands = true;
3694     break;
3695   case CmpInst::ICMP_SGT:
3696     PredIdx = 5;
3697     break;
3698   case CmpInst::ICMP_SGE:
3699     PredIdx = 6;
3700     break;
3701   case CmpInst::ICMP_SLT:
3702     PredIdx = 7;
3703     SwapOperands = true;
3704     break;
3705   case CmpInst::ICMP_SLE:
3706     PredIdx = 8;
3707     SwapOperands = true;
3708     break;
3709   default:
3710     llvm_unreachable("Unhandled icmp predicate");
3711     return false;
3712   }
3713 
3714   // This table obviously should be tablegen'd when we have our GISel native
3715   // tablegen selector.
3716 
3717   static const unsigned OpcTable[4][4][9] = {
3718       {
3719           {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3720            0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3721            0 /* invalid */},
3722           {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3723            0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3724            0 /* invalid */},
3725           {AArch64::CMEQv8i8, AArch64::CMHIv8i8, AArch64::CMHSv8i8,
3726            AArch64::CMHIv8i8, AArch64::CMHSv8i8, AArch64::CMGTv8i8,
3727            AArch64::CMGEv8i8, AArch64::CMGTv8i8, AArch64::CMGEv8i8},
3728           {AArch64::CMEQv16i8, AArch64::CMHIv16i8, AArch64::CMHSv16i8,
3729            AArch64::CMHIv16i8, AArch64::CMHSv16i8, AArch64::CMGTv16i8,
3730            AArch64::CMGEv16i8, AArch64::CMGTv16i8, AArch64::CMGEv16i8}
3731       },
3732       {
3733           {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3734            0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3735            0 /* invalid */},
3736           {AArch64::CMEQv4i16, AArch64::CMHIv4i16, AArch64::CMHSv4i16,
3737            AArch64::CMHIv4i16, AArch64::CMHSv4i16, AArch64::CMGTv4i16,
3738            AArch64::CMGEv4i16, AArch64::CMGTv4i16, AArch64::CMGEv4i16},
3739           {AArch64::CMEQv8i16, AArch64::CMHIv8i16, AArch64::CMHSv8i16,
3740            AArch64::CMHIv8i16, AArch64::CMHSv8i16, AArch64::CMGTv8i16,
3741            AArch64::CMGEv8i16, AArch64::CMGTv8i16, AArch64::CMGEv8i16},
3742           {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3743            0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3744            0 /* invalid */}
3745       },
3746       {
3747           {AArch64::CMEQv2i32, AArch64::CMHIv2i32, AArch64::CMHSv2i32,
3748            AArch64::CMHIv2i32, AArch64::CMHSv2i32, AArch64::CMGTv2i32,
3749            AArch64::CMGEv2i32, AArch64::CMGTv2i32, AArch64::CMGEv2i32},
3750           {AArch64::CMEQv4i32, AArch64::CMHIv4i32, AArch64::CMHSv4i32,
3751            AArch64::CMHIv4i32, AArch64::CMHSv4i32, AArch64::CMGTv4i32,
3752            AArch64::CMGEv4i32, AArch64::CMGTv4i32, AArch64::CMGEv4i32},
3753           {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3754            0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3755            0 /* invalid */},
3756           {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3757            0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3758            0 /* invalid */}
3759       },
3760       {
3761           {AArch64::CMEQv2i64, AArch64::CMHIv2i64, AArch64::CMHSv2i64,
3762            AArch64::CMHIv2i64, AArch64::CMHSv2i64, AArch64::CMGTv2i64,
3763            AArch64::CMGEv2i64, AArch64::CMGTv2i64, AArch64::CMGEv2i64},
3764           {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3765            0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3766            0 /* invalid */},
3767           {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3768            0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3769            0 /* invalid */},
3770           {0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3771            0 /* invalid */, 0 /* invalid */, 0 /* invalid */, 0 /* invalid */,
3772            0 /* invalid */}
3773       },
3774   };
3775   unsigned EltIdx = Log2_32(SrcEltSize / 8);
3776   unsigned NumEltsIdx = Log2_32(NumElts / 2);
3777   unsigned Opc = OpcTable[EltIdx][NumEltsIdx][PredIdx];
3778   if (!Opc) {
3779     LLVM_DEBUG(dbgs() << "Could not map G_ICMP to cmp opcode");
3780     return false;
3781   }
3782 
3783   const RegisterBank &VecRB = *RBI.getRegBank(SrcReg, MRI, TRI);
3784   const TargetRegisterClass *SrcRC =
3785       getRegClassForTypeOnBank(SrcTy, VecRB, RBI, true);
3786   if (!SrcRC) {
3787     LLVM_DEBUG(dbgs() << "Could not determine source register class.\n");
3788     return false;
3789   }
3790 
3791   unsigned NotOpc = Pred == ICmpInst::ICMP_NE ? AArch64::NOTv8i8 : 0;
3792   if (SrcTy.getSizeInBits() == 128)
3793     NotOpc = NotOpc ? AArch64::NOTv16i8 : 0;
3794 
3795   if (SwapOperands)
3796     std::swap(SrcReg, Src2Reg);
3797 
3798   auto Cmp = MIB.buildInstr(Opc, {SrcRC}, {SrcReg, Src2Reg});
3799   constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI);
3800 
3801   // Invert if we had a 'ne' cc.
3802   if (NotOpc) {
3803     Cmp = MIB.buildInstr(NotOpc, {DstReg}, {Cmp});
3804     constrainSelectedInstRegOperands(*Cmp, TII, TRI, RBI);
3805   } else {
3806     MIB.buildCopy(DstReg, Cmp.getReg(0));
3807   }
3808   RBI.constrainGenericRegister(DstReg, *SrcRC, MRI);
3809   I.eraseFromParent();
3810   return true;
3811 }
3812 
emitScalarToVector(unsigned EltSize,const TargetRegisterClass * DstRC,Register Scalar,MachineIRBuilder & MIRBuilder) const3813 MachineInstr *AArch64InstructionSelector::emitScalarToVector(
3814     unsigned EltSize, const TargetRegisterClass *DstRC, Register Scalar,
3815     MachineIRBuilder &MIRBuilder) const {
3816   auto Undef = MIRBuilder.buildInstr(TargetOpcode::IMPLICIT_DEF, {DstRC}, {});
3817 
3818   auto BuildFn = [&](unsigned SubregIndex) {
3819     auto Ins =
3820         MIRBuilder
3821             .buildInstr(TargetOpcode::INSERT_SUBREG, {DstRC}, {Undef, Scalar})
3822             .addImm(SubregIndex);
3823     constrainSelectedInstRegOperands(*Undef, TII, TRI, RBI);
3824     constrainSelectedInstRegOperands(*Ins, TII, TRI, RBI);
3825     return &*Ins;
3826   };
3827 
3828   switch (EltSize) {
3829   case 16:
3830     return BuildFn(AArch64::hsub);
3831   case 32:
3832     return BuildFn(AArch64::ssub);
3833   case 64:
3834     return BuildFn(AArch64::dsub);
3835   default:
3836     return nullptr;
3837   }
3838 }
3839 
selectMergeValues(MachineInstr & I,MachineRegisterInfo & MRI)3840 bool AArch64InstructionSelector::selectMergeValues(
3841     MachineInstr &I, MachineRegisterInfo &MRI) {
3842   assert(I.getOpcode() == TargetOpcode::G_MERGE_VALUES && "unexpected opcode");
3843   const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
3844   const LLT SrcTy = MRI.getType(I.getOperand(1).getReg());
3845   assert(!DstTy.isVector() && !SrcTy.isVector() && "invalid merge operation");
3846   const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI);
3847 
3848   if (I.getNumOperands() != 3)
3849     return false;
3850 
3851   // Merging 2 s64s into an s128.
3852   if (DstTy == LLT::scalar(128)) {
3853     if (SrcTy.getSizeInBits() != 64)
3854       return false;
3855     Register DstReg = I.getOperand(0).getReg();
3856     Register Src1Reg = I.getOperand(1).getReg();
3857     Register Src2Reg = I.getOperand(2).getReg();
3858     auto Tmp = MIB.buildInstr(TargetOpcode::IMPLICIT_DEF, {DstTy}, {});
3859     MachineInstr *InsMI =
3860         emitLaneInsert(None, Tmp.getReg(0), Src1Reg, /* LaneIdx */ 0, RB, MIB);
3861     if (!InsMI)
3862       return false;
3863     MachineInstr *Ins2MI = emitLaneInsert(DstReg, InsMI->getOperand(0).getReg(),
3864                                           Src2Reg, /* LaneIdx */ 1, RB, MIB);
3865     if (!Ins2MI)
3866       return false;
3867     constrainSelectedInstRegOperands(*InsMI, TII, TRI, RBI);
3868     constrainSelectedInstRegOperands(*Ins2MI, TII, TRI, RBI);
3869     I.eraseFromParent();
3870     return true;
3871   }
3872 
3873   if (RB.getID() != AArch64::GPRRegBankID)
3874     return false;
3875 
3876   if (DstTy.getSizeInBits() != 64 || SrcTy.getSizeInBits() != 32)
3877     return false;
3878 
3879   auto *DstRC = &AArch64::GPR64RegClass;
3880   Register SubToRegDef = MRI.createVirtualRegister(DstRC);
3881   MachineInstr &SubRegMI = *BuildMI(*I.getParent(), I, I.getDebugLoc(),
3882                                     TII.get(TargetOpcode::SUBREG_TO_REG))
3883                                 .addDef(SubToRegDef)
3884                                 .addImm(0)
3885                                 .addUse(I.getOperand(1).getReg())
3886                                 .addImm(AArch64::sub_32);
3887   Register SubToRegDef2 = MRI.createVirtualRegister(DstRC);
3888   // Need to anyext the second scalar before we can use bfm
3889   MachineInstr &SubRegMI2 = *BuildMI(*I.getParent(), I, I.getDebugLoc(),
3890                                     TII.get(TargetOpcode::SUBREG_TO_REG))
3891                                 .addDef(SubToRegDef2)
3892                                 .addImm(0)
3893                                 .addUse(I.getOperand(2).getReg())
3894                                 .addImm(AArch64::sub_32);
3895   MachineInstr &BFM =
3896       *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::BFMXri))
3897            .addDef(I.getOperand(0).getReg())
3898            .addUse(SubToRegDef)
3899            .addUse(SubToRegDef2)
3900            .addImm(32)
3901            .addImm(31);
3902   constrainSelectedInstRegOperands(SubRegMI, TII, TRI, RBI);
3903   constrainSelectedInstRegOperands(SubRegMI2, TII, TRI, RBI);
3904   constrainSelectedInstRegOperands(BFM, TII, TRI, RBI);
3905   I.eraseFromParent();
3906   return true;
3907 }
3908 
getLaneCopyOpcode(unsigned & CopyOpc,unsigned & ExtractSubReg,const unsigned EltSize)3909 static bool getLaneCopyOpcode(unsigned &CopyOpc, unsigned &ExtractSubReg,
3910                               const unsigned EltSize) {
3911   // Choose a lane copy opcode and subregister based off of the size of the
3912   // vector's elements.
3913   switch (EltSize) {
3914   case 8:
3915     CopyOpc = AArch64::CPYi8;
3916     ExtractSubReg = AArch64::bsub;
3917     break;
3918   case 16:
3919     CopyOpc = AArch64::CPYi16;
3920     ExtractSubReg = AArch64::hsub;
3921     break;
3922   case 32:
3923     CopyOpc = AArch64::CPYi32;
3924     ExtractSubReg = AArch64::ssub;
3925     break;
3926   case 64:
3927     CopyOpc = AArch64::CPYi64;
3928     ExtractSubReg = AArch64::dsub;
3929     break;
3930   default:
3931     // Unknown size, bail out.
3932     LLVM_DEBUG(dbgs() << "Elt size '" << EltSize << "' unsupported.\n");
3933     return false;
3934   }
3935   return true;
3936 }
3937 
emitExtractVectorElt(Optional<Register> DstReg,const RegisterBank & DstRB,LLT ScalarTy,Register VecReg,unsigned LaneIdx,MachineIRBuilder & MIRBuilder) const3938 MachineInstr *AArch64InstructionSelector::emitExtractVectorElt(
3939     Optional<Register> DstReg, const RegisterBank &DstRB, LLT ScalarTy,
3940     Register VecReg, unsigned LaneIdx, MachineIRBuilder &MIRBuilder) const {
3941   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
3942   unsigned CopyOpc = 0;
3943   unsigned ExtractSubReg = 0;
3944   if (!getLaneCopyOpcode(CopyOpc, ExtractSubReg, ScalarTy.getSizeInBits())) {
3945     LLVM_DEBUG(
3946         dbgs() << "Couldn't determine lane copy opcode for instruction.\n");
3947     return nullptr;
3948   }
3949 
3950   const TargetRegisterClass *DstRC =
3951       getRegClassForTypeOnBank(ScalarTy, DstRB, RBI, true);
3952   if (!DstRC) {
3953     LLVM_DEBUG(dbgs() << "Could not determine destination register class.\n");
3954     return nullptr;
3955   }
3956 
3957   const RegisterBank &VecRB = *RBI.getRegBank(VecReg, MRI, TRI);
3958   const LLT &VecTy = MRI.getType(VecReg);
3959   const TargetRegisterClass *VecRC =
3960       getRegClassForTypeOnBank(VecTy, VecRB, RBI, true);
3961   if (!VecRC) {
3962     LLVM_DEBUG(dbgs() << "Could not determine source register class.\n");
3963     return nullptr;
3964   }
3965 
3966   // The register that we're going to copy into.
3967   Register InsertReg = VecReg;
3968   if (!DstReg)
3969     DstReg = MRI.createVirtualRegister(DstRC);
3970   // If the lane index is 0, we just use a subregister COPY.
3971   if (LaneIdx == 0) {
3972     auto Copy = MIRBuilder.buildInstr(TargetOpcode::COPY, {*DstReg}, {})
3973                     .addReg(VecReg, 0, ExtractSubReg);
3974     RBI.constrainGenericRegister(*DstReg, *DstRC, MRI);
3975     return &*Copy;
3976   }
3977 
3978   // Lane copies require 128-bit wide registers. If we're dealing with an
3979   // unpacked vector, then we need to move up to that width. Insert an implicit
3980   // def and a subregister insert to get us there.
3981   if (VecTy.getSizeInBits() != 128) {
3982     MachineInstr *ScalarToVector = emitScalarToVector(
3983         VecTy.getSizeInBits(), &AArch64::FPR128RegClass, VecReg, MIRBuilder);
3984     if (!ScalarToVector)
3985       return nullptr;
3986     InsertReg = ScalarToVector->getOperand(0).getReg();
3987   }
3988 
3989   MachineInstr *LaneCopyMI =
3990       MIRBuilder.buildInstr(CopyOpc, {*DstReg}, {InsertReg}).addImm(LaneIdx);
3991   constrainSelectedInstRegOperands(*LaneCopyMI, TII, TRI, RBI);
3992 
3993   // Make sure that we actually constrain the initial copy.
3994   RBI.constrainGenericRegister(*DstReg, *DstRC, MRI);
3995   return LaneCopyMI;
3996 }
3997 
selectExtractElt(MachineInstr & I,MachineRegisterInfo & MRI)3998 bool AArch64InstructionSelector::selectExtractElt(
3999     MachineInstr &I, MachineRegisterInfo &MRI) {
4000   assert(I.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT &&
4001          "unexpected opcode!");
4002   Register DstReg = I.getOperand(0).getReg();
4003   const LLT NarrowTy = MRI.getType(DstReg);
4004   const Register SrcReg = I.getOperand(1).getReg();
4005   const LLT WideTy = MRI.getType(SrcReg);
4006   (void)WideTy;
4007   assert(WideTy.getSizeInBits() >= NarrowTy.getSizeInBits() &&
4008          "source register size too small!");
4009   assert(!NarrowTy.isVector() && "cannot extract vector into vector!");
4010 
4011   // Need the lane index to determine the correct copy opcode.
4012   MachineOperand &LaneIdxOp = I.getOperand(2);
4013   assert(LaneIdxOp.isReg() && "Lane index operand was not a register?");
4014 
4015   if (RBI.getRegBank(DstReg, MRI, TRI)->getID() != AArch64::FPRRegBankID) {
4016     LLVM_DEBUG(dbgs() << "Cannot extract into GPR.\n");
4017     return false;
4018   }
4019 
4020   // Find the index to extract from.
4021   auto VRegAndVal = getIConstantVRegValWithLookThrough(LaneIdxOp.getReg(), MRI);
4022   if (!VRegAndVal)
4023     return false;
4024   unsigned LaneIdx = VRegAndVal->Value.getSExtValue();
4025 
4026 
4027   const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
4028   MachineInstr *Extract = emitExtractVectorElt(DstReg, DstRB, NarrowTy, SrcReg,
4029                                                LaneIdx, MIB);
4030   if (!Extract)
4031     return false;
4032 
4033   I.eraseFromParent();
4034   return true;
4035 }
4036 
selectSplitVectorUnmerge(MachineInstr & I,MachineRegisterInfo & MRI)4037 bool AArch64InstructionSelector::selectSplitVectorUnmerge(
4038     MachineInstr &I, MachineRegisterInfo &MRI) {
4039   unsigned NumElts = I.getNumOperands() - 1;
4040   Register SrcReg = I.getOperand(NumElts).getReg();
4041   const LLT NarrowTy = MRI.getType(I.getOperand(0).getReg());
4042   const LLT SrcTy = MRI.getType(SrcReg);
4043 
4044   assert(NarrowTy.isVector() && "Expected an unmerge into vectors");
4045   if (SrcTy.getSizeInBits() > 128) {
4046     LLVM_DEBUG(dbgs() << "Unexpected vector type for vec split unmerge");
4047     return false;
4048   }
4049 
4050   // We implement a split vector operation by treating the sub-vectors as
4051   // scalars and extracting them.
4052   const RegisterBank &DstRB =
4053       *RBI.getRegBank(I.getOperand(0).getReg(), MRI, TRI);
4054   for (unsigned OpIdx = 0; OpIdx < NumElts; ++OpIdx) {
4055     Register Dst = I.getOperand(OpIdx).getReg();
4056     MachineInstr *Extract =
4057         emitExtractVectorElt(Dst, DstRB, NarrowTy, SrcReg, OpIdx, MIB);
4058     if (!Extract)
4059       return false;
4060   }
4061   I.eraseFromParent();
4062   return true;
4063 }
4064 
selectUnmergeValues(MachineInstr & I,MachineRegisterInfo & MRI)4065 bool AArch64InstructionSelector::selectUnmergeValues(MachineInstr &I,
4066                                                      MachineRegisterInfo &MRI) {
4067   assert(I.getOpcode() == TargetOpcode::G_UNMERGE_VALUES &&
4068          "unexpected opcode");
4069 
4070   // TODO: Handle unmerging into GPRs and from scalars to scalars.
4071   if (RBI.getRegBank(I.getOperand(0).getReg(), MRI, TRI)->getID() !=
4072           AArch64::FPRRegBankID ||
4073       RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI)->getID() !=
4074           AArch64::FPRRegBankID) {
4075     LLVM_DEBUG(dbgs() << "Unmerging vector-to-gpr and scalar-to-scalar "
4076                          "currently unsupported.\n");
4077     return false;
4078   }
4079 
4080   // The last operand is the vector source register, and every other operand is
4081   // a register to unpack into.
4082   unsigned NumElts = I.getNumOperands() - 1;
4083   Register SrcReg = I.getOperand(NumElts).getReg();
4084   const LLT NarrowTy = MRI.getType(I.getOperand(0).getReg());
4085   const LLT WideTy = MRI.getType(SrcReg);
4086   (void)WideTy;
4087   assert((WideTy.isVector() || WideTy.getSizeInBits() == 128) &&
4088          "can only unmerge from vector or s128 types!");
4089   assert(WideTy.getSizeInBits() > NarrowTy.getSizeInBits() &&
4090          "source register size too small!");
4091 
4092   if (!NarrowTy.isScalar())
4093     return selectSplitVectorUnmerge(I, MRI);
4094 
4095   // Choose a lane copy opcode and subregister based off of the size of the
4096   // vector's elements.
4097   unsigned CopyOpc = 0;
4098   unsigned ExtractSubReg = 0;
4099   if (!getLaneCopyOpcode(CopyOpc, ExtractSubReg, NarrowTy.getSizeInBits()))
4100     return false;
4101 
4102   // Set up for the lane copies.
4103   MachineBasicBlock &MBB = *I.getParent();
4104 
4105   // Stores the registers we'll be copying from.
4106   SmallVector<Register, 4> InsertRegs;
4107 
4108   // We'll use the first register twice, so we only need NumElts-1 registers.
4109   unsigned NumInsertRegs = NumElts - 1;
4110 
4111   // If our elements fit into exactly 128 bits, then we can copy from the source
4112   // directly. Otherwise, we need to do a bit of setup with some subregister
4113   // inserts.
4114   if (NarrowTy.getSizeInBits() * NumElts == 128) {
4115     InsertRegs = SmallVector<Register, 4>(NumInsertRegs, SrcReg);
4116   } else {
4117     // No. We have to perform subregister inserts. For each insert, create an
4118     // implicit def and a subregister insert, and save the register we create.
4119     const TargetRegisterClass *RC =
4120         getMinClassForRegBank(*RBI.getRegBank(SrcReg, MRI, TRI),
4121                               WideTy.getScalarSizeInBits() * NumElts);
4122     unsigned SubReg = 0;
4123     bool Found = getSubRegForClass(RC, TRI, SubReg);
4124     (void)Found;
4125     assert(Found && "expected to find last operand's subeg idx");
4126     for (unsigned Idx = 0; Idx < NumInsertRegs; ++Idx) {
4127       Register ImpDefReg = MRI.createVirtualRegister(&AArch64::FPR128RegClass);
4128       MachineInstr &ImpDefMI =
4129           *BuildMI(MBB, I, I.getDebugLoc(), TII.get(TargetOpcode::IMPLICIT_DEF),
4130                    ImpDefReg);
4131 
4132       // Now, create the subregister insert from SrcReg.
4133       Register InsertReg = MRI.createVirtualRegister(&AArch64::FPR128RegClass);
4134       MachineInstr &InsMI =
4135           *BuildMI(MBB, I, I.getDebugLoc(),
4136                    TII.get(TargetOpcode::INSERT_SUBREG), InsertReg)
4137                .addUse(ImpDefReg)
4138                .addUse(SrcReg)
4139                .addImm(SubReg);
4140 
4141       constrainSelectedInstRegOperands(ImpDefMI, TII, TRI, RBI);
4142       constrainSelectedInstRegOperands(InsMI, TII, TRI, RBI);
4143 
4144       // Save the register so that we can copy from it after.
4145       InsertRegs.push_back(InsertReg);
4146     }
4147   }
4148 
4149   // Now that we've created any necessary subregister inserts, we can
4150   // create the copies.
4151   //
4152   // Perform the first copy separately as a subregister copy.
4153   Register CopyTo = I.getOperand(0).getReg();
4154   auto FirstCopy = MIB.buildInstr(TargetOpcode::COPY, {CopyTo}, {})
4155                        .addReg(InsertRegs[0], 0, ExtractSubReg);
4156   constrainSelectedInstRegOperands(*FirstCopy, TII, TRI, RBI);
4157 
4158   // Now, perform the remaining copies as vector lane copies.
4159   unsigned LaneIdx = 1;
4160   for (Register InsReg : InsertRegs) {
4161     Register CopyTo = I.getOperand(LaneIdx).getReg();
4162     MachineInstr &CopyInst =
4163         *BuildMI(MBB, I, I.getDebugLoc(), TII.get(CopyOpc), CopyTo)
4164              .addUse(InsReg)
4165              .addImm(LaneIdx);
4166     constrainSelectedInstRegOperands(CopyInst, TII, TRI, RBI);
4167     ++LaneIdx;
4168   }
4169 
4170   // Separately constrain the first copy's destination. Because of the
4171   // limitation in constrainOperandRegClass, we can't guarantee that this will
4172   // actually be constrained. So, do it ourselves using the second operand.
4173   const TargetRegisterClass *RC =
4174       MRI.getRegClassOrNull(I.getOperand(1).getReg());
4175   if (!RC) {
4176     LLVM_DEBUG(dbgs() << "Couldn't constrain copy destination.\n");
4177     return false;
4178   }
4179 
4180   RBI.constrainGenericRegister(CopyTo, *RC, MRI);
4181   I.eraseFromParent();
4182   return true;
4183 }
4184 
selectConcatVectors(MachineInstr & I,MachineRegisterInfo & MRI)4185 bool AArch64InstructionSelector::selectConcatVectors(
4186     MachineInstr &I, MachineRegisterInfo &MRI)  {
4187   assert(I.getOpcode() == TargetOpcode::G_CONCAT_VECTORS &&
4188          "Unexpected opcode");
4189   Register Dst = I.getOperand(0).getReg();
4190   Register Op1 = I.getOperand(1).getReg();
4191   Register Op2 = I.getOperand(2).getReg();
4192   MachineInstr *ConcatMI = emitVectorConcat(Dst, Op1, Op2, MIB);
4193   if (!ConcatMI)
4194     return false;
4195   I.eraseFromParent();
4196   return true;
4197 }
4198 
4199 unsigned
emitConstantPoolEntry(const Constant * CPVal,MachineFunction & MF) const4200 AArch64InstructionSelector::emitConstantPoolEntry(const Constant *CPVal,
4201                                                   MachineFunction &MF) const {
4202   Type *CPTy = CPVal->getType();
4203   Align Alignment = MF.getDataLayout().getPrefTypeAlign(CPTy);
4204 
4205   MachineConstantPool *MCP = MF.getConstantPool();
4206   return MCP->getConstantPoolIndex(CPVal, Alignment);
4207 }
4208 
emitLoadFromConstantPool(const Constant * CPVal,MachineIRBuilder & MIRBuilder) const4209 MachineInstr *AArch64InstructionSelector::emitLoadFromConstantPool(
4210     const Constant *CPVal, MachineIRBuilder &MIRBuilder) const {
4211   auto &MF = MIRBuilder.getMF();
4212   unsigned CPIdx = emitConstantPoolEntry(CPVal, MF);
4213 
4214   auto Adrp =
4215       MIRBuilder.buildInstr(AArch64::ADRP, {&AArch64::GPR64RegClass}, {})
4216           .addConstantPoolIndex(CPIdx, 0, AArch64II::MO_PAGE);
4217 
4218   MachineInstr *LoadMI = nullptr;
4219   MachinePointerInfo PtrInfo = MachinePointerInfo::getConstantPool(MF);
4220   unsigned Size = MIRBuilder.getDataLayout().getTypeStoreSize(CPVal->getType());
4221   switch (Size) {
4222   case 16:
4223     LoadMI =
4224         &*MIRBuilder
4225               .buildInstr(AArch64::LDRQui, {&AArch64::FPR128RegClass}, {Adrp})
4226               .addConstantPoolIndex(CPIdx, 0,
4227                                     AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
4228     break;
4229   case 8:
4230     LoadMI =
4231         &*MIRBuilder
4232               .buildInstr(AArch64::LDRDui, {&AArch64::FPR64RegClass}, {Adrp})
4233               .addConstantPoolIndex(CPIdx, 0,
4234                                     AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
4235     break;
4236   case 4:
4237     LoadMI =
4238         &*MIRBuilder
4239               .buildInstr(AArch64::LDRSui, {&AArch64::FPR32RegClass}, {Adrp})
4240               .addConstantPoolIndex(CPIdx, 0,
4241                                     AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
4242     break;
4243   case 2:
4244     LoadMI =
4245         &*MIRBuilder
4246               .buildInstr(AArch64::LDRHui, {&AArch64::FPR16RegClass}, {Adrp})
4247               .addConstantPoolIndex(CPIdx, 0,
4248                                     AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
4249     break;
4250   default:
4251     LLVM_DEBUG(dbgs() << "Could not load from constant pool of type "
4252                       << *CPVal->getType());
4253     return nullptr;
4254   }
4255   LoadMI->addMemOperand(MF, MF.getMachineMemOperand(PtrInfo,
4256                                                     MachineMemOperand::MOLoad,
4257                                                     Size, Align(Size)));
4258   constrainSelectedInstRegOperands(*Adrp, TII, TRI, RBI);
4259   constrainSelectedInstRegOperands(*LoadMI, TII, TRI, RBI);
4260   return LoadMI;
4261 }
4262 
4263 /// Return an <Opcode, SubregIndex> pair to do an vector elt insert of a given
4264 /// size and RB.
4265 static std::pair<unsigned, unsigned>
getInsertVecEltOpInfo(const RegisterBank & RB,unsigned EltSize)4266 getInsertVecEltOpInfo(const RegisterBank &RB, unsigned EltSize) {
4267   unsigned Opc, SubregIdx;
4268   if (RB.getID() == AArch64::GPRRegBankID) {
4269     if (EltSize == 16) {
4270       Opc = AArch64::INSvi16gpr;
4271       SubregIdx = AArch64::ssub;
4272     } else if (EltSize == 32) {
4273       Opc = AArch64::INSvi32gpr;
4274       SubregIdx = AArch64::ssub;
4275     } else if (EltSize == 64) {
4276       Opc = AArch64::INSvi64gpr;
4277       SubregIdx = AArch64::dsub;
4278     } else {
4279       llvm_unreachable("invalid elt size!");
4280     }
4281   } else {
4282     if (EltSize == 8) {
4283       Opc = AArch64::INSvi8lane;
4284       SubregIdx = AArch64::bsub;
4285     } else if (EltSize == 16) {
4286       Opc = AArch64::INSvi16lane;
4287       SubregIdx = AArch64::hsub;
4288     } else if (EltSize == 32) {
4289       Opc = AArch64::INSvi32lane;
4290       SubregIdx = AArch64::ssub;
4291     } else if (EltSize == 64) {
4292       Opc = AArch64::INSvi64lane;
4293       SubregIdx = AArch64::dsub;
4294     } else {
4295       llvm_unreachable("invalid elt size!");
4296     }
4297   }
4298   return std::make_pair(Opc, SubregIdx);
4299 }
4300 
emitInstr(unsigned Opcode,std::initializer_list<llvm::DstOp> DstOps,std::initializer_list<llvm::SrcOp> SrcOps,MachineIRBuilder & MIRBuilder,const ComplexRendererFns & RenderFns) const4301 MachineInstr *AArch64InstructionSelector::emitInstr(
4302     unsigned Opcode, std::initializer_list<llvm::DstOp> DstOps,
4303     std::initializer_list<llvm::SrcOp> SrcOps, MachineIRBuilder &MIRBuilder,
4304     const ComplexRendererFns &RenderFns) const {
4305   assert(Opcode && "Expected an opcode?");
4306   assert(!isPreISelGenericOpcode(Opcode) &&
4307          "Function should only be used to produce selected instructions!");
4308   auto MI = MIRBuilder.buildInstr(Opcode, DstOps, SrcOps);
4309   if (RenderFns)
4310     for (auto &Fn : *RenderFns)
4311       Fn(MI);
4312   constrainSelectedInstRegOperands(*MI, TII, TRI, RBI);
4313   return &*MI;
4314 }
4315 
emitAddSub(const std::array<std::array<unsigned,2>,5> & AddrModeAndSizeToOpcode,Register Dst,MachineOperand & LHS,MachineOperand & RHS,MachineIRBuilder & MIRBuilder) const4316 MachineInstr *AArch64InstructionSelector::emitAddSub(
4317     const std::array<std::array<unsigned, 2>, 5> &AddrModeAndSizeToOpcode,
4318     Register Dst, MachineOperand &LHS, MachineOperand &RHS,
4319     MachineIRBuilder &MIRBuilder) const {
4320   MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4321   assert(LHS.isReg() && RHS.isReg() && "Expected register operands?");
4322   auto Ty = MRI.getType(LHS.getReg());
4323   assert(!Ty.isVector() && "Expected a scalar or pointer?");
4324   unsigned Size = Ty.getSizeInBits();
4325   assert((Size == 32 || Size == 64) && "Expected a 32-bit or 64-bit type only");
4326   bool Is32Bit = Size == 32;
4327 
4328   // INSTRri form with positive arithmetic immediate.
4329   if (auto Fns = selectArithImmed(RHS))
4330     return emitInstr(AddrModeAndSizeToOpcode[0][Is32Bit], {Dst}, {LHS},
4331                      MIRBuilder, Fns);
4332 
4333   // INSTRri form with negative arithmetic immediate.
4334   if (auto Fns = selectNegArithImmed(RHS))
4335     return emitInstr(AddrModeAndSizeToOpcode[3][Is32Bit], {Dst}, {LHS},
4336                      MIRBuilder, Fns);
4337 
4338   // INSTRrx form.
4339   if (auto Fns = selectArithExtendedRegister(RHS))
4340     return emitInstr(AddrModeAndSizeToOpcode[4][Is32Bit], {Dst}, {LHS},
4341                      MIRBuilder, Fns);
4342 
4343   // INSTRrs form.
4344   if (auto Fns = selectShiftedRegister(RHS))
4345     return emitInstr(AddrModeAndSizeToOpcode[1][Is32Bit], {Dst}, {LHS},
4346                      MIRBuilder, Fns);
4347   return emitInstr(AddrModeAndSizeToOpcode[2][Is32Bit], {Dst}, {LHS, RHS},
4348                    MIRBuilder);
4349 }
4350 
4351 MachineInstr *
emitADD(Register DefReg,MachineOperand & LHS,MachineOperand & RHS,MachineIRBuilder & MIRBuilder) const4352 AArch64InstructionSelector::emitADD(Register DefReg, MachineOperand &LHS,
4353                                     MachineOperand &RHS,
4354                                     MachineIRBuilder &MIRBuilder) const {
4355   const std::array<std::array<unsigned, 2>, 5> OpcTable{
4356       {{AArch64::ADDXri, AArch64::ADDWri},
4357        {AArch64::ADDXrs, AArch64::ADDWrs},
4358        {AArch64::ADDXrr, AArch64::ADDWrr},
4359        {AArch64::SUBXri, AArch64::SUBWri},
4360        {AArch64::ADDXrx, AArch64::ADDWrx}}};
4361   return emitAddSub(OpcTable, DefReg, LHS, RHS, MIRBuilder);
4362 }
4363 
4364 MachineInstr *
emitADDS(Register Dst,MachineOperand & LHS,MachineOperand & RHS,MachineIRBuilder & MIRBuilder) const4365 AArch64InstructionSelector::emitADDS(Register Dst, MachineOperand &LHS,
4366                                      MachineOperand &RHS,
4367                                      MachineIRBuilder &MIRBuilder) const {
4368   const std::array<std::array<unsigned, 2>, 5> OpcTable{
4369       {{AArch64::ADDSXri, AArch64::ADDSWri},
4370        {AArch64::ADDSXrs, AArch64::ADDSWrs},
4371        {AArch64::ADDSXrr, AArch64::ADDSWrr},
4372        {AArch64::SUBSXri, AArch64::SUBSWri},
4373        {AArch64::ADDSXrx, AArch64::ADDSWrx}}};
4374   return emitAddSub(OpcTable, Dst, LHS, RHS, MIRBuilder);
4375 }
4376 
4377 MachineInstr *
emitSUBS(Register Dst,MachineOperand & LHS,MachineOperand & RHS,MachineIRBuilder & MIRBuilder) const4378 AArch64InstructionSelector::emitSUBS(Register Dst, MachineOperand &LHS,
4379                                      MachineOperand &RHS,
4380                                      MachineIRBuilder &MIRBuilder) const {
4381   const std::array<std::array<unsigned, 2>, 5> OpcTable{
4382       {{AArch64::SUBSXri, AArch64::SUBSWri},
4383        {AArch64::SUBSXrs, AArch64::SUBSWrs},
4384        {AArch64::SUBSXrr, AArch64::SUBSWrr},
4385        {AArch64::ADDSXri, AArch64::ADDSWri},
4386        {AArch64::SUBSXrx, AArch64::SUBSWrx}}};
4387   return emitAddSub(OpcTable, Dst, LHS, RHS, MIRBuilder);
4388 }
4389 
4390 MachineInstr *
emitCMN(MachineOperand & LHS,MachineOperand & RHS,MachineIRBuilder & MIRBuilder) const4391 AArch64InstructionSelector::emitCMN(MachineOperand &LHS, MachineOperand &RHS,
4392                                     MachineIRBuilder &MIRBuilder) const {
4393   MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4394   bool Is32Bit = (MRI.getType(LHS.getReg()).getSizeInBits() == 32);
4395   auto RC = Is32Bit ? &AArch64::GPR32RegClass : &AArch64::GPR64RegClass;
4396   return emitADDS(MRI.createVirtualRegister(RC), LHS, RHS, MIRBuilder);
4397 }
4398 
4399 MachineInstr *
emitTST(MachineOperand & LHS,MachineOperand & RHS,MachineIRBuilder & MIRBuilder) const4400 AArch64InstructionSelector::emitTST(MachineOperand &LHS, MachineOperand &RHS,
4401                                     MachineIRBuilder &MIRBuilder) const {
4402   assert(LHS.isReg() && RHS.isReg() && "Expected register operands?");
4403   MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4404   LLT Ty = MRI.getType(LHS.getReg());
4405   unsigned RegSize = Ty.getSizeInBits();
4406   bool Is32Bit = (RegSize == 32);
4407   const unsigned OpcTable[3][2] = {{AArch64::ANDSXri, AArch64::ANDSWri},
4408                                    {AArch64::ANDSXrs, AArch64::ANDSWrs},
4409                                    {AArch64::ANDSXrr, AArch64::ANDSWrr}};
4410   // ANDS needs a logical immediate for its immediate form. Check if we can
4411   // fold one in.
4412   if (auto ValAndVReg = getIConstantVRegValWithLookThrough(RHS.getReg(), MRI)) {
4413     int64_t Imm = ValAndVReg->Value.getSExtValue();
4414 
4415     if (AArch64_AM::isLogicalImmediate(Imm, RegSize)) {
4416       auto TstMI = MIRBuilder.buildInstr(OpcTable[0][Is32Bit], {Ty}, {LHS});
4417       TstMI.addImm(AArch64_AM::encodeLogicalImmediate(Imm, RegSize));
4418       constrainSelectedInstRegOperands(*TstMI, TII, TRI, RBI);
4419       return &*TstMI;
4420     }
4421   }
4422 
4423   if (auto Fns = selectLogicalShiftedRegister(RHS))
4424     return emitInstr(OpcTable[1][Is32Bit], {Ty}, {LHS}, MIRBuilder, Fns);
4425   return emitInstr(OpcTable[2][Is32Bit], {Ty}, {LHS, RHS}, MIRBuilder);
4426 }
4427 
emitIntegerCompare(MachineOperand & LHS,MachineOperand & RHS,MachineOperand & Predicate,MachineIRBuilder & MIRBuilder) const4428 MachineInstr *AArch64InstructionSelector::emitIntegerCompare(
4429     MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate,
4430     MachineIRBuilder &MIRBuilder) const {
4431   assert(LHS.isReg() && RHS.isReg() && "Expected LHS and RHS to be registers!");
4432   assert(Predicate.isPredicate() && "Expected predicate?");
4433   MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4434   LLT CmpTy = MRI.getType(LHS.getReg());
4435   assert(!CmpTy.isVector() && "Expected scalar or pointer");
4436   unsigned Size = CmpTy.getSizeInBits();
4437   (void)Size;
4438   assert((Size == 32 || Size == 64) && "Expected a 32-bit or 64-bit LHS/RHS?");
4439   // Fold the compare into a cmn or tst if possible.
4440   if (auto FoldCmp = tryFoldIntegerCompare(LHS, RHS, Predicate, MIRBuilder))
4441     return FoldCmp;
4442   auto Dst = MRI.cloneVirtualRegister(LHS.getReg());
4443   return emitSUBS(Dst, LHS, RHS, MIRBuilder);
4444 }
4445 
emitCSetForFCmp(Register Dst,CmpInst::Predicate Pred,MachineIRBuilder & MIRBuilder) const4446 MachineInstr *AArch64InstructionSelector::emitCSetForFCmp(
4447     Register Dst, CmpInst::Predicate Pred, MachineIRBuilder &MIRBuilder) const {
4448   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
4449 #ifndef NDEBUG
4450   LLT Ty = MRI.getType(Dst);
4451   assert(!Ty.isVector() && Ty.getSizeInBits() == 32 &&
4452          "Expected a 32-bit scalar register?");
4453 #endif
4454   const Register ZeroReg = AArch64::WZR;
4455   auto EmitCSet = [&](Register CsetDst, AArch64CC::CondCode CC) {
4456     auto CSet =
4457         MIRBuilder.buildInstr(AArch64::CSINCWr, {CsetDst}, {ZeroReg, ZeroReg})
4458             .addImm(getInvertedCondCode(CC));
4459     constrainSelectedInstRegOperands(*CSet, TII, TRI, RBI);
4460     return &*CSet;
4461   };
4462 
4463   AArch64CC::CondCode CC1, CC2;
4464   changeFCMPPredToAArch64CC(Pred, CC1, CC2);
4465   if (CC2 == AArch64CC::AL)
4466     return EmitCSet(Dst, CC1);
4467 
4468   const TargetRegisterClass *RC = &AArch64::GPR32RegClass;
4469   Register Def1Reg = MRI.createVirtualRegister(RC);
4470   Register Def2Reg = MRI.createVirtualRegister(RC);
4471   EmitCSet(Def1Reg, CC1);
4472   EmitCSet(Def2Reg, CC2);
4473   auto OrMI = MIRBuilder.buildInstr(AArch64::ORRWrr, {Dst}, {Def1Reg, Def2Reg});
4474   constrainSelectedInstRegOperands(*OrMI, TII, TRI, RBI);
4475   return &*OrMI;
4476 }
4477 
4478 MachineInstr *
emitFPCompare(Register LHS,Register RHS,MachineIRBuilder & MIRBuilder,Optional<CmpInst::Predicate> Pred) const4479 AArch64InstructionSelector::emitFPCompare(Register LHS, Register RHS,
4480                                           MachineIRBuilder &MIRBuilder,
4481                                           Optional<CmpInst::Predicate> Pred) const {
4482   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
4483   LLT Ty = MRI.getType(LHS);
4484   if (Ty.isVector())
4485     return nullptr;
4486   unsigned OpSize = Ty.getSizeInBits();
4487   if (OpSize != 32 && OpSize != 64)
4488     return nullptr;
4489 
4490   // If this is a compare against +0.0, then we don't have
4491   // to explicitly materialize a constant.
4492   const ConstantFP *FPImm = getConstantFPVRegVal(RHS, MRI);
4493   bool ShouldUseImm = FPImm && (FPImm->isZero() && !FPImm->isNegative());
4494 
4495   auto IsEqualityPred = [](CmpInst::Predicate P) {
4496     return P == CmpInst::FCMP_OEQ || P == CmpInst::FCMP_ONE ||
4497            P == CmpInst::FCMP_UEQ || P == CmpInst::FCMP_UNE;
4498   };
4499   if (!ShouldUseImm && Pred && IsEqualityPred(*Pred)) {
4500     // Try commutating the operands.
4501     const ConstantFP *LHSImm = getConstantFPVRegVal(LHS, MRI);
4502     if (LHSImm && (LHSImm->isZero() && !LHSImm->isNegative())) {
4503       ShouldUseImm = true;
4504       std::swap(LHS, RHS);
4505     }
4506   }
4507   unsigned CmpOpcTbl[2][2] = {{AArch64::FCMPSrr, AArch64::FCMPDrr},
4508                               {AArch64::FCMPSri, AArch64::FCMPDri}};
4509   unsigned CmpOpc = CmpOpcTbl[ShouldUseImm][OpSize == 64];
4510 
4511   // Partially build the compare. Decide if we need to add a use for the
4512   // third operand based off whether or not we're comparing against 0.0.
4513   auto CmpMI = MIRBuilder.buildInstr(CmpOpc).addUse(LHS);
4514   if (!ShouldUseImm)
4515     CmpMI.addUse(RHS);
4516   constrainSelectedInstRegOperands(*CmpMI, TII, TRI, RBI);
4517   return &*CmpMI;
4518 }
4519 
emitVectorConcat(Optional<Register> Dst,Register Op1,Register Op2,MachineIRBuilder & MIRBuilder) const4520 MachineInstr *AArch64InstructionSelector::emitVectorConcat(
4521     Optional<Register> Dst, Register Op1, Register Op2,
4522     MachineIRBuilder &MIRBuilder) const {
4523   // We implement a vector concat by:
4524   // 1. Use scalar_to_vector to insert the lower vector into the larger dest
4525   // 2. Insert the upper vector into the destination's upper element
4526   // TODO: some of this code is common with G_BUILD_VECTOR handling.
4527   MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
4528 
4529   const LLT Op1Ty = MRI.getType(Op1);
4530   const LLT Op2Ty = MRI.getType(Op2);
4531 
4532   if (Op1Ty != Op2Ty) {
4533     LLVM_DEBUG(dbgs() << "Could not do vector concat of differing vector tys");
4534     return nullptr;
4535   }
4536   assert(Op1Ty.isVector() && "Expected a vector for vector concat");
4537 
4538   if (Op1Ty.getSizeInBits() >= 128) {
4539     LLVM_DEBUG(dbgs() << "Vector concat not supported for full size vectors");
4540     return nullptr;
4541   }
4542 
4543   // At the moment we just support 64 bit vector concats.
4544   if (Op1Ty.getSizeInBits() != 64) {
4545     LLVM_DEBUG(dbgs() << "Vector concat supported for 64b vectors");
4546     return nullptr;
4547   }
4548 
4549   const LLT ScalarTy = LLT::scalar(Op1Ty.getSizeInBits());
4550   const RegisterBank &FPRBank = *RBI.getRegBank(Op1, MRI, TRI);
4551   const TargetRegisterClass *DstRC =
4552       getMinClassForRegBank(FPRBank, Op1Ty.getSizeInBits() * 2);
4553 
4554   MachineInstr *WidenedOp1 =
4555       emitScalarToVector(ScalarTy.getSizeInBits(), DstRC, Op1, MIRBuilder);
4556   MachineInstr *WidenedOp2 =
4557       emitScalarToVector(ScalarTy.getSizeInBits(), DstRC, Op2, MIRBuilder);
4558   if (!WidenedOp1 || !WidenedOp2) {
4559     LLVM_DEBUG(dbgs() << "Could not emit a vector from scalar value");
4560     return nullptr;
4561   }
4562 
4563   // Now do the insert of the upper element.
4564   unsigned InsertOpc, InsSubRegIdx;
4565   std::tie(InsertOpc, InsSubRegIdx) =
4566       getInsertVecEltOpInfo(FPRBank, ScalarTy.getSizeInBits());
4567 
4568   if (!Dst)
4569     Dst = MRI.createVirtualRegister(DstRC);
4570   auto InsElt =
4571       MIRBuilder
4572           .buildInstr(InsertOpc, {*Dst}, {WidenedOp1->getOperand(0).getReg()})
4573           .addImm(1) /* Lane index */
4574           .addUse(WidenedOp2->getOperand(0).getReg())
4575           .addImm(0);
4576   constrainSelectedInstRegOperands(*InsElt, TII, TRI, RBI);
4577   return &*InsElt;
4578 }
4579 
4580 MachineInstr *
emitCSetForICMP(Register DefReg,unsigned Pred,MachineIRBuilder & MIRBuilder,Register SrcReg) const4581 AArch64InstructionSelector::emitCSetForICMP(Register DefReg, unsigned Pred,
4582                                             MachineIRBuilder &MIRBuilder,
4583                                             Register SrcReg) const {
4584   // CSINC increments the result when the predicate is false. Invert it.
4585   const AArch64CC::CondCode InvCC = changeICMPPredToAArch64CC(
4586       CmpInst::getInversePredicate((CmpInst::Predicate)Pred));
4587   auto I = MIRBuilder.buildInstr(AArch64::CSINCWr, {DefReg}, {SrcReg, SrcReg})
4588                .addImm(InvCC);
4589   constrainSelectedInstRegOperands(*I, TII, TRI, RBI);
4590   return &*I;
4591 }
4592 
4593 std::pair<MachineInstr *, AArch64CC::CondCode>
emitOverflowOp(unsigned Opcode,Register Dst,MachineOperand & LHS,MachineOperand & RHS,MachineIRBuilder & MIRBuilder) const4594 AArch64InstructionSelector::emitOverflowOp(unsigned Opcode, Register Dst,
4595                                            MachineOperand &LHS,
4596                                            MachineOperand &RHS,
4597                                            MachineIRBuilder &MIRBuilder) const {
4598   switch (Opcode) {
4599   default:
4600     llvm_unreachable("Unexpected opcode!");
4601   case TargetOpcode::G_SADDO:
4602     return std::make_pair(emitADDS(Dst, LHS, RHS, MIRBuilder), AArch64CC::VS);
4603   case TargetOpcode::G_UADDO:
4604     return std::make_pair(emitADDS(Dst, LHS, RHS, MIRBuilder), AArch64CC::HS);
4605   case TargetOpcode::G_SSUBO:
4606     return std::make_pair(emitSUBS(Dst, LHS, RHS, MIRBuilder), AArch64CC::VS);
4607   case TargetOpcode::G_USUBO:
4608     return std::make_pair(emitSUBS(Dst, LHS, RHS, MIRBuilder), AArch64CC::LO);
4609   }
4610 }
4611 
tryOptSelect(MachineInstr & I)4612 bool AArch64InstructionSelector::tryOptSelect(MachineInstr &I) {
4613   MachineRegisterInfo &MRI = *MIB.getMRI();
4614   // We want to recognize this pattern:
4615   //
4616   // $z = G_FCMP pred, $x, $y
4617   // ...
4618   // $w = G_SELECT $z, $a, $b
4619   //
4620   // Where the value of $z is *only* ever used by the G_SELECT (possibly with
4621   // some copies/truncs in between.)
4622   //
4623   // If we see this, then we can emit something like this:
4624   //
4625   // fcmp $x, $y
4626   // fcsel $w, $a, $b, pred
4627   //
4628   // Rather than emitting both of the rather long sequences in the standard
4629   // G_FCMP/G_SELECT select methods.
4630 
4631   // First, check if the condition is defined by a compare.
4632   MachineInstr *CondDef = MRI.getVRegDef(I.getOperand(1).getReg());
4633   while (CondDef) {
4634     // We can only fold if all of the defs have one use.
4635     Register CondDefReg = CondDef->getOperand(0).getReg();
4636     if (!MRI.hasOneNonDBGUse(CondDefReg)) {
4637       // Unless it's another select.
4638       for (const MachineInstr &UI : MRI.use_nodbg_instructions(CondDefReg)) {
4639         if (CondDef == &UI)
4640           continue;
4641         if (UI.getOpcode() != TargetOpcode::G_SELECT)
4642           return false;
4643       }
4644     }
4645 
4646     // We can skip over G_TRUNC since the condition is 1-bit.
4647     // Truncating/extending can have no impact on the value.
4648     unsigned Opc = CondDef->getOpcode();
4649     if (Opc != TargetOpcode::COPY && Opc != TargetOpcode::G_TRUNC)
4650       break;
4651 
4652     // Can't see past copies from physregs.
4653     if (Opc == TargetOpcode::COPY &&
4654         Register::isPhysicalRegister(CondDef->getOperand(1).getReg()))
4655       return false;
4656 
4657     CondDef = MRI.getVRegDef(CondDef->getOperand(1).getReg());
4658   }
4659 
4660   // Is the condition defined by a compare?
4661   if (!CondDef)
4662     return false;
4663 
4664   unsigned CondOpc = CondDef->getOpcode();
4665   if (CondOpc != TargetOpcode::G_ICMP && CondOpc != TargetOpcode::G_FCMP)
4666     return false;
4667 
4668   AArch64CC::CondCode CondCode;
4669   if (CondOpc == TargetOpcode::G_ICMP) {
4670     auto Pred =
4671         static_cast<CmpInst::Predicate>(CondDef->getOperand(1).getPredicate());
4672     CondCode = changeICMPPredToAArch64CC(Pred);
4673     emitIntegerCompare(CondDef->getOperand(2), CondDef->getOperand(3),
4674                        CondDef->getOperand(1), MIB);
4675   } else {
4676     // Get the condition code for the select.
4677     auto Pred =
4678         static_cast<CmpInst::Predicate>(CondDef->getOperand(1).getPredicate());
4679     AArch64CC::CondCode CondCode2;
4680     changeFCMPPredToAArch64CC(Pred, CondCode, CondCode2);
4681 
4682     // changeFCMPPredToAArch64CC sets CondCode2 to AL when we require two
4683     // instructions to emit the comparison.
4684     // TODO: Handle FCMP_UEQ and FCMP_ONE. After that, this check will be
4685     // unnecessary.
4686     if (CondCode2 != AArch64CC::AL)
4687       return false;
4688 
4689     if (!emitFPCompare(CondDef->getOperand(2).getReg(),
4690                        CondDef->getOperand(3).getReg(), MIB)) {
4691       LLVM_DEBUG(dbgs() << "Couldn't emit compare for select!\n");
4692       return false;
4693     }
4694   }
4695 
4696   // Emit the select.
4697   emitSelect(I.getOperand(0).getReg(), I.getOperand(2).getReg(),
4698              I.getOperand(3).getReg(), CondCode, MIB);
4699   I.eraseFromParent();
4700   return true;
4701 }
4702 
tryFoldIntegerCompare(MachineOperand & LHS,MachineOperand & RHS,MachineOperand & Predicate,MachineIRBuilder & MIRBuilder) const4703 MachineInstr *AArch64InstructionSelector::tryFoldIntegerCompare(
4704     MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate,
4705     MachineIRBuilder &MIRBuilder) const {
4706   assert(LHS.isReg() && RHS.isReg() && Predicate.isPredicate() &&
4707          "Unexpected MachineOperand");
4708   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
4709   // We want to find this sort of thing:
4710   // x = G_SUB 0, y
4711   // G_ICMP z, x
4712   //
4713   // In this case, we can fold the G_SUB into the G_ICMP using a CMN instead.
4714   // e.g:
4715   //
4716   // cmn z, y
4717 
4718   // Check if the RHS or LHS of the G_ICMP is defined by a SUB
4719   MachineInstr *LHSDef = getDefIgnoringCopies(LHS.getReg(), MRI);
4720   MachineInstr *RHSDef = getDefIgnoringCopies(RHS.getReg(), MRI);
4721   auto P = static_cast<CmpInst::Predicate>(Predicate.getPredicate());
4722   // Given this:
4723   //
4724   // x = G_SUB 0, y
4725   // G_ICMP x, z
4726   //
4727   // Produce this:
4728   //
4729   // cmn y, z
4730   if (isCMN(LHSDef, P, MRI))
4731     return emitCMN(LHSDef->getOperand(2), RHS, MIRBuilder);
4732 
4733   // Same idea here, but with the RHS of the compare instead:
4734   //
4735   // Given this:
4736   //
4737   // x = G_SUB 0, y
4738   // G_ICMP z, x
4739   //
4740   // Produce this:
4741   //
4742   // cmn z, y
4743   if (isCMN(RHSDef, P, MRI))
4744     return emitCMN(LHS, RHSDef->getOperand(2), MIRBuilder);
4745 
4746   // Given this:
4747   //
4748   // z = G_AND x, y
4749   // G_ICMP z, 0
4750   //
4751   // Produce this if the compare is signed:
4752   //
4753   // tst x, y
4754   if (!CmpInst::isUnsigned(P) && LHSDef &&
4755       LHSDef->getOpcode() == TargetOpcode::G_AND) {
4756     // Make sure that the RHS is 0.
4757     auto ValAndVReg = getIConstantVRegValWithLookThrough(RHS.getReg(), MRI);
4758     if (!ValAndVReg || ValAndVReg->Value != 0)
4759       return nullptr;
4760 
4761     return emitTST(LHSDef->getOperand(1),
4762                    LHSDef->getOperand(2), MIRBuilder);
4763   }
4764 
4765   return nullptr;
4766 }
4767 
selectShuffleVector(MachineInstr & I,MachineRegisterInfo & MRI)4768 bool AArch64InstructionSelector::selectShuffleVector(
4769     MachineInstr &I, MachineRegisterInfo &MRI) {
4770   const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
4771   Register Src1Reg = I.getOperand(1).getReg();
4772   const LLT Src1Ty = MRI.getType(Src1Reg);
4773   Register Src2Reg = I.getOperand(2).getReg();
4774   const LLT Src2Ty = MRI.getType(Src2Reg);
4775   ArrayRef<int> Mask = I.getOperand(3).getShuffleMask();
4776 
4777   MachineBasicBlock &MBB = *I.getParent();
4778   MachineFunction &MF = *MBB.getParent();
4779   LLVMContext &Ctx = MF.getFunction().getContext();
4780 
4781   // G_SHUFFLE_VECTOR is weird in that the source operands can be scalars, if
4782   // it's originated from a <1 x T> type. Those should have been lowered into
4783   // G_BUILD_VECTOR earlier.
4784   if (!Src1Ty.isVector() || !Src2Ty.isVector()) {
4785     LLVM_DEBUG(dbgs() << "Could not select a \"scalar\" G_SHUFFLE_VECTOR\n");
4786     return false;
4787   }
4788 
4789   unsigned BytesPerElt = DstTy.getElementType().getSizeInBits() / 8;
4790 
4791   SmallVector<Constant *, 64> CstIdxs;
4792   for (int Val : Mask) {
4793     // For now, any undef indexes we'll just assume to be 0. This should be
4794     // optimized in future, e.g. to select DUP etc.
4795     Val = Val < 0 ? 0 : Val;
4796     for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
4797       unsigned Offset = Byte + Val * BytesPerElt;
4798       CstIdxs.emplace_back(ConstantInt::get(Type::getInt8Ty(Ctx), Offset));
4799     }
4800   }
4801 
4802   // Use a constant pool to load the index vector for TBL.
4803   Constant *CPVal = ConstantVector::get(CstIdxs);
4804   MachineInstr *IndexLoad = emitLoadFromConstantPool(CPVal, MIB);
4805   if (!IndexLoad) {
4806     LLVM_DEBUG(dbgs() << "Could not load from a constant pool");
4807     return false;
4808   }
4809 
4810   if (DstTy.getSizeInBits() != 128) {
4811     assert(DstTy.getSizeInBits() == 64 && "Unexpected shuffle result ty");
4812     // This case can be done with TBL1.
4813     MachineInstr *Concat = emitVectorConcat(None, Src1Reg, Src2Reg, MIB);
4814     if (!Concat) {
4815       LLVM_DEBUG(dbgs() << "Could not do vector concat for tbl1");
4816       return false;
4817     }
4818 
4819     // The constant pool load will be 64 bits, so need to convert to FPR128 reg.
4820     IndexLoad = emitScalarToVector(64, &AArch64::FPR128RegClass,
4821                                    IndexLoad->getOperand(0).getReg(), MIB);
4822 
4823     auto TBL1 = MIB.buildInstr(
4824         AArch64::TBLv16i8One, {&AArch64::FPR128RegClass},
4825         {Concat->getOperand(0).getReg(), IndexLoad->getOperand(0).getReg()});
4826     constrainSelectedInstRegOperands(*TBL1, TII, TRI, RBI);
4827 
4828     auto Copy =
4829         MIB.buildInstr(TargetOpcode::COPY, {I.getOperand(0).getReg()}, {})
4830             .addReg(TBL1.getReg(0), 0, AArch64::dsub);
4831     RBI.constrainGenericRegister(Copy.getReg(0), AArch64::FPR64RegClass, MRI);
4832     I.eraseFromParent();
4833     return true;
4834   }
4835 
4836   // For TBL2 we need to emit a REG_SEQUENCE to tie together two consecutive
4837   // Q registers for regalloc.
4838   SmallVector<Register, 2> Regs = {Src1Reg, Src2Reg};
4839   auto RegSeq = createQTuple(Regs, MIB);
4840   auto TBL2 = MIB.buildInstr(AArch64::TBLv16i8Two, {I.getOperand(0)},
4841                              {RegSeq, IndexLoad->getOperand(0)});
4842   constrainSelectedInstRegOperands(*TBL2, TII, TRI, RBI);
4843   I.eraseFromParent();
4844   return true;
4845 }
4846 
emitLaneInsert(Optional<Register> DstReg,Register SrcReg,Register EltReg,unsigned LaneIdx,const RegisterBank & RB,MachineIRBuilder & MIRBuilder) const4847 MachineInstr *AArch64InstructionSelector::emitLaneInsert(
4848     Optional<Register> DstReg, Register SrcReg, Register EltReg,
4849     unsigned LaneIdx, const RegisterBank &RB,
4850     MachineIRBuilder &MIRBuilder) const {
4851   MachineInstr *InsElt = nullptr;
4852   const TargetRegisterClass *DstRC = &AArch64::FPR128RegClass;
4853   MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
4854 
4855   // Create a register to define with the insert if one wasn't passed in.
4856   if (!DstReg)
4857     DstReg = MRI.createVirtualRegister(DstRC);
4858 
4859   unsigned EltSize = MRI.getType(EltReg).getSizeInBits();
4860   unsigned Opc = getInsertVecEltOpInfo(RB, EltSize).first;
4861 
4862   if (RB.getID() == AArch64::FPRRegBankID) {
4863     auto InsSub = emitScalarToVector(EltSize, DstRC, EltReg, MIRBuilder);
4864     InsElt = MIRBuilder.buildInstr(Opc, {*DstReg}, {SrcReg})
4865                  .addImm(LaneIdx)
4866                  .addUse(InsSub->getOperand(0).getReg())
4867                  .addImm(0);
4868   } else {
4869     InsElt = MIRBuilder.buildInstr(Opc, {*DstReg}, {SrcReg})
4870                  .addImm(LaneIdx)
4871                  .addUse(EltReg);
4872   }
4873 
4874   constrainSelectedInstRegOperands(*InsElt, TII, TRI, RBI);
4875   return InsElt;
4876 }
4877 
selectUSMovFromExtend(MachineInstr & MI,MachineRegisterInfo & MRI)4878 bool AArch64InstructionSelector::selectUSMovFromExtend(
4879     MachineInstr &MI, MachineRegisterInfo &MRI) {
4880   if (MI.getOpcode() != TargetOpcode::G_SEXT &&
4881       MI.getOpcode() != TargetOpcode::G_ZEXT &&
4882       MI.getOpcode() != TargetOpcode::G_ANYEXT)
4883     return false;
4884   bool IsSigned = MI.getOpcode() == TargetOpcode::G_SEXT;
4885   const Register DefReg = MI.getOperand(0).getReg();
4886   const LLT DstTy = MRI.getType(DefReg);
4887   unsigned DstSize = DstTy.getSizeInBits();
4888 
4889   if (DstSize != 32 && DstSize != 64)
4890     return false;
4891 
4892   MachineInstr *Extract = getOpcodeDef(TargetOpcode::G_EXTRACT_VECTOR_ELT,
4893                                        MI.getOperand(1).getReg(), MRI);
4894   int64_t Lane;
4895   if (!Extract || !mi_match(Extract->getOperand(2).getReg(), MRI, m_ICst(Lane)))
4896     return false;
4897   Register Src0 = Extract->getOperand(1).getReg();
4898 
4899   const LLT &VecTy = MRI.getType(Src0);
4900 
4901   if (VecTy.getSizeInBits() != 128) {
4902     const MachineInstr *ScalarToVector = emitScalarToVector(
4903         VecTy.getSizeInBits(), &AArch64::FPR128RegClass, Src0, MIB);
4904     assert(ScalarToVector && "Didn't expect emitScalarToVector to fail!");
4905     Src0 = ScalarToVector->getOperand(0).getReg();
4906   }
4907 
4908   unsigned Opcode;
4909   if (DstSize == 64 && VecTy.getScalarSizeInBits() == 32)
4910     Opcode = IsSigned ? AArch64::SMOVvi32to64 : AArch64::UMOVvi32;
4911   else if (DstSize == 64 && VecTy.getScalarSizeInBits() == 16)
4912     Opcode = IsSigned ? AArch64::SMOVvi16to64 : AArch64::UMOVvi16;
4913   else if (DstSize == 64 && VecTy.getScalarSizeInBits() == 8)
4914     Opcode = IsSigned ? AArch64::SMOVvi8to64 : AArch64::UMOVvi8;
4915   else if (DstSize == 32 && VecTy.getScalarSizeInBits() == 16)
4916     Opcode = IsSigned ? AArch64::SMOVvi16to32 : AArch64::UMOVvi16;
4917   else if (DstSize == 32 && VecTy.getScalarSizeInBits() == 8)
4918     Opcode = IsSigned ? AArch64::SMOVvi8to32 : AArch64::UMOVvi8;
4919   else
4920     llvm_unreachable("Unexpected type combo for S/UMov!");
4921 
4922   // We may need to generate one of these, depending on the type and sign of the
4923   // input:
4924   //  DstReg = SMOV Src0, Lane;
4925   //  NewReg = UMOV Src0, Lane; DstReg = SUBREG_TO_REG NewReg, sub_32;
4926   MachineInstr *ExtI = nullptr;
4927   if (DstSize == 64 && !IsSigned) {
4928     Register NewReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
4929     MIB.buildInstr(Opcode, {NewReg}, {Src0}).addImm(Lane);
4930     ExtI = MIB.buildInstr(AArch64::SUBREG_TO_REG, {DefReg}, {})
4931                .addImm(0)
4932                .addUse(NewReg)
4933                .addImm(AArch64::sub_32);
4934     RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass, MRI);
4935   } else
4936     ExtI = MIB.buildInstr(Opcode, {DefReg}, {Src0}).addImm(Lane);
4937 
4938   constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
4939   MI.eraseFromParent();
4940   return true;
4941 }
4942 
selectInsertElt(MachineInstr & I,MachineRegisterInfo & MRI)4943 bool AArch64InstructionSelector::selectInsertElt(MachineInstr &I,
4944                                                  MachineRegisterInfo &MRI) {
4945   assert(I.getOpcode() == TargetOpcode::G_INSERT_VECTOR_ELT);
4946 
4947   // Get information on the destination.
4948   Register DstReg = I.getOperand(0).getReg();
4949   const LLT DstTy = MRI.getType(DstReg);
4950   unsigned VecSize = DstTy.getSizeInBits();
4951 
4952   // Get information on the element we want to insert into the destination.
4953   Register EltReg = I.getOperand(2).getReg();
4954   const LLT EltTy = MRI.getType(EltReg);
4955   unsigned EltSize = EltTy.getSizeInBits();
4956   if (EltSize < 16 || EltSize > 64)
4957     return false; // Don't support all element types yet.
4958 
4959   // Find the definition of the index. Bail out if it's not defined by a
4960   // G_CONSTANT.
4961   Register IdxReg = I.getOperand(3).getReg();
4962   auto VRegAndVal = getIConstantVRegValWithLookThrough(IdxReg, MRI);
4963   if (!VRegAndVal)
4964     return false;
4965   unsigned LaneIdx = VRegAndVal->Value.getSExtValue();
4966 
4967   // Perform the lane insert.
4968   Register SrcReg = I.getOperand(1).getReg();
4969   const RegisterBank &EltRB = *RBI.getRegBank(EltReg, MRI, TRI);
4970 
4971   if (VecSize < 128) {
4972     // If the vector we're inserting into is smaller than 128 bits, widen it
4973     // to 128 to do the insert.
4974     MachineInstr *ScalarToVec =
4975         emitScalarToVector(VecSize, &AArch64::FPR128RegClass, SrcReg, MIB);
4976     if (!ScalarToVec)
4977       return false;
4978     SrcReg = ScalarToVec->getOperand(0).getReg();
4979   }
4980 
4981   // Create an insert into a new FPR128 register.
4982   // Note that if our vector is already 128 bits, we end up emitting an extra
4983   // register.
4984   MachineInstr *InsMI =
4985       emitLaneInsert(None, SrcReg, EltReg, LaneIdx, EltRB, MIB);
4986 
4987   if (VecSize < 128) {
4988     // If we had to widen to perform the insert, then we have to demote back to
4989     // the original size to get the result we want.
4990     Register DemoteVec = InsMI->getOperand(0).getReg();
4991     const TargetRegisterClass *RC =
4992         getMinClassForRegBank(*RBI.getRegBank(DemoteVec, MRI, TRI), VecSize);
4993     if (RC != &AArch64::FPR32RegClass && RC != &AArch64::FPR64RegClass) {
4994       LLVM_DEBUG(dbgs() << "Unsupported register class!\n");
4995       return false;
4996     }
4997     unsigned SubReg = 0;
4998     if (!getSubRegForClass(RC, TRI, SubReg))
4999       return false;
5000     if (SubReg != AArch64::ssub && SubReg != AArch64::dsub) {
5001       LLVM_DEBUG(dbgs() << "Unsupported destination size! (" << VecSize
5002                         << "\n");
5003       return false;
5004     }
5005     MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {})
5006         .addReg(DemoteVec, 0, SubReg);
5007     RBI.constrainGenericRegister(DstReg, *RC, MRI);
5008   } else {
5009     // No widening needed.
5010     InsMI->getOperand(0).setReg(DstReg);
5011     constrainSelectedInstRegOperands(*InsMI, TII, TRI, RBI);
5012   }
5013 
5014   I.eraseFromParent();
5015   return true;
5016 }
5017 
5018 MachineInstr *
emitConstantVector(Register Dst,Constant * CV,MachineIRBuilder & MIRBuilder,MachineRegisterInfo & MRI)5019 AArch64InstructionSelector::emitConstantVector(Register Dst, Constant *CV,
5020                                                MachineIRBuilder &MIRBuilder,
5021                                                MachineRegisterInfo &MRI) {
5022   LLT DstTy = MRI.getType(Dst);
5023   unsigned DstSize = DstTy.getSizeInBits();
5024   if (CV->isNullValue()) {
5025     if (DstSize == 128) {
5026       auto Mov =
5027           MIRBuilder.buildInstr(AArch64::MOVIv2d_ns, {Dst}, {}).addImm(0);
5028       constrainSelectedInstRegOperands(*Mov, TII, TRI, RBI);
5029       return &*Mov;
5030     }
5031 
5032     if (DstSize == 64) {
5033       auto Mov =
5034           MIRBuilder
5035               .buildInstr(AArch64::MOVIv2d_ns, {&AArch64::FPR128RegClass}, {})
5036               .addImm(0);
5037       auto Copy = MIRBuilder.buildInstr(TargetOpcode::COPY, {Dst}, {})
5038                       .addReg(Mov.getReg(0), 0, AArch64::dsub);
5039       RBI.constrainGenericRegister(Dst, AArch64::FPR64RegClass, MRI);
5040       return &*Copy;
5041     }
5042   }
5043 
5044   auto *CPLoad = emitLoadFromConstantPool(CV, MIRBuilder);
5045   if (!CPLoad) {
5046     LLVM_DEBUG(dbgs() << "Could not generate cp load for constant vector!");
5047     return nullptr;
5048   }
5049 
5050   auto Copy = MIRBuilder.buildCopy(Dst, CPLoad->getOperand(0));
5051   RBI.constrainGenericRegister(
5052       Dst, *MRI.getRegClass(CPLoad->getOperand(0).getReg()), MRI);
5053   return &*Copy;
5054 }
5055 
tryOptConstantBuildVec(MachineInstr & I,LLT DstTy,MachineRegisterInfo & MRI)5056 bool AArch64InstructionSelector::tryOptConstantBuildVec(
5057     MachineInstr &I, LLT DstTy, MachineRegisterInfo &MRI) {
5058   assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR);
5059   unsigned DstSize = DstTy.getSizeInBits();
5060   assert(DstSize <= 128 && "Unexpected build_vec type!");
5061   if (DstSize < 32)
5062     return false;
5063   // Check if we're building a constant vector, in which case we want to
5064   // generate a constant pool load instead of a vector insert sequence.
5065   SmallVector<Constant *, 16> Csts;
5066   for (unsigned Idx = 1; Idx < I.getNumOperands(); ++Idx) {
5067     // Try to find G_CONSTANT or G_FCONSTANT
5068     auto *OpMI =
5069         getOpcodeDef(TargetOpcode::G_CONSTANT, I.getOperand(Idx).getReg(), MRI);
5070     if (OpMI)
5071       Csts.emplace_back(
5072           const_cast<ConstantInt *>(OpMI->getOperand(1).getCImm()));
5073     else if ((OpMI = getOpcodeDef(TargetOpcode::G_FCONSTANT,
5074                                   I.getOperand(Idx).getReg(), MRI)))
5075       Csts.emplace_back(
5076           const_cast<ConstantFP *>(OpMI->getOperand(1).getFPImm()));
5077     else
5078       return false;
5079   }
5080   Constant *CV = ConstantVector::get(Csts);
5081   if (!emitConstantVector(I.getOperand(0).getReg(), CV, MIB, MRI))
5082     return false;
5083   I.eraseFromParent();
5084   return true;
5085 }
5086 
tryOptBuildVecToSubregToReg(MachineInstr & I,MachineRegisterInfo & MRI)5087 bool AArch64InstructionSelector::tryOptBuildVecToSubregToReg(
5088     MachineInstr &I, MachineRegisterInfo &MRI) {
5089   // Given:
5090   //  %vec = G_BUILD_VECTOR %elt, %undef, %undef, ... %undef
5091   //
5092   // Select the G_BUILD_VECTOR as a SUBREG_TO_REG from %elt.
5093   Register Dst = I.getOperand(0).getReg();
5094   Register EltReg = I.getOperand(1).getReg();
5095   LLT EltTy = MRI.getType(EltReg);
5096   // If the index isn't on the same bank as its elements, then this can't be a
5097   // SUBREG_TO_REG.
5098   const RegisterBank &EltRB = *RBI.getRegBank(EltReg, MRI, TRI);
5099   const RegisterBank &DstRB = *RBI.getRegBank(Dst, MRI, TRI);
5100   if (EltRB != DstRB)
5101     return false;
5102   if (any_of(make_range(I.operands_begin() + 2, I.operands_end()),
5103              [&MRI](const MachineOperand &Op) {
5104                return !getOpcodeDef(TargetOpcode::G_IMPLICIT_DEF, Op.getReg(),
5105                                     MRI);
5106              }))
5107     return false;
5108   unsigned SubReg;
5109   const TargetRegisterClass *EltRC =
5110       getMinClassForRegBank(EltRB, EltTy.getSizeInBits());
5111   if (!EltRC)
5112     return false;
5113   const TargetRegisterClass *DstRC =
5114       getMinClassForRegBank(DstRB, MRI.getType(Dst).getSizeInBits());
5115   if (!DstRC)
5116     return false;
5117   if (!getSubRegForClass(EltRC, TRI, SubReg))
5118     return false;
5119   auto SubregToReg = MIB.buildInstr(AArch64::SUBREG_TO_REG, {Dst}, {})
5120                          .addImm(0)
5121                          .addUse(EltReg)
5122                          .addImm(SubReg);
5123   I.eraseFromParent();
5124   constrainSelectedInstRegOperands(*SubregToReg, TII, TRI, RBI);
5125   return RBI.constrainGenericRegister(Dst, *DstRC, MRI);
5126 }
5127 
selectBuildVector(MachineInstr & I,MachineRegisterInfo & MRI)5128 bool AArch64InstructionSelector::selectBuildVector(MachineInstr &I,
5129                                                    MachineRegisterInfo &MRI) {
5130   assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR);
5131   // Until we port more of the optimized selections, for now just use a vector
5132   // insert sequence.
5133   const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
5134   const LLT EltTy = MRI.getType(I.getOperand(1).getReg());
5135   unsigned EltSize = EltTy.getSizeInBits();
5136 
5137   if (tryOptConstantBuildVec(I, DstTy, MRI))
5138     return true;
5139   if (tryOptBuildVecToSubregToReg(I, MRI))
5140     return true;
5141 
5142   if (EltSize < 16 || EltSize > 64)
5143     return false; // Don't support all element types yet.
5144   const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI);
5145 
5146   const TargetRegisterClass *DstRC = &AArch64::FPR128RegClass;
5147   MachineInstr *ScalarToVec =
5148       emitScalarToVector(DstTy.getElementType().getSizeInBits(), DstRC,
5149                          I.getOperand(1).getReg(), MIB);
5150   if (!ScalarToVec)
5151     return false;
5152 
5153   Register DstVec = ScalarToVec->getOperand(0).getReg();
5154   unsigned DstSize = DstTy.getSizeInBits();
5155 
5156   // Keep track of the last MI we inserted. Later on, we might be able to save
5157   // a copy using it.
5158   MachineInstr *PrevMI = nullptr;
5159   for (unsigned i = 2, e = DstSize / EltSize + 1; i < e; ++i) {
5160     // Note that if we don't do a subregister copy, we can end up making an
5161     // extra register.
5162     PrevMI = &*emitLaneInsert(None, DstVec, I.getOperand(i).getReg(), i - 1, RB,
5163                               MIB);
5164     DstVec = PrevMI->getOperand(0).getReg();
5165   }
5166 
5167   // If DstTy's size in bits is less than 128, then emit a subregister copy
5168   // from DstVec to the last register we've defined.
5169   if (DstSize < 128) {
5170     // Force this to be FPR using the destination vector.
5171     const TargetRegisterClass *RC =
5172         getMinClassForRegBank(*RBI.getRegBank(DstVec, MRI, TRI), DstSize);
5173     if (!RC)
5174       return false;
5175     if (RC != &AArch64::FPR32RegClass && RC != &AArch64::FPR64RegClass) {
5176       LLVM_DEBUG(dbgs() << "Unsupported register class!\n");
5177       return false;
5178     }
5179 
5180     unsigned SubReg = 0;
5181     if (!getSubRegForClass(RC, TRI, SubReg))
5182       return false;
5183     if (SubReg != AArch64::ssub && SubReg != AArch64::dsub) {
5184       LLVM_DEBUG(dbgs() << "Unsupported destination size! (" << DstSize
5185                         << "\n");
5186       return false;
5187     }
5188 
5189     Register Reg = MRI.createVirtualRegister(RC);
5190     Register DstReg = I.getOperand(0).getReg();
5191 
5192     MIB.buildInstr(TargetOpcode::COPY, {DstReg}, {}).addReg(DstVec, 0, SubReg);
5193     MachineOperand &RegOp = I.getOperand(1);
5194     RegOp.setReg(Reg);
5195     RBI.constrainGenericRegister(DstReg, *RC, MRI);
5196   } else {
5197     // We don't need a subregister copy. Save a copy by re-using the
5198     // destination register on the final insert.
5199     assert(PrevMI && "PrevMI was null?");
5200     PrevMI->getOperand(0).setReg(I.getOperand(0).getReg());
5201     constrainSelectedInstRegOperands(*PrevMI, TII, TRI, RBI);
5202   }
5203 
5204   I.eraseFromParent();
5205   return true;
5206 }
5207 
selectVectorLoadIntrinsic(unsigned Opc,unsigned NumVecs,MachineInstr & I)5208 bool AArch64InstructionSelector::selectVectorLoadIntrinsic(unsigned Opc,
5209                                                            unsigned NumVecs,
5210                                                            MachineInstr &I) {
5211   assert(I.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS);
5212   assert(Opc && "Expected an opcode?");
5213   assert(NumVecs > 1 && NumVecs < 5 && "Only support 2, 3, or 4 vectors");
5214   auto &MRI = *MIB.getMRI();
5215   LLT Ty = MRI.getType(I.getOperand(0).getReg());
5216   unsigned Size = Ty.getSizeInBits();
5217   assert((Size == 64 || Size == 128) &&
5218          "Destination must be 64 bits or 128 bits?");
5219   unsigned SubReg = Size == 64 ? AArch64::dsub0 : AArch64::qsub0;
5220   auto Ptr = I.getOperand(I.getNumOperands() - 1).getReg();
5221   assert(MRI.getType(Ptr).isPointer() && "Expected a pointer type?");
5222   auto Load = MIB.buildInstr(Opc, {Ty}, {Ptr});
5223   Load.cloneMemRefs(I);
5224   constrainSelectedInstRegOperands(*Load, TII, TRI, RBI);
5225   Register SelectedLoadDst = Load->getOperand(0).getReg();
5226   for (unsigned Idx = 0; Idx < NumVecs; ++Idx) {
5227     auto Vec = MIB.buildInstr(TargetOpcode::COPY, {I.getOperand(Idx)}, {})
5228                    .addReg(SelectedLoadDst, 0, SubReg + Idx);
5229     // Emit the subreg copies and immediately select them.
5230     // FIXME: We should refactor our copy code into an emitCopy helper and
5231     // clean up uses of this pattern elsewhere in the selector.
5232     selectCopy(*Vec, TII, MRI, TRI, RBI);
5233   }
5234   return true;
5235 }
5236 
selectIntrinsicWithSideEffects(MachineInstr & I,MachineRegisterInfo & MRI)5237 bool AArch64InstructionSelector::selectIntrinsicWithSideEffects(
5238     MachineInstr &I, MachineRegisterInfo &MRI) {
5239   // Find the intrinsic ID.
5240   unsigned IntrinID = I.getIntrinsicID();
5241 
5242   const LLT S8 = LLT::scalar(8);
5243   const LLT S16 = LLT::scalar(16);
5244   const LLT S32 = LLT::scalar(32);
5245   const LLT S64 = LLT::scalar(64);
5246   const LLT P0 = LLT::pointer(0, 64);
5247   // Select the instruction.
5248   switch (IntrinID) {
5249   default:
5250     return false;
5251   case Intrinsic::aarch64_ldxp:
5252   case Intrinsic::aarch64_ldaxp: {
5253     auto NewI = MIB.buildInstr(
5254         IntrinID == Intrinsic::aarch64_ldxp ? AArch64::LDXPX : AArch64::LDAXPX,
5255         {I.getOperand(0).getReg(), I.getOperand(1).getReg()},
5256         {I.getOperand(3)});
5257     NewI.cloneMemRefs(I);
5258     constrainSelectedInstRegOperands(*NewI, TII, TRI, RBI);
5259     break;
5260   }
5261   case Intrinsic::trap:
5262     MIB.buildInstr(AArch64::BRK, {}, {}).addImm(1);
5263     break;
5264   case Intrinsic::debugtrap:
5265     MIB.buildInstr(AArch64::BRK, {}, {}).addImm(0xF000);
5266     break;
5267   case Intrinsic::ubsantrap:
5268     MIB.buildInstr(AArch64::BRK, {}, {})
5269         .addImm(I.getOperand(1).getImm() | ('U' << 8));
5270     break;
5271   case Intrinsic::aarch64_neon_ld2: {
5272     LLT Ty = MRI.getType(I.getOperand(0).getReg());
5273     unsigned Opc = 0;
5274     if (Ty == LLT::fixed_vector(8, S8))
5275       Opc = AArch64::LD2Twov8b;
5276     else if (Ty == LLT::fixed_vector(16, S8))
5277       Opc = AArch64::LD2Twov16b;
5278     else if (Ty == LLT::fixed_vector(4, S16))
5279       Opc = AArch64::LD2Twov4h;
5280     else if (Ty == LLT::fixed_vector(8, S16))
5281       Opc = AArch64::LD2Twov8h;
5282     else if (Ty == LLT::fixed_vector(2, S32))
5283       Opc = AArch64::LD2Twov2s;
5284     else if (Ty == LLT::fixed_vector(4, S32))
5285       Opc = AArch64::LD2Twov4s;
5286     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
5287       Opc = AArch64::LD2Twov2d;
5288     else if (Ty == S64 || Ty == P0)
5289       Opc = AArch64::LD1Twov1d;
5290     else
5291       llvm_unreachable("Unexpected type for ld2!");
5292     selectVectorLoadIntrinsic(Opc, 2, I);
5293     break;
5294   }
5295   case Intrinsic::aarch64_neon_ld4: {
5296     LLT Ty = MRI.getType(I.getOperand(0).getReg());
5297     unsigned Opc = 0;
5298     if (Ty == LLT::fixed_vector(8, S8))
5299       Opc = AArch64::LD4Fourv8b;
5300     else if (Ty == LLT::fixed_vector(16, S8))
5301       Opc = AArch64::LD4Fourv16b;
5302     else if (Ty == LLT::fixed_vector(4, S16))
5303       Opc = AArch64::LD4Fourv4h;
5304     else if (Ty == LLT::fixed_vector(8, S16))
5305       Opc = AArch64::LD4Fourv8h;
5306     else if (Ty == LLT::fixed_vector(2, S32))
5307       Opc = AArch64::LD4Fourv2s;
5308     else if (Ty == LLT::fixed_vector(4, S32))
5309       Opc = AArch64::LD4Fourv4s;
5310     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
5311       Opc = AArch64::LD4Fourv2d;
5312     else if (Ty == S64 || Ty == P0)
5313       Opc = AArch64::LD1Fourv1d;
5314     else
5315       llvm_unreachable("Unexpected type for ld4!");
5316     selectVectorLoadIntrinsic(Opc, 4, I);
5317     break;
5318   }
5319   case Intrinsic::aarch64_neon_st2: {
5320     Register Src1 = I.getOperand(1).getReg();
5321     Register Src2 = I.getOperand(2).getReg();
5322     Register Ptr = I.getOperand(3).getReg();
5323     LLT Ty = MRI.getType(Src1);
5324     unsigned Opc;
5325     if (Ty == LLT::fixed_vector(8, S8))
5326       Opc = AArch64::ST2Twov8b;
5327     else if (Ty == LLT::fixed_vector(16, S8))
5328       Opc = AArch64::ST2Twov16b;
5329     else if (Ty == LLT::fixed_vector(4, S16))
5330       Opc = AArch64::ST2Twov4h;
5331     else if (Ty == LLT::fixed_vector(8, S16))
5332       Opc = AArch64::ST2Twov8h;
5333     else if (Ty == LLT::fixed_vector(2, S32))
5334       Opc = AArch64::ST2Twov2s;
5335     else if (Ty == LLT::fixed_vector(4, S32))
5336       Opc = AArch64::ST2Twov4s;
5337     else if (Ty == LLT::fixed_vector(2, S64) || Ty == LLT::fixed_vector(2, P0))
5338       Opc = AArch64::ST2Twov2d;
5339     else if (Ty == S64 || Ty == P0)
5340       Opc = AArch64::ST1Twov1d;
5341     else
5342       llvm_unreachable("Unexpected type for st2!");
5343     SmallVector<Register, 2> Regs = {Src1, Src2};
5344     Register Tuple = Ty.getSizeInBits() == 128 ? createQTuple(Regs, MIB)
5345                                                : createDTuple(Regs, MIB);
5346     auto Store = MIB.buildInstr(Opc, {}, {Tuple, Ptr});
5347     Store.cloneMemRefs(I);
5348     constrainSelectedInstRegOperands(*Store, TII, TRI, RBI);
5349     break;
5350   }
5351   }
5352 
5353   I.eraseFromParent();
5354   return true;
5355 }
5356 
selectIntrinsic(MachineInstr & I,MachineRegisterInfo & MRI)5357 bool AArch64InstructionSelector::selectIntrinsic(MachineInstr &I,
5358                                                  MachineRegisterInfo &MRI) {
5359   unsigned IntrinID = I.getIntrinsicID();
5360 
5361   switch (IntrinID) {
5362   default:
5363     break;
5364   case Intrinsic::aarch64_crypto_sha1h: {
5365     Register DstReg = I.getOperand(0).getReg();
5366     Register SrcReg = I.getOperand(2).getReg();
5367 
5368     // FIXME: Should this be an assert?
5369     if (MRI.getType(DstReg).getSizeInBits() != 32 ||
5370         MRI.getType(SrcReg).getSizeInBits() != 32)
5371       return false;
5372 
5373     // The operation has to happen on FPRs. Set up some new FPR registers for
5374     // the source and destination if they are on GPRs.
5375     if (RBI.getRegBank(SrcReg, MRI, TRI)->getID() != AArch64::FPRRegBankID) {
5376       SrcReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass);
5377       MIB.buildCopy({SrcReg}, {I.getOperand(2)});
5378 
5379       // Make sure the copy ends up getting constrained properly.
5380       RBI.constrainGenericRegister(I.getOperand(2).getReg(),
5381                                    AArch64::GPR32RegClass, MRI);
5382     }
5383 
5384     if (RBI.getRegBank(DstReg, MRI, TRI)->getID() != AArch64::FPRRegBankID)
5385       DstReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass);
5386 
5387     // Actually insert the instruction.
5388     auto SHA1Inst = MIB.buildInstr(AArch64::SHA1Hrr, {DstReg}, {SrcReg});
5389     constrainSelectedInstRegOperands(*SHA1Inst, TII, TRI, RBI);
5390 
5391     // Did we create a new register for the destination?
5392     if (DstReg != I.getOperand(0).getReg()) {
5393       // Yep. Copy the result of the instruction back into the original
5394       // destination.
5395       MIB.buildCopy({I.getOperand(0)}, {DstReg});
5396       RBI.constrainGenericRegister(I.getOperand(0).getReg(),
5397                                    AArch64::GPR32RegClass, MRI);
5398     }
5399 
5400     I.eraseFromParent();
5401     return true;
5402   }
5403   case Intrinsic::frameaddress:
5404   case Intrinsic::returnaddress: {
5405     MachineFunction &MF = *I.getParent()->getParent();
5406     MachineFrameInfo &MFI = MF.getFrameInfo();
5407 
5408     unsigned Depth = I.getOperand(2).getImm();
5409     Register DstReg = I.getOperand(0).getReg();
5410     RBI.constrainGenericRegister(DstReg, AArch64::GPR64RegClass, MRI);
5411 
5412     if (Depth == 0 && IntrinID == Intrinsic::returnaddress) {
5413       if (!MFReturnAddr) {
5414         // Insert the copy from LR/X30 into the entry block, before it can be
5415         // clobbered by anything.
5416         MFI.setReturnAddressIsTaken(true);
5417         MFReturnAddr = getFunctionLiveInPhysReg(MF, TII, AArch64::LR,
5418                                                 AArch64::GPR64RegClass);
5419       }
5420 
5421       if (STI.hasPAuth()) {
5422         MIB.buildInstr(AArch64::XPACI, {DstReg}, {MFReturnAddr});
5423       } else {
5424         MIB.buildCopy({Register(AArch64::LR)}, {MFReturnAddr});
5425         MIB.buildInstr(AArch64::XPACLRI);
5426         MIB.buildCopy({DstReg}, {Register(AArch64::LR)});
5427       }
5428 
5429       I.eraseFromParent();
5430       return true;
5431     }
5432 
5433     MFI.setFrameAddressIsTaken(true);
5434     Register FrameAddr(AArch64::FP);
5435     while (Depth--) {
5436       Register NextFrame = MRI.createVirtualRegister(&AArch64::GPR64spRegClass);
5437       auto Ldr =
5438           MIB.buildInstr(AArch64::LDRXui, {NextFrame}, {FrameAddr}).addImm(0);
5439       constrainSelectedInstRegOperands(*Ldr, TII, TRI, RBI);
5440       FrameAddr = NextFrame;
5441     }
5442 
5443     if (IntrinID == Intrinsic::frameaddress)
5444       MIB.buildCopy({DstReg}, {FrameAddr});
5445     else {
5446       MFI.setReturnAddressIsTaken(true);
5447 
5448       if (STI.hasPAuth()) {
5449         Register TmpReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
5450         MIB.buildInstr(AArch64::LDRXui, {TmpReg}, {FrameAddr}).addImm(1);
5451         MIB.buildInstr(AArch64::XPACI, {DstReg}, {TmpReg});
5452       } else {
5453         MIB.buildInstr(AArch64::LDRXui, {Register(AArch64::LR)}, {FrameAddr})
5454             .addImm(1);
5455         MIB.buildInstr(AArch64::XPACLRI);
5456         MIB.buildCopy({DstReg}, {Register(AArch64::LR)});
5457       }
5458     }
5459 
5460     I.eraseFromParent();
5461     return true;
5462   }
5463   case Intrinsic::swift_async_context_addr:
5464     auto Sub = MIB.buildInstr(AArch64::SUBXri, {I.getOperand(0).getReg()},
5465                               {Register(AArch64::FP)})
5466                    .addImm(8)
5467                    .addImm(0);
5468     constrainSelectedInstRegOperands(*Sub, TII, TRI, RBI);
5469 
5470     MF->getFrameInfo().setFrameAddressIsTaken(true);
5471     MF->getInfo<AArch64FunctionInfo>()->setHasSwiftAsyncContext(true);
5472     I.eraseFromParent();
5473     return true;
5474   }
5475   return false;
5476 }
5477 
5478 InstructionSelector::ComplexRendererFns
selectShiftA_32(const MachineOperand & Root) const5479 AArch64InstructionSelector::selectShiftA_32(const MachineOperand &Root) const {
5480   auto MaybeImmed = getImmedFromMO(Root);
5481   if (MaybeImmed == None || *MaybeImmed > 31)
5482     return None;
5483   uint64_t Enc = (32 - *MaybeImmed) & 0x1f;
5484   return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}};
5485 }
5486 
5487 InstructionSelector::ComplexRendererFns
selectShiftB_32(const MachineOperand & Root) const5488 AArch64InstructionSelector::selectShiftB_32(const MachineOperand &Root) const {
5489   auto MaybeImmed = getImmedFromMO(Root);
5490   if (MaybeImmed == None || *MaybeImmed > 31)
5491     return None;
5492   uint64_t Enc = 31 - *MaybeImmed;
5493   return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}};
5494 }
5495 
5496 InstructionSelector::ComplexRendererFns
selectShiftA_64(const MachineOperand & Root) const5497 AArch64InstructionSelector::selectShiftA_64(const MachineOperand &Root) const {
5498   auto MaybeImmed = getImmedFromMO(Root);
5499   if (MaybeImmed == None || *MaybeImmed > 63)
5500     return None;
5501   uint64_t Enc = (64 - *MaybeImmed) & 0x3f;
5502   return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}};
5503 }
5504 
5505 InstructionSelector::ComplexRendererFns
selectShiftB_64(const MachineOperand & Root) const5506 AArch64InstructionSelector::selectShiftB_64(const MachineOperand &Root) const {
5507   auto MaybeImmed = getImmedFromMO(Root);
5508   if (MaybeImmed == None || *MaybeImmed > 63)
5509     return None;
5510   uint64_t Enc = 63 - *MaybeImmed;
5511   return {{[=](MachineInstrBuilder &MIB) { MIB.addImm(Enc); }}};
5512 }
5513 
5514 /// Helper to select an immediate value that can be represented as a 12-bit
5515 /// value shifted left by either 0 or 12. If it is possible to do so, return
5516 /// the immediate and shift value. If not, return None.
5517 ///
5518 /// Used by selectArithImmed and selectNegArithImmed.
5519 InstructionSelector::ComplexRendererFns
select12BitValueWithLeftShift(uint64_t Immed) const5520 AArch64InstructionSelector::select12BitValueWithLeftShift(
5521     uint64_t Immed) const {
5522   unsigned ShiftAmt;
5523   if (Immed >> 12 == 0) {
5524     ShiftAmt = 0;
5525   } else if ((Immed & 0xfff) == 0 && Immed >> 24 == 0) {
5526     ShiftAmt = 12;
5527     Immed = Immed >> 12;
5528   } else
5529     return None;
5530 
5531   unsigned ShVal = AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt);
5532   return {{
5533       [=](MachineInstrBuilder &MIB) { MIB.addImm(Immed); },
5534       [=](MachineInstrBuilder &MIB) { MIB.addImm(ShVal); },
5535   }};
5536 }
5537 
5538 /// SelectArithImmed - Select an immediate value that can be represented as
5539 /// a 12-bit value shifted left by either 0 or 12.  If so, return true with
5540 /// Val set to the 12-bit value and Shift set to the shifter operand.
5541 InstructionSelector::ComplexRendererFns
selectArithImmed(MachineOperand & Root) const5542 AArch64InstructionSelector::selectArithImmed(MachineOperand &Root) const {
5543   // This function is called from the addsub_shifted_imm ComplexPattern,
5544   // which lists [imm] as the list of opcode it's interested in, however
5545   // we still need to check whether the operand is actually an immediate
5546   // here because the ComplexPattern opcode list is only used in
5547   // root-level opcode matching.
5548   auto MaybeImmed = getImmedFromMO(Root);
5549   if (MaybeImmed == None)
5550     return None;
5551   return select12BitValueWithLeftShift(*MaybeImmed);
5552 }
5553 
5554 /// SelectNegArithImmed - As above, but negates the value before trying to
5555 /// select it.
5556 InstructionSelector::ComplexRendererFns
selectNegArithImmed(MachineOperand & Root) const5557 AArch64InstructionSelector::selectNegArithImmed(MachineOperand &Root) const {
5558   // We need a register here, because we need to know if we have a 64 or 32
5559   // bit immediate.
5560   if (!Root.isReg())
5561     return None;
5562   auto MaybeImmed = getImmedFromMO(Root);
5563   if (MaybeImmed == None)
5564     return None;
5565   uint64_t Immed = *MaybeImmed;
5566 
5567   // This negation is almost always valid, but "cmp wN, #0" and "cmn wN, #0"
5568   // have the opposite effect on the C flag, so this pattern mustn't match under
5569   // those circumstances.
5570   if (Immed == 0)
5571     return None;
5572 
5573   // Check if we're dealing with a 32-bit type on the root or a 64-bit type on
5574   // the root.
5575   MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
5576   if (MRI.getType(Root.getReg()).getSizeInBits() == 32)
5577     Immed = ~((uint32_t)Immed) + 1;
5578   else
5579     Immed = ~Immed + 1ULL;
5580 
5581   if (Immed & 0xFFFFFFFFFF000000ULL)
5582     return None;
5583 
5584   Immed &= 0xFFFFFFULL;
5585   return select12BitValueWithLeftShift(Immed);
5586 }
5587 
5588 /// Return true if it is worth folding MI into an extended register. That is,
5589 /// if it's safe to pull it into the addressing mode of a load or store as a
5590 /// shift.
isWorthFoldingIntoExtendedReg(MachineInstr & MI,const MachineRegisterInfo & MRI) const5591 bool AArch64InstructionSelector::isWorthFoldingIntoExtendedReg(
5592     MachineInstr &MI, const MachineRegisterInfo &MRI) const {
5593   // Always fold if there is one use, or if we're optimizing for size.
5594   Register DefReg = MI.getOperand(0).getReg();
5595   if (MRI.hasOneNonDBGUse(DefReg) ||
5596       MI.getParent()->getParent()->getFunction().hasOptSize())
5597     return true;
5598 
5599   // It's better to avoid folding and recomputing shifts when we don't have a
5600   // fastpath.
5601   if (!STI.hasLSLFast())
5602     return false;
5603 
5604   // We have a fastpath, so folding a shift in and potentially computing it
5605   // many times may be beneficial. Check if this is only used in memory ops.
5606   // If it is, then we should fold.
5607   return all_of(MRI.use_nodbg_instructions(DefReg),
5608                 [](MachineInstr &Use) { return Use.mayLoadOrStore(); });
5609 }
5610 
isSignExtendShiftType(AArch64_AM::ShiftExtendType Type)5611 static bool isSignExtendShiftType(AArch64_AM::ShiftExtendType Type) {
5612   switch (Type) {
5613   case AArch64_AM::SXTB:
5614   case AArch64_AM::SXTH:
5615   case AArch64_AM::SXTW:
5616     return true;
5617   default:
5618     return false;
5619   }
5620 }
5621 
5622 InstructionSelector::ComplexRendererFns
selectExtendedSHL(MachineOperand & Root,MachineOperand & Base,MachineOperand & Offset,unsigned SizeInBytes,bool WantsExt) const5623 AArch64InstructionSelector::selectExtendedSHL(
5624     MachineOperand &Root, MachineOperand &Base, MachineOperand &Offset,
5625     unsigned SizeInBytes, bool WantsExt) const {
5626   assert(Base.isReg() && "Expected base to be a register operand");
5627   assert(Offset.isReg() && "Expected offset to be a register operand");
5628 
5629   MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
5630   MachineInstr *OffsetInst = MRI.getVRegDef(Offset.getReg());
5631   if (!OffsetInst)
5632     return None;
5633 
5634   unsigned OffsetOpc = OffsetInst->getOpcode();
5635   bool LookedThroughZExt = false;
5636   if (OffsetOpc != TargetOpcode::G_SHL && OffsetOpc != TargetOpcode::G_MUL) {
5637     // Try to look through a ZEXT.
5638     if (OffsetOpc != TargetOpcode::G_ZEXT || !WantsExt)
5639       return None;
5640 
5641     OffsetInst = MRI.getVRegDef(OffsetInst->getOperand(1).getReg());
5642     OffsetOpc = OffsetInst->getOpcode();
5643     LookedThroughZExt = true;
5644 
5645     if (OffsetOpc != TargetOpcode::G_SHL && OffsetOpc != TargetOpcode::G_MUL)
5646       return None;
5647   }
5648   // Make sure that the memory op is a valid size.
5649   int64_t LegalShiftVal = Log2_32(SizeInBytes);
5650   if (LegalShiftVal == 0)
5651     return None;
5652   if (!isWorthFoldingIntoExtendedReg(*OffsetInst, MRI))
5653     return None;
5654 
5655   // Now, try to find the specific G_CONSTANT. Start by assuming that the
5656   // register we will offset is the LHS, and the register containing the
5657   // constant is the RHS.
5658   Register OffsetReg = OffsetInst->getOperand(1).getReg();
5659   Register ConstantReg = OffsetInst->getOperand(2).getReg();
5660   auto ValAndVReg = getIConstantVRegValWithLookThrough(ConstantReg, MRI);
5661   if (!ValAndVReg) {
5662     // We didn't get a constant on the RHS. If the opcode is a shift, then
5663     // we're done.
5664     if (OffsetOpc == TargetOpcode::G_SHL)
5665       return None;
5666 
5667     // If we have a G_MUL, we can use either register. Try looking at the RHS.
5668     std::swap(OffsetReg, ConstantReg);
5669     ValAndVReg = getIConstantVRegValWithLookThrough(ConstantReg, MRI);
5670     if (!ValAndVReg)
5671       return None;
5672   }
5673 
5674   // The value must fit into 3 bits, and must be positive. Make sure that is
5675   // true.
5676   int64_t ImmVal = ValAndVReg->Value.getSExtValue();
5677 
5678   // Since we're going to pull this into a shift, the constant value must be
5679   // a power of 2. If we got a multiply, then we need to check this.
5680   if (OffsetOpc == TargetOpcode::G_MUL) {
5681     if (!isPowerOf2_32(ImmVal))
5682       return None;
5683 
5684     // Got a power of 2. So, the amount we'll shift is the log base-2 of that.
5685     ImmVal = Log2_32(ImmVal);
5686   }
5687 
5688   if ((ImmVal & 0x7) != ImmVal)
5689     return None;
5690 
5691   // We are only allowed to shift by LegalShiftVal. This shift value is built
5692   // into the instruction, so we can't just use whatever we want.
5693   if (ImmVal != LegalShiftVal)
5694     return None;
5695 
5696   unsigned SignExtend = 0;
5697   if (WantsExt) {
5698     // Check if the offset is defined by an extend, unless we looked through a
5699     // G_ZEXT earlier.
5700     if (!LookedThroughZExt) {
5701       MachineInstr *ExtInst = getDefIgnoringCopies(OffsetReg, MRI);
5702       auto Ext = getExtendTypeForInst(*ExtInst, MRI, true);
5703       if (Ext == AArch64_AM::InvalidShiftExtend)
5704         return None;
5705 
5706       SignExtend = isSignExtendShiftType(Ext) ? 1 : 0;
5707       // We only support SXTW for signed extension here.
5708       if (SignExtend && Ext != AArch64_AM::SXTW)
5709         return None;
5710       OffsetReg = ExtInst->getOperand(1).getReg();
5711     }
5712 
5713     // Need a 32-bit wide register here.
5714     MachineIRBuilder MIB(*MRI.getVRegDef(Root.getReg()));
5715     OffsetReg = moveScalarRegClass(OffsetReg, AArch64::GPR32RegClass, MIB);
5716   }
5717 
5718   // We can use the LHS of the GEP as the base, and the LHS of the shift as an
5719   // offset. Signify that we are shifting by setting the shift flag to 1.
5720   return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(Base.getReg()); },
5721            [=](MachineInstrBuilder &MIB) { MIB.addUse(OffsetReg); },
5722            [=](MachineInstrBuilder &MIB) {
5723              // Need to add both immediates here to make sure that they are both
5724              // added to the instruction.
5725              MIB.addImm(SignExtend);
5726              MIB.addImm(1);
5727            }}};
5728 }
5729 
5730 /// This is used for computing addresses like this:
5731 ///
5732 /// ldr x1, [x2, x3, lsl #3]
5733 ///
5734 /// Where x2 is the base register, and x3 is an offset register. The shift-left
5735 /// is a constant value specific to this load instruction. That is, we'll never
5736 /// see anything other than a 3 here (which corresponds to the size of the
5737 /// element being loaded.)
5738 InstructionSelector::ComplexRendererFns
selectAddrModeShiftedExtendXReg(MachineOperand & Root,unsigned SizeInBytes) const5739 AArch64InstructionSelector::selectAddrModeShiftedExtendXReg(
5740     MachineOperand &Root, unsigned SizeInBytes) const {
5741   if (!Root.isReg())
5742     return None;
5743   MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
5744 
5745   // We want to find something like this:
5746   //
5747   // val = G_CONSTANT LegalShiftVal
5748   // shift = G_SHL off_reg val
5749   // ptr = G_PTR_ADD base_reg shift
5750   // x = G_LOAD ptr
5751   //
5752   // And fold it into this addressing mode:
5753   //
5754   // ldr x, [base_reg, off_reg, lsl #LegalShiftVal]
5755 
5756   // Check if we can find the G_PTR_ADD.
5757   MachineInstr *PtrAdd =
5758       getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI);
5759   if (!PtrAdd || !isWorthFoldingIntoExtendedReg(*PtrAdd, MRI))
5760     return None;
5761 
5762   // Now, try to match an opcode which will match our specific offset.
5763   // We want a G_SHL or a G_MUL.
5764   MachineInstr *OffsetInst =
5765       getDefIgnoringCopies(PtrAdd->getOperand(2).getReg(), MRI);
5766   return selectExtendedSHL(Root, PtrAdd->getOperand(1),
5767                            OffsetInst->getOperand(0), SizeInBytes,
5768                            /*WantsExt=*/false);
5769 }
5770 
5771 /// This is used for computing addresses like this:
5772 ///
5773 /// ldr x1, [x2, x3]
5774 ///
5775 /// Where x2 is the base register, and x3 is an offset register.
5776 ///
5777 /// When possible (or profitable) to fold a G_PTR_ADD into the address calculation,
5778 /// this will do so. Otherwise, it will return None.
5779 InstructionSelector::ComplexRendererFns
selectAddrModeRegisterOffset(MachineOperand & Root) const5780 AArch64InstructionSelector::selectAddrModeRegisterOffset(
5781     MachineOperand &Root) const {
5782   MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
5783 
5784   // We need a GEP.
5785   MachineInstr *Gep = MRI.getVRegDef(Root.getReg());
5786   if (!Gep || Gep->getOpcode() != TargetOpcode::G_PTR_ADD)
5787     return None;
5788 
5789   // If this is used more than once, let's not bother folding.
5790   // TODO: Check if they are memory ops. If they are, then we can still fold
5791   // without having to recompute anything.
5792   if (!MRI.hasOneNonDBGUse(Gep->getOperand(0).getReg()))
5793     return None;
5794 
5795   // Base is the GEP's LHS, offset is its RHS.
5796   return {{[=](MachineInstrBuilder &MIB) {
5797              MIB.addUse(Gep->getOperand(1).getReg());
5798            },
5799            [=](MachineInstrBuilder &MIB) {
5800              MIB.addUse(Gep->getOperand(2).getReg());
5801            },
5802            [=](MachineInstrBuilder &MIB) {
5803              // Need to add both immediates here to make sure that they are both
5804              // added to the instruction.
5805              MIB.addImm(0);
5806              MIB.addImm(0);
5807            }}};
5808 }
5809 
5810 /// This is intended to be equivalent to selectAddrModeXRO in
5811 /// AArch64ISelDAGtoDAG. It's used for selecting X register offset loads.
5812 InstructionSelector::ComplexRendererFns
selectAddrModeXRO(MachineOperand & Root,unsigned SizeInBytes) const5813 AArch64InstructionSelector::selectAddrModeXRO(MachineOperand &Root,
5814                                               unsigned SizeInBytes) const {
5815   MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
5816   if (!Root.isReg())
5817     return None;
5818   MachineInstr *PtrAdd =
5819       getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI);
5820   if (!PtrAdd)
5821     return None;
5822 
5823   // Check for an immediates which cannot be encoded in the [base + imm]
5824   // addressing mode, and can't be encoded in an add/sub. If this happens, we'll
5825   // end up with code like:
5826   //
5827   // mov x0, wide
5828   // add x1 base, x0
5829   // ldr x2, [x1, x0]
5830   //
5831   // In this situation, we can use the [base, xreg] addressing mode to save an
5832   // add/sub:
5833   //
5834   // mov x0, wide
5835   // ldr x2, [base, x0]
5836   auto ValAndVReg =
5837       getIConstantVRegValWithLookThrough(PtrAdd->getOperand(2).getReg(), MRI);
5838   if (ValAndVReg) {
5839     unsigned Scale = Log2_32(SizeInBytes);
5840     int64_t ImmOff = ValAndVReg->Value.getSExtValue();
5841 
5842     // Skip immediates that can be selected in the load/store addresing
5843     // mode.
5844     if (ImmOff % SizeInBytes == 0 && ImmOff >= 0 &&
5845         ImmOff < (0x1000 << Scale))
5846       return None;
5847 
5848     // Helper lambda to decide whether or not it is preferable to emit an add.
5849     auto isPreferredADD = [](int64_t ImmOff) {
5850       // Constants in [0x0, 0xfff] can be encoded in an add.
5851       if ((ImmOff & 0xfffffffffffff000LL) == 0x0LL)
5852         return true;
5853 
5854       // Can it be encoded in an add lsl #12?
5855       if ((ImmOff & 0xffffffffff000fffLL) != 0x0LL)
5856         return false;
5857 
5858       // It can be encoded in an add lsl #12, but we may not want to. If it is
5859       // possible to select this as a single movz, then prefer that. A single
5860       // movz is faster than an add with a shift.
5861       return (ImmOff & 0xffffffffff00ffffLL) != 0x0LL &&
5862              (ImmOff & 0xffffffffffff0fffLL) != 0x0LL;
5863     };
5864 
5865     // If the immediate can be encoded in a single add/sub, then bail out.
5866     if (isPreferredADD(ImmOff) || isPreferredADD(-ImmOff))
5867       return None;
5868   }
5869 
5870   // Try to fold shifts into the addressing mode.
5871   auto AddrModeFns = selectAddrModeShiftedExtendXReg(Root, SizeInBytes);
5872   if (AddrModeFns)
5873     return AddrModeFns;
5874 
5875   // If that doesn't work, see if it's possible to fold in registers from
5876   // a GEP.
5877   return selectAddrModeRegisterOffset(Root);
5878 }
5879 
5880 /// This is used for computing addresses like this:
5881 ///
5882 /// ldr x0, [xBase, wOffset, sxtw #LegalShiftVal]
5883 ///
5884 /// Where we have a 64-bit base register, a 32-bit offset register, and an
5885 /// extend (which may or may not be signed).
5886 InstructionSelector::ComplexRendererFns
selectAddrModeWRO(MachineOperand & Root,unsigned SizeInBytes) const5887 AArch64InstructionSelector::selectAddrModeWRO(MachineOperand &Root,
5888                                               unsigned SizeInBytes) const {
5889   MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
5890 
5891   MachineInstr *PtrAdd =
5892       getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI);
5893   if (!PtrAdd || !isWorthFoldingIntoExtendedReg(*PtrAdd, MRI))
5894     return None;
5895 
5896   MachineOperand &LHS = PtrAdd->getOperand(1);
5897   MachineOperand &RHS = PtrAdd->getOperand(2);
5898   MachineInstr *OffsetInst = getDefIgnoringCopies(RHS.getReg(), MRI);
5899 
5900   // The first case is the same as selectAddrModeXRO, except we need an extend.
5901   // In this case, we try to find a shift and extend, and fold them into the
5902   // addressing mode.
5903   //
5904   // E.g.
5905   //
5906   // off_reg = G_Z/S/ANYEXT ext_reg
5907   // val = G_CONSTANT LegalShiftVal
5908   // shift = G_SHL off_reg val
5909   // ptr = G_PTR_ADD base_reg shift
5910   // x = G_LOAD ptr
5911   //
5912   // In this case we can get a load like this:
5913   //
5914   // ldr x0, [base_reg, ext_reg, sxtw #LegalShiftVal]
5915   auto ExtendedShl = selectExtendedSHL(Root, LHS, OffsetInst->getOperand(0),
5916                                        SizeInBytes, /*WantsExt=*/true);
5917   if (ExtendedShl)
5918     return ExtendedShl;
5919 
5920   // There was no shift. We can try and fold a G_Z/S/ANYEXT in alone though.
5921   //
5922   // e.g.
5923   // ldr something, [base_reg, ext_reg, sxtw]
5924   if (!isWorthFoldingIntoExtendedReg(*OffsetInst, MRI))
5925     return None;
5926 
5927   // Check if this is an extend. We'll get an extend type if it is.
5928   AArch64_AM::ShiftExtendType Ext =
5929       getExtendTypeForInst(*OffsetInst, MRI, /*IsLoadStore=*/true);
5930   if (Ext == AArch64_AM::InvalidShiftExtend)
5931     return None;
5932 
5933   // Need a 32-bit wide register.
5934   MachineIRBuilder MIB(*PtrAdd);
5935   Register ExtReg = moveScalarRegClass(OffsetInst->getOperand(1).getReg(),
5936                                        AArch64::GPR32RegClass, MIB);
5937   unsigned SignExtend = Ext == AArch64_AM::SXTW;
5938 
5939   // Base is LHS, offset is ExtReg.
5940   return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(LHS.getReg()); },
5941            [=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); },
5942            [=](MachineInstrBuilder &MIB) {
5943              MIB.addImm(SignExtend);
5944              MIB.addImm(0);
5945            }}};
5946 }
5947 
5948 /// Select a "register plus unscaled signed 9-bit immediate" address.  This
5949 /// should only match when there is an offset that is not valid for a scaled
5950 /// immediate addressing mode.  The "Size" argument is the size in bytes of the
5951 /// memory reference, which is needed here to know what is valid for a scaled
5952 /// immediate.
5953 InstructionSelector::ComplexRendererFns
selectAddrModeUnscaled(MachineOperand & Root,unsigned Size) const5954 AArch64InstructionSelector::selectAddrModeUnscaled(MachineOperand &Root,
5955                                                    unsigned Size) const {
5956   MachineRegisterInfo &MRI =
5957       Root.getParent()->getParent()->getParent()->getRegInfo();
5958 
5959   if (!Root.isReg())
5960     return None;
5961 
5962   if (!isBaseWithConstantOffset(Root, MRI))
5963     return None;
5964 
5965   MachineInstr *RootDef = MRI.getVRegDef(Root.getReg());
5966   if (!RootDef)
5967     return None;
5968 
5969   MachineOperand &OffImm = RootDef->getOperand(2);
5970   if (!OffImm.isReg())
5971     return None;
5972   MachineInstr *RHS = MRI.getVRegDef(OffImm.getReg());
5973   if (!RHS || RHS->getOpcode() != TargetOpcode::G_CONSTANT)
5974     return None;
5975   int64_t RHSC;
5976   MachineOperand &RHSOp1 = RHS->getOperand(1);
5977   if (!RHSOp1.isCImm() || RHSOp1.getCImm()->getBitWidth() > 64)
5978     return None;
5979   RHSC = RHSOp1.getCImm()->getSExtValue();
5980 
5981   // If the offset is valid as a scaled immediate, don't match here.
5982   if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && RHSC < (0x1000 << Log2_32(Size)))
5983     return None;
5984   if (RHSC >= -256 && RHSC < 256) {
5985     MachineOperand &Base = RootDef->getOperand(1);
5986     return {{
5987         [=](MachineInstrBuilder &MIB) { MIB.add(Base); },
5988         [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC); },
5989     }};
5990   }
5991   return None;
5992 }
5993 
5994 InstructionSelector::ComplexRendererFns
tryFoldAddLowIntoImm(MachineInstr & RootDef,unsigned Size,MachineRegisterInfo & MRI) const5995 AArch64InstructionSelector::tryFoldAddLowIntoImm(MachineInstr &RootDef,
5996                                                  unsigned Size,
5997                                                  MachineRegisterInfo &MRI) const {
5998   if (RootDef.getOpcode() != AArch64::G_ADD_LOW)
5999     return None;
6000   MachineInstr &Adrp = *MRI.getVRegDef(RootDef.getOperand(1).getReg());
6001   if (Adrp.getOpcode() != AArch64::ADRP)
6002     return None;
6003 
6004   // TODO: add heuristics like isWorthFoldingADDlow() from SelectionDAG.
6005   auto Offset = Adrp.getOperand(1).getOffset();
6006   if (Offset % Size != 0)
6007     return None;
6008 
6009   auto GV = Adrp.getOperand(1).getGlobal();
6010   if (GV->isThreadLocal())
6011     return None;
6012 
6013   auto &MF = *RootDef.getParent()->getParent();
6014   if (GV->getPointerAlignment(MF.getDataLayout()) < Size)
6015     return None;
6016 
6017   unsigned OpFlags = STI.ClassifyGlobalReference(GV, MF.getTarget());
6018   MachineIRBuilder MIRBuilder(RootDef);
6019   Register AdrpReg = Adrp.getOperand(0).getReg();
6020   return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(AdrpReg); },
6021            [=](MachineInstrBuilder &MIB) {
6022              MIB.addGlobalAddress(GV, Offset,
6023                                   OpFlags | AArch64II::MO_PAGEOFF |
6024                                       AArch64II::MO_NC);
6025            }}};
6026 }
6027 
6028 /// Select a "register plus scaled unsigned 12-bit immediate" address.  The
6029 /// "Size" argument is the size in bytes of the memory reference, which
6030 /// determines the scale.
6031 InstructionSelector::ComplexRendererFns
selectAddrModeIndexed(MachineOperand & Root,unsigned Size) const6032 AArch64InstructionSelector::selectAddrModeIndexed(MachineOperand &Root,
6033                                                   unsigned Size) const {
6034   MachineFunction &MF = *Root.getParent()->getParent()->getParent();
6035   MachineRegisterInfo &MRI = MF.getRegInfo();
6036 
6037   if (!Root.isReg())
6038     return None;
6039 
6040   MachineInstr *RootDef = MRI.getVRegDef(Root.getReg());
6041   if (!RootDef)
6042     return None;
6043 
6044   if (RootDef->getOpcode() == TargetOpcode::G_FRAME_INDEX) {
6045     return {{
6046         [=](MachineInstrBuilder &MIB) { MIB.add(RootDef->getOperand(1)); },
6047         [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },
6048     }};
6049   }
6050 
6051   CodeModel::Model CM = MF.getTarget().getCodeModel();
6052   // Check if we can fold in the ADD of small code model ADRP + ADD address.
6053   if (CM == CodeModel::Small) {
6054     auto OpFns = tryFoldAddLowIntoImm(*RootDef, Size, MRI);
6055     if (OpFns)
6056       return OpFns;
6057   }
6058 
6059   if (isBaseWithConstantOffset(Root, MRI)) {
6060     MachineOperand &LHS = RootDef->getOperand(1);
6061     MachineOperand &RHS = RootDef->getOperand(2);
6062     MachineInstr *LHSDef = MRI.getVRegDef(LHS.getReg());
6063     MachineInstr *RHSDef = MRI.getVRegDef(RHS.getReg());
6064     if (LHSDef && RHSDef) {
6065       int64_t RHSC = (int64_t)RHSDef->getOperand(1).getCImm()->getZExtValue();
6066       unsigned Scale = Log2_32(Size);
6067       if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && RHSC < (0x1000 << Scale)) {
6068         if (LHSDef->getOpcode() == TargetOpcode::G_FRAME_INDEX)
6069           return {{
6070               [=](MachineInstrBuilder &MIB) { MIB.add(LHSDef->getOperand(1)); },
6071               [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC >> Scale); },
6072           }};
6073 
6074         return {{
6075             [=](MachineInstrBuilder &MIB) { MIB.add(LHS); },
6076             [=](MachineInstrBuilder &MIB) { MIB.addImm(RHSC >> Scale); },
6077         }};
6078       }
6079     }
6080   }
6081 
6082   // Before falling back to our general case, check if the unscaled
6083   // instructions can handle this. If so, that's preferable.
6084   if (selectAddrModeUnscaled(Root, Size).hasValue())
6085     return None;
6086 
6087   return {{
6088       [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
6089       [=](MachineInstrBuilder &MIB) { MIB.addImm(0); },
6090   }};
6091 }
6092 
6093 /// Given a shift instruction, return the correct shift type for that
6094 /// instruction.
getShiftTypeForInst(MachineInstr & MI)6095 static AArch64_AM::ShiftExtendType getShiftTypeForInst(MachineInstr &MI) {
6096   switch (MI.getOpcode()) {
6097   default:
6098     return AArch64_AM::InvalidShiftExtend;
6099   case TargetOpcode::G_SHL:
6100     return AArch64_AM::LSL;
6101   case TargetOpcode::G_LSHR:
6102     return AArch64_AM::LSR;
6103   case TargetOpcode::G_ASHR:
6104     return AArch64_AM::ASR;
6105   case TargetOpcode::G_ROTR:
6106     return AArch64_AM::ROR;
6107   }
6108 }
6109 
6110 /// Select a "shifted register" operand. If the value is not shifted, set the
6111 /// shift operand to a default value of "lsl 0".
6112 InstructionSelector::ComplexRendererFns
selectShiftedRegister(MachineOperand & Root,bool AllowROR) const6113 AArch64InstructionSelector::selectShiftedRegister(MachineOperand &Root,
6114                                                   bool AllowROR) const {
6115   if (!Root.isReg())
6116     return None;
6117   MachineRegisterInfo &MRI =
6118       Root.getParent()->getParent()->getParent()->getRegInfo();
6119 
6120   // Check if the operand is defined by an instruction which corresponds to
6121   // a ShiftExtendType. E.g. a G_SHL, G_LSHR, etc.
6122   MachineInstr *ShiftInst = MRI.getVRegDef(Root.getReg());
6123   if (!ShiftInst)
6124     return None;
6125   AArch64_AM::ShiftExtendType ShType = getShiftTypeForInst(*ShiftInst);
6126   if (ShType == AArch64_AM::InvalidShiftExtend)
6127     return None;
6128   if (ShType == AArch64_AM::ROR && !AllowROR)
6129     return None;
6130   if (!isWorthFoldingIntoExtendedReg(*ShiftInst, MRI))
6131     return None;
6132 
6133   // Need an immediate on the RHS.
6134   MachineOperand &ShiftRHS = ShiftInst->getOperand(2);
6135   auto Immed = getImmedFromMO(ShiftRHS);
6136   if (!Immed)
6137     return None;
6138 
6139   // We have something that we can fold. Fold in the shift's LHS and RHS into
6140   // the instruction.
6141   MachineOperand &ShiftLHS = ShiftInst->getOperand(1);
6142   Register ShiftReg = ShiftLHS.getReg();
6143 
6144   unsigned NumBits = MRI.getType(ShiftReg).getSizeInBits();
6145   unsigned Val = *Immed & (NumBits - 1);
6146   unsigned ShiftVal = AArch64_AM::getShifterImm(ShType, Val);
6147 
6148   return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ShiftReg); },
6149            [=](MachineInstrBuilder &MIB) { MIB.addImm(ShiftVal); }}};
6150 }
6151 
getExtendTypeForInst(MachineInstr & MI,MachineRegisterInfo & MRI,bool IsLoadStore) const6152 AArch64_AM::ShiftExtendType AArch64InstructionSelector::getExtendTypeForInst(
6153     MachineInstr &MI, MachineRegisterInfo &MRI, bool IsLoadStore) const {
6154   unsigned Opc = MI.getOpcode();
6155 
6156   // Handle explicit extend instructions first.
6157   if (Opc == TargetOpcode::G_SEXT || Opc == TargetOpcode::G_SEXT_INREG) {
6158     unsigned Size;
6159     if (Opc == TargetOpcode::G_SEXT)
6160       Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
6161     else
6162       Size = MI.getOperand(2).getImm();
6163     assert(Size != 64 && "Extend from 64 bits?");
6164     switch (Size) {
6165     case 8:
6166       return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::SXTB;
6167     case 16:
6168       return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::SXTH;
6169     case 32:
6170       return AArch64_AM::SXTW;
6171     default:
6172       return AArch64_AM::InvalidShiftExtend;
6173     }
6174   }
6175 
6176   if (Opc == TargetOpcode::G_ZEXT || Opc == TargetOpcode::G_ANYEXT) {
6177     unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
6178     assert(Size != 64 && "Extend from 64 bits?");
6179     switch (Size) {
6180     case 8:
6181       return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::UXTB;
6182     case 16:
6183       return IsLoadStore ? AArch64_AM::InvalidShiftExtend : AArch64_AM::UXTH;
6184     case 32:
6185       return AArch64_AM::UXTW;
6186     default:
6187       return AArch64_AM::InvalidShiftExtend;
6188     }
6189   }
6190 
6191   // Don't have an explicit extend. Try to handle a G_AND with a constant mask
6192   // on the RHS.
6193   if (Opc != TargetOpcode::G_AND)
6194     return AArch64_AM::InvalidShiftExtend;
6195 
6196   Optional<uint64_t> MaybeAndMask = getImmedFromMO(MI.getOperand(2));
6197   if (!MaybeAndMask)
6198     return AArch64_AM::InvalidShiftExtend;
6199   uint64_t AndMask = *MaybeAndMask;
6200   switch (AndMask) {
6201   default:
6202     return AArch64_AM::InvalidShiftExtend;
6203   case 0xFF:
6204     return !IsLoadStore ? AArch64_AM::UXTB : AArch64_AM::InvalidShiftExtend;
6205   case 0xFFFF:
6206     return !IsLoadStore ? AArch64_AM::UXTH : AArch64_AM::InvalidShiftExtend;
6207   case 0xFFFFFFFF:
6208     return AArch64_AM::UXTW;
6209   }
6210 }
6211 
moveScalarRegClass(Register Reg,const TargetRegisterClass & RC,MachineIRBuilder & MIB) const6212 Register AArch64InstructionSelector::moveScalarRegClass(
6213     Register Reg, const TargetRegisterClass &RC, MachineIRBuilder &MIB) const {
6214   MachineRegisterInfo &MRI = *MIB.getMRI();
6215   auto Ty = MRI.getType(Reg);
6216   assert(!Ty.isVector() && "Expected scalars only!");
6217   if (Ty.getSizeInBits() == TRI.getRegSizeInBits(RC))
6218     return Reg;
6219 
6220   // Create a copy and immediately select it.
6221   // FIXME: We should have an emitCopy function?
6222   auto Copy = MIB.buildCopy({&RC}, {Reg});
6223   selectCopy(*Copy, TII, MRI, TRI, RBI);
6224   return Copy.getReg(0);
6225 }
6226 
6227 /// Select an "extended register" operand. This operand folds in an extend
6228 /// followed by an optional left shift.
6229 InstructionSelector::ComplexRendererFns
selectArithExtendedRegister(MachineOperand & Root) const6230 AArch64InstructionSelector::selectArithExtendedRegister(
6231     MachineOperand &Root) const {
6232   if (!Root.isReg())
6233     return None;
6234   MachineRegisterInfo &MRI =
6235       Root.getParent()->getParent()->getParent()->getRegInfo();
6236 
6237   uint64_t ShiftVal = 0;
6238   Register ExtReg;
6239   AArch64_AM::ShiftExtendType Ext;
6240   MachineInstr *RootDef = getDefIgnoringCopies(Root.getReg(), MRI);
6241   if (!RootDef)
6242     return None;
6243 
6244   if (!isWorthFoldingIntoExtendedReg(*RootDef, MRI))
6245     return None;
6246 
6247   // Check if we can fold a shift and an extend.
6248   if (RootDef->getOpcode() == TargetOpcode::G_SHL) {
6249     // Look for a constant on the RHS of the shift.
6250     MachineOperand &RHS = RootDef->getOperand(2);
6251     Optional<uint64_t> MaybeShiftVal = getImmedFromMO(RHS);
6252     if (!MaybeShiftVal)
6253       return None;
6254     ShiftVal = *MaybeShiftVal;
6255     if (ShiftVal > 4)
6256       return None;
6257     // Look for a valid extend instruction on the LHS of the shift.
6258     MachineOperand &LHS = RootDef->getOperand(1);
6259     MachineInstr *ExtDef = getDefIgnoringCopies(LHS.getReg(), MRI);
6260     if (!ExtDef)
6261       return None;
6262     Ext = getExtendTypeForInst(*ExtDef, MRI);
6263     if (Ext == AArch64_AM::InvalidShiftExtend)
6264       return None;
6265     ExtReg = ExtDef->getOperand(1).getReg();
6266   } else {
6267     // Didn't get a shift. Try just folding an extend.
6268     Ext = getExtendTypeForInst(*RootDef, MRI);
6269     if (Ext == AArch64_AM::InvalidShiftExtend)
6270       return None;
6271     ExtReg = RootDef->getOperand(1).getReg();
6272 
6273     // If we have a 32 bit instruction which zeroes out the high half of a
6274     // register, we get an implicit zero extend for free. Check if we have one.
6275     // FIXME: We actually emit the extend right now even though we don't have
6276     // to.
6277     if (Ext == AArch64_AM::UXTW && MRI.getType(ExtReg).getSizeInBits() == 32) {
6278       MachineInstr *ExtInst = MRI.getVRegDef(ExtReg);
6279       if (ExtInst && isDef32(*ExtInst))
6280         return None;
6281     }
6282   }
6283 
6284   // We require a GPR32 here. Narrow the ExtReg if needed using a subregister
6285   // copy.
6286   MachineIRBuilder MIB(*RootDef);
6287   ExtReg = moveScalarRegClass(ExtReg, AArch64::GPR32RegClass, MIB);
6288 
6289   return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); },
6290            [=](MachineInstrBuilder &MIB) {
6291              MIB.addImm(getArithExtendImm(Ext, ShiftVal));
6292            }}};
6293 }
6294 
renderTruncImm(MachineInstrBuilder & MIB,const MachineInstr & MI,int OpIdx) const6295 void AArch64InstructionSelector::renderTruncImm(MachineInstrBuilder &MIB,
6296                                                 const MachineInstr &MI,
6297                                                 int OpIdx) const {
6298   const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
6299   assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
6300          "Expected G_CONSTANT");
6301   Optional<int64_t> CstVal =
6302       getIConstantVRegSExtVal(MI.getOperand(0).getReg(), MRI);
6303   assert(CstVal && "Expected constant value");
6304   MIB.addImm(CstVal.getValue());
6305 }
6306 
renderLogicalImm32(MachineInstrBuilder & MIB,const MachineInstr & I,int OpIdx) const6307 void AArch64InstructionSelector::renderLogicalImm32(
6308   MachineInstrBuilder &MIB, const MachineInstr &I, int OpIdx) const {
6309   assert(I.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
6310          "Expected G_CONSTANT");
6311   uint64_t CstVal = I.getOperand(1).getCImm()->getZExtValue();
6312   uint64_t Enc = AArch64_AM::encodeLogicalImmediate(CstVal, 32);
6313   MIB.addImm(Enc);
6314 }
6315 
renderLogicalImm64(MachineInstrBuilder & MIB,const MachineInstr & I,int OpIdx) const6316 void AArch64InstructionSelector::renderLogicalImm64(
6317   MachineInstrBuilder &MIB, const MachineInstr &I, int OpIdx) const {
6318   assert(I.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
6319          "Expected G_CONSTANT");
6320   uint64_t CstVal = I.getOperand(1).getCImm()->getZExtValue();
6321   uint64_t Enc = AArch64_AM::encodeLogicalImmediate(CstVal, 64);
6322   MIB.addImm(Enc);
6323 }
6324 
renderFPImm16(MachineInstrBuilder & MIB,const MachineInstr & MI,int OpIdx) const6325 void AArch64InstructionSelector::renderFPImm16(MachineInstrBuilder &MIB,
6326                                                const MachineInstr &MI,
6327                                                int OpIdx) const {
6328   assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
6329          "Expected G_FCONSTANT");
6330   MIB.addImm(
6331       AArch64_AM::getFP16Imm(MI.getOperand(1).getFPImm()->getValueAPF()));
6332 }
6333 
renderFPImm32(MachineInstrBuilder & MIB,const MachineInstr & MI,int OpIdx) const6334 void AArch64InstructionSelector::renderFPImm32(MachineInstrBuilder &MIB,
6335                                                const MachineInstr &MI,
6336                                                int OpIdx) const {
6337   assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
6338          "Expected G_FCONSTANT");
6339   MIB.addImm(
6340       AArch64_AM::getFP32Imm(MI.getOperand(1).getFPImm()->getValueAPF()));
6341 }
6342 
renderFPImm64(MachineInstrBuilder & MIB,const MachineInstr & MI,int OpIdx) const6343 void AArch64InstructionSelector::renderFPImm64(MachineInstrBuilder &MIB,
6344                                                const MachineInstr &MI,
6345                                                int OpIdx) const {
6346   assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT && OpIdx == -1 &&
6347          "Expected G_FCONSTANT");
6348   MIB.addImm(
6349       AArch64_AM::getFP64Imm(MI.getOperand(1).getFPImm()->getValueAPF()));
6350 }
6351 
isLoadStoreOfNumBytes(const MachineInstr & MI,unsigned NumBytes) const6352 bool AArch64InstructionSelector::isLoadStoreOfNumBytes(
6353     const MachineInstr &MI, unsigned NumBytes) const {
6354   if (!MI.mayLoadOrStore())
6355     return false;
6356   assert(MI.hasOneMemOperand() &&
6357          "Expected load/store to have only one mem op!");
6358   return (*MI.memoperands_begin())->getSize() == NumBytes;
6359 }
6360 
isDef32(const MachineInstr & MI) const6361 bool AArch64InstructionSelector::isDef32(const MachineInstr &MI) const {
6362   const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
6363   if (MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() != 32)
6364     return false;
6365 
6366   // Only return true if we know the operation will zero-out the high half of
6367   // the 64-bit register. Truncates can be subregister copies, which don't
6368   // zero out the high bits. Copies and other copy-like instructions can be
6369   // fed by truncates, or could be lowered as subregister copies.
6370   switch (MI.getOpcode()) {
6371   default:
6372     return true;
6373   case TargetOpcode::COPY:
6374   case TargetOpcode::G_BITCAST:
6375   case TargetOpcode::G_TRUNC:
6376   case TargetOpcode::G_PHI:
6377     return false;
6378   }
6379 }
6380 
6381 
6382 // Perform fixups on the given PHI instruction's operands to force them all
6383 // to be the same as the destination regbank.
fixupPHIOpBanks(MachineInstr & MI,MachineRegisterInfo & MRI,const AArch64RegisterBankInfo & RBI)6384 static void fixupPHIOpBanks(MachineInstr &MI, MachineRegisterInfo &MRI,
6385                             const AArch64RegisterBankInfo &RBI) {
6386   assert(MI.getOpcode() == TargetOpcode::G_PHI && "Expected a G_PHI");
6387   Register DstReg = MI.getOperand(0).getReg();
6388   const RegisterBank *DstRB = MRI.getRegBankOrNull(DstReg);
6389   assert(DstRB && "Expected PHI dst to have regbank assigned");
6390   MachineIRBuilder MIB(MI);
6391 
6392   // Go through each operand and ensure it has the same regbank.
6393   for (unsigned OpIdx = 1; OpIdx < MI.getNumOperands(); ++OpIdx) {
6394     MachineOperand &MO = MI.getOperand(OpIdx);
6395     if (!MO.isReg())
6396       continue;
6397     Register OpReg = MO.getReg();
6398     const RegisterBank *RB = MRI.getRegBankOrNull(OpReg);
6399     if (RB != DstRB) {
6400       // Insert a cross-bank copy.
6401       auto *OpDef = MRI.getVRegDef(OpReg);
6402       const LLT &Ty = MRI.getType(OpReg);
6403       MachineBasicBlock &OpDefBB = *OpDef->getParent();
6404 
6405       // Any instruction we insert must appear after all PHIs in the block
6406       // for the block to be valid MIR.
6407       MachineBasicBlock::iterator InsertPt = std::next(OpDef->getIterator());
6408       if (InsertPt != OpDefBB.end() && InsertPt->isPHI())
6409         InsertPt = OpDefBB.getFirstNonPHI();
6410       MIB.setInsertPt(*OpDef->getParent(), InsertPt);
6411       auto Copy = MIB.buildCopy(Ty, OpReg);
6412       MRI.setRegBank(Copy.getReg(0), *DstRB);
6413       MO.setReg(Copy.getReg(0));
6414     }
6415   }
6416 }
6417 
processPHIs(MachineFunction & MF)6418 void AArch64InstructionSelector::processPHIs(MachineFunction &MF) {
6419   // We're looking for PHIs, build a list so we don't invalidate iterators.
6420   MachineRegisterInfo &MRI = MF.getRegInfo();
6421   SmallVector<MachineInstr *, 32> Phis;
6422   for (auto &BB : MF) {
6423     for (auto &MI : BB) {
6424       if (MI.getOpcode() == TargetOpcode::G_PHI)
6425         Phis.emplace_back(&MI);
6426     }
6427   }
6428 
6429   for (auto *MI : Phis) {
6430     // We need to do some work here if the operand types are < 16 bit and they
6431     // are split across fpr/gpr banks. Since all types <32b on gpr
6432     // end up being assigned gpr32 regclasses, we can end up with PHIs here
6433     // which try to select between a gpr32 and an fpr16. Ideally RBS shouldn't
6434     // be selecting heterogenous regbanks for operands if possible, but we
6435     // still need to be able to deal with it here.
6436     //
6437     // To fix this, if we have a gpr-bank operand < 32b in size and at least
6438     // one other operand is on the fpr bank, then we add cross-bank copies
6439     // to homogenize the operand banks. For simplicity the bank that we choose
6440     // to settle on is whatever bank the def operand has. For example:
6441     //
6442     // %endbb:
6443     //   %dst:gpr(s16) = G_PHI %in1:gpr(s16), %bb1, %in2:fpr(s16), %bb2
6444     //  =>
6445     // %bb2:
6446     //   ...
6447     //   %in2_copy:gpr(s16) = COPY %in2:fpr(s16)
6448     //   ...
6449     // %endbb:
6450     //   %dst:gpr(s16) = G_PHI %in1:gpr(s16), %bb1, %in2_copy:gpr(s16), %bb2
6451     bool HasGPROp = false, HasFPROp = false;
6452     for (unsigned OpIdx = 1; OpIdx < MI->getNumOperands(); ++OpIdx) {
6453       const auto &MO = MI->getOperand(OpIdx);
6454       if (!MO.isReg())
6455         continue;
6456       const LLT &Ty = MRI.getType(MO.getReg());
6457       if (!Ty.isValid() || !Ty.isScalar())
6458         break;
6459       if (Ty.getSizeInBits() >= 32)
6460         break;
6461       const RegisterBank *RB = MRI.getRegBankOrNull(MO.getReg());
6462       // If for some reason we don't have a regbank yet. Don't try anything.
6463       if (!RB)
6464         break;
6465 
6466       if (RB->getID() == AArch64::GPRRegBankID)
6467         HasGPROp = true;
6468       else
6469         HasFPROp = true;
6470     }
6471     // We have heterogenous regbanks, need to fixup.
6472     if (HasGPROp && HasFPROp)
6473       fixupPHIOpBanks(*MI, MRI, RBI);
6474   }
6475 }
6476 
6477 namespace llvm {
6478 InstructionSelector *
createAArch64InstructionSelector(const AArch64TargetMachine & TM,AArch64Subtarget & Subtarget,AArch64RegisterBankInfo & RBI)6479 createAArch64InstructionSelector(const AArch64TargetMachine &TM,
6480                                  AArch64Subtarget &Subtarget,
6481                                  AArch64RegisterBankInfo &RBI) {
6482   return new AArch64InstructionSelector(TM, Subtarget, RBI);
6483 }
6484 }
6485